[Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing

Vladimir Sementsov-Ogievskiy posted 3 patches 7 years, 7 months ago
[Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
We need to synchronize backup job with reading from fleecing image
like it was done in block/replication.c.

Otherwise, the following situation is theoretically possible:

1. client start reading
2. client understand, that there is no corresponding cluster in
   fleecing image
3. client is going to read from backing file (i.e. active image)
4. guest writes to active image
5. this write is stopped by backup(sync=none) and cluster is copied to
   fleecing image
6. guest write continues...
7. and client reads _new_ (or partly new) data from active image

So, this fleecing-filter should be above fleecing image, the whole
picture of fleecing looks like this:

    +-------+           +------------+
    |       |           |            |
    | guest |           | NBD client +<------+
    |       |           |            |       |
    ++-----++           +------------+       |only read
     |     ^                                 |
     | IO  |                                 |
     v     |                           +-----+------+
    ++-----+---------+                 |            |
    |                |                 |  internal  |
    |  active image  +----+            | NBD server |
    |                |    |            |            |
    +-+--------------+    |backup      +-+----------+
      ^                   |sync=none     ^
      |backing            |              |only read
      |                   |              |
    +-+--------------+    |       +------+----------+
    |                |    |       |                 |
    | fleecing image +<---+       | fleecing filter |
    |                |            |                 |
    +--------+-------+            +-----+-----------+
             ^                          |
             |                          |
             +--------------------------+
                       file

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 qapi/block-core.json    |  6 ++--
 block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/Makefile.objs     |  1 +
 3 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 block/fleecing-filter.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 577ce5e999..43872c3d79 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2542,7 +2542,8 @@
             'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
             'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
             'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
-            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
+            'fleecing-filter' ] }
 
 ##
 # @BlockdevOptionsFile:
@@ -3594,7 +3595,8 @@
       'vmdk':       'BlockdevOptionsGenericCOWFormat',
       'vpc':        'BlockdevOptionsGenericFormat',
       'vvfat':      'BlockdevOptionsVVFAT',
-      'vxhs':       'BlockdevOptionsVxHS'
+      'vxhs':       'BlockdevOptionsVxHS',
+      'fleecing-filter': 'BlockdevOptionsGenericFormat'
   } }
 
 ##
diff --git a/block/fleecing-filter.c b/block/fleecing-filter.c
new file mode 100644
index 0000000000..b501887c10
--- /dev/null
+++ b/block/fleecing-filter.c
@@ -0,0 +1,80 @@
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "block/block_backup.h"
+
+static int64_t fleecing_getlength(BlockDriverState *bs)
+{
+    return bdrv_getlength(bs->file->bs);
+}
+
+static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
+                                           uint64_t offset, uint64_t bytes,
+                                           QEMUIOVector *qiov, int flags)
+{
+    int ret;
+    BlockJob *job = bs->file->bs->backing->bs->job;
+    CowRequest req;
+
+    backup_wait_for_overlapping_requests(job, offset, bytes);
+    backup_cow_request_begin(&req, job, offset, bytes);
+
+    ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+
+    backup_cow_request_end(&req);
+
+    return ret;
+}
+
+static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
+                                            uint64_t offset, uint64_t bytes,
+                                            QEMUIOVector *qiov, int flags)
+{
+    return -EINVAL;
+}
+
+static bool fleecing_recurse_is_first_non_filter(BlockDriverState *bs,
+                                                 BlockDriverState *candidate)
+{
+    return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
+}
+
+static int fleecing_open(BlockDriverState *bs, QDict *options,
+                         int flags, Error **errp)
+{
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
+                               errp);
+
+    return bs->file ? 0 : -EINVAL;
+}
+
+static void fleecing_close(BlockDriverState *bs)
+{
+    /* Do nothing. We have to provide .bdrv_close because bdrv_close()
+     * calls it unconditionally, without checking for NULL. */
+}
+
+BlockDriver bdrv_fleecing_filter = {
+    .format_name = "fleecing-filter",
+    .protocol_name = "fleecing-filter",
+    .instance_size = 0,
+
+    .bdrv_open = fleecing_open,
+    .bdrv_close = fleecing_close,
+
+    .bdrv_getlength = fleecing_getlength,
+    .bdrv_co_preadv = fleecing_co_preadv,
+    .bdrv_co_pwritev = fleecing_co_pwritev,
+
+    .is_filter = true,
+    .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
+    .bdrv_child_perm        = bdrv_filter_default_perms,
+};
+
+static void bdrv_fleecing_init(void)
+{
+    bdrv_register(&bdrv_fleecing_filter);
+}
+
+block_init(bdrv_fleecing_init);
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 899bfb5e2c..aa0a6dd971 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -27,6 +27,7 @@ block-obj-y += write-threshold.o
 block-obj-y += backup.o
 block-obj-$(CONFIG_REPLICATION) += replication.o
 block-obj-y += throttle.o copy-on-read.o
+block-obj-y += fleecing-filter.o
 
 block-obj-y += crypto.o
 
-- 
2.11.1


Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Eric Blake 7 years, 7 months ago
On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> We need to synchronize backup job with reading from fleecing image
> like it was done in block/replication.c.
> 
> Otherwise, the following situation is theoretically possible:
> 

Grammar suggestions:

> 1. client start reading

client starts reading

> 2. client understand, that there is no corresponding cluster in
>     fleecing image
> 3. client is going to read from backing file (i.e. active image)

client sees that no corresponding cluster has been allocated in the 
fleecing image, so the request is forwarded to the backing file

> 4. guest writes to active image
> 5. this write is stopped by backup(sync=none) and cluster is copied to
>     fleecing image
> 6. guest write continues...
> 7. and client reads _new_ (or partly new) date from active image

Interesting race. Can it actually happen, or does our read code already 
serialize writes to the same area while a read is underway?

In short, I see what problem you are claiming exists: the moment the 
client starts reading from the backing file, that portion of the backing 
file must remain unchanged until after the client is done reading.  But 
I don't know enough details of the block layer to know if this is 
actually a problem, or if adding the new filter is just overhead.

> 
> So, this fleecing-filter should be above fleecing image, the whole
> picture of fleecing looks like this:
> 
>      +-------+           +------------+
>      |       |           |            |
>      | guest |           | NBD client +<------+
>      |       |           |            |       |
>      ++-----++           +------------+       |only read
>       |     ^                                 |
>       | IO  |                                 |
>       v     |                           +-----+------+
>      ++-----+---------+                 |            |
>      |                |                 |  internal  |
>      |  active image  +----+            | NBD server |
>      |                |    |            |            |
>      +-+--------------+    |backup      +-+----------+
>        ^                   |sync=none     ^
>        |backing            |              |only read
>        |                   |              |
>      +-+--------------+    |       +------+----------+
>      |                |    |       |                 |
>      | fleecing image +<---+       | fleecing filter |
>      |                |            |                 |
>      +--------+-------+            +-----+-----------+
>               ^                          |
>               |                          |
>               +--------------------------+
>                         file

Can you also show the sequence of QMP commands to set up this structure 
(or maybe you do in 3/3; which I haven't looked at yet).
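
(For context, a plausible sequence, sketched here under assumed node
names "drive0" for the active image and "tmp" for the fleecing overlay
(not necessarily what 3/3 actually uses), might be:

    { "execute": "blockdev-add",
      "arguments": { "driver": "qcow2", "node-name": "tmp",
                     "file": { "driver": "file",
                               "filename": "fleecing.qcow2" },
                     "backing": "drive0" } }
    { "execute": "blockdev-backup",
      "arguments": { "device": "drive0", "target": "tmp",
                     "sync": "none" } }
    { "execute": "nbd-server-start",
      "arguments": { "addr": { "type": "unix",
                               "data": { "path": "/tmp/nbd.sock" } } } }
    { "execute": "nbd-server-add", "arguments": { "device": "tmp" } }

with this series presumably inserting the fleecing-filter node above
"tmp" and exporting that node instead.)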

> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>   qapi/block-core.json    |  6 ++--
>   block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
>   block/Makefile.objs     |  1 +
>   3 files changed, 85 insertions(+), 2 deletions(-)
>   create mode 100644 block/fleecing-filter.c
> 
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 577ce5e999..43872c3d79 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2542,7 +2542,8 @@
>               'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
>               'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
>               'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
> -            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
> +            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
> +            'fleecing-filter' ] }

Missing a 'since 3.0' documentation blurb; also, this enum has been kept 
sorted, so your new filter needs to come earlier.
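
(Concretely, the fixed hunk would presumably look something like this
sketch, with the value sorted between 'file' and 'ftp' and a doc line in
the enum's comment block, assuming a 3.0 release target:

    # @fleecing-filter: Since 3.0
    ...
    'data': [ ..., 'dmg', 'file', 'fleecing-filter', 'ftp', 'ftps', ... ]
)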

>   
>   ##
>   # @BlockdevOptionsFile:
> @@ -3594,7 +3595,8 @@
>         'vmdk':       'BlockdevOptionsGenericCOWFormat',
>         'vpc':        'BlockdevOptionsGenericFormat',
>         'vvfat':      'BlockdevOptionsVVFAT',
> -      'vxhs':       'BlockdevOptionsVxHS'
> +      'vxhs':       'BlockdevOptionsVxHS',
> +      'fleecing-filter': 'BlockdevOptionsGenericFormat'

Again, this has been kept sorted.

> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
> +                                           uint64_t offset, uint64_t bytes,
> +                                           QEMUIOVector *qiov, int flags)
> +{
> +    int ret;
> +    BlockJob *job = bs->file->bs->backing->bs->job;
> +    CowRequest req;
> +
> +    backup_wait_for_overlapping_requests(job, offset, bytes);
> +    backup_cow_request_begin(&req, job, offset, bytes);
> +
> +    ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
> +
> +    backup_cow_request_end(&req);
> +
> +    return ret;
> +}

So the idea here is that you force a serializing request to ensure that 
there are no other writes to the area in the meantime.
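
(For readers following the mechanism: backup tracks its in-flight
copy-on-write operations in a list of CowRequest structures and exposes
them through include/block/block_backup.h precisely so that
block/replication.c, and now this filter, can join that tracking.
Roughly, paraphrasing the then-current block/backup.c; details may
differ:

    typedef struct CowRequest {
        int64_t start_byte;
        int64_t end_byte;
        QLIST_ENTRY(CowRequest) list;
        CoQueue wait_queue; /* coroutines blocked on this request */
    } CowRequest;

    /* wait until no in-flight COW request overlaps [offset, bytes) */
    void backup_wait_for_overlapping_requests(BlockJob *job,
                                              int64_t offset, uint64_t bytes);
    /* register our own request so that COW waits for us in turn */
    void backup_cow_request_begin(CowRequest *req, BlockJob *job,
                                  int64_t offset, uint64_t bytes);
    void backup_cow_request_end(CowRequest *req);
)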

> +
> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
> +                                            uint64_t offset, uint64_t bytes,
> +                                            QEMUIOVector *qiov, int flags)
> +{
> +    return -EINVAL;

and you force this to be a read-only interface. (Does the block layer 
actually require us to provide a pwritev callback, or can we leave it 
NULL instead?)

> +BlockDriver bdrv_fleecing_filter = {
> +    .format_name = "fleecing-filter",
> +    .protocol_name = "fleecing-filter",
> +    .instance_size = 0,
> +
> +    .bdrv_open = fleecing_open,
> +    .bdrv_close = fleecing_close,
> +
> +    .bdrv_getlength = fleecing_getlength,
> +    .bdrv_co_preadv = fleecing_co_preadv,
> +    .bdrv_co_pwritev = fleecing_co_pwritev,
> +
> +    .is_filter = true,
> +    .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
> +    .bdrv_child_perm        = bdrv_filter_default_perms,

No .bdrv_co_block_status callback?  That probably hurts querying for 
sparse regions.
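
(A minimal pass-through would presumably be a one-liner in the driver
table, reusing the generic helper that other filter drivers use; a
sketch:

    /* forward block-status queries to bs->file */
    .bdrv_co_block_status = bdrv_co_block_status_from_file,

though, as noted in a reply below, a plain pass-through would not take
the COW request lock, so it might need the same synchronization as
reads.)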

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Fam Zheng 7 years, 7 months ago
On Fri, 06/29 12:24, Eric Blake wrote:
> On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > We need to synchronize backup job with reading from fleecing image
> > like it was done in block/replication.c.
> > 
> > Otherwise, the following situation is theoretically possible:
> > 
> 
> Grammar suggestions:
> 
> > 1. client start reading
> 
> client starts reading
> 
> > 2. client understand, that there is no corresponding cluster in
> >     fleecing image
> > 3. client is going to read from backing file (i.e. active image)
> 
> client sees that no corresponding cluster has been allocated in the fleecing
> image, so the request is forwarded to the backing file
> 
> > 4. guest writes to active image
> > 5. this write is stopped by backup(sync=none) and cluster is copied to
> >     fleecing image
> > 6. guest write continues...
> > 7. and client reads _new_ (or partly new) date from active image
> 
> Interesting race. Can it actually happen, or does our read code already
> serialize writes to the same area while a read is underway?

Yes, I wonder why wait_serialising_requests() is not enough. If it's possible,
can we have a test case (with the help of blkdebug, for example)?

> 
> In short, I see what problem you are claiming exists: the moment the client
> starts reading from the backing file, that portion of the backing file must
> remain unchanged until after the client is done reading.  But I don't know
> enough details of the block layer to know if this is actually a problem, or
> if adding the new filter is just overhead.

Fam

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
02.07.2018 09:35, Fam Zheng wrote:
> On Fri, 06/29 12:24, Eric Blake wrote:
>> On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> We need to synchronize backup job with reading from fleecing image
>>> like it was done in block/replication.c.
>>>
>>> Otherwise, the following situation is theoretically possible:
>>>
>> Grammar suggestions:
>>
>>> 1. client start reading
>> client starts reading
>>
>>> 2. client understand, that there is no corresponding cluster in
>>>      fleecing image
>>> 3. client is going to read from backing file (i.e. active image)
>> client sees that no corresponding cluster has been allocated in the fleecing
>> image, so the request is forwarded to the backing file
>>
>>> 4. guest writes to active image
>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>      fleecing image
>>> 6. guest write continues...
>>> 7. and client reads _new_ (or partly new) date from active image
>> Interesting race. Can it actually happen, or does our read code already
>> serialize writes to the same area while a read is underway?
> Yes, I wonder why wait_serialising_requests() is not enough. If it's possible,
> can we have a test case (with help of blkdebug, for example)?

Hmm, only unaligned and copy-on-read requests are marked serializing.
So if I understand correctly, nothing special prevents intersecting
requests; normally that is the guest's problem. But in the backup case,
it is our problem.
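
(Paraphrasing the relevant block/io.c logic of that era: every tracked
request waits for overlapping requests that have been *marked*
serialising, but only a few request types ever get marked. A sketch;
details may differ:

    /* in bdrv_aligned_preadv(): copy-on-read reads are serialising */
    if (bs->copy_on_read) {
        mark_request_serialising(&req, bdrv_get_cluster_size(bs));
    }

    /* in bdrv_co_pwritev(): only writes that needed head/tail padding
     * to satisfy the request alignment get marked; "padded" is a
     * placeholder for that condition */
    if (padded) {
        mark_request_serialising(&req, align);
    }

so two well-formed, aligned requests to the same range are free to
interleave.)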

>
>> In short, I see what problem you are claiming exists: the moment the client
>> starts reading from the backing file, that portion of the backing file must
>> remain unchanged until after the client is done reading.  But I don't know
>> enough details of the block layer to know if this is actually a problem, or
>> if adding the new filter is just overhead.
> Fam


-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
29.06.2018 20:24, Eric Blake wrote:
> On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>> We need to synchronize backup job with reading from fleecing image
>> like it was done in block/replication.c.
>>
>> Otherwise, the following situation is theoretically possible:
>>
>
> Grammar suggestions:
>
>> 1. client start reading
>
> client starts reading
>
>> 2. client understand, that there is no corresponding cluster in
>>     fleecing image
>> 3. client is going to read from backing file (i.e. active image)
>
> client sees that no corresponding cluster has been allocated in the 
> fleecing image, so the request is forwarded to the backing file
>
>> 4. guest writes to active image
>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>     fleecing image
>> 6. guest write continues...
>> 7. and client reads _new_ (or partly new) date from active image
>
> Interesting race. Can it actually happen, or does our read code 
> already serialize writes to the same area while a read is underway?
>
> In short, I see what problem you are claiming exists: the moment the 
> client starts reading from the backing file, that portion of the 
> backing file must remain unchanged until after the client is done 
> reading.  But I don't know enough details of the block layer to know 
> if this is actually a problem, or if adding the new filter is just 
> overhead.


Looking at the code, here is a more realistic example (but I still have
no reproducer):

1. The client starts reading, takes the qcow2 mutex in qcow2_co_preadv()
   and gets as far as loading an L2 table (assume a cache miss).
2. A guest write arrives => backup COW => qcow2 write => tries to take
   the qcow2 mutex => waits.
3. The L2 table is loaded; we see that the cluster is UNALLOCATED, go to
   "case QCOW2_CLUSTER_UNALLOCATED" and unlock the mutex before
   bdrv_co_preadv(bs->backing, ...).
4. Aha, the mutex is unlocked, so the backup COW continues, the guest
   write finally finishes and changes the cluster in the active disk.
5. Only now do we actually perform bdrv_co_preadv(bs->backing, ...) and
   read the _new, updated_ data.
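
(The mutex drop in question, paraphrased from qcow2_co_preadv() of that
era; names and details approximate:

    qemu_co_mutex_lock(&s->lock);
    ret = qcow2_get_cluster_offset(bs, offset, &cur_bytes, &cluster_offset);
    ...
    case QCOW2_CLUSTER_UNALLOCATED:
        if (bs->backing) {
            /* The lock is released across the backing read, so a
             * backup COW write into this image that was blocked on
             * s->lock can now run, which in turn lets the guest write
             * to the active (backing) image complete before we read
             * from it below. */
            qemu_co_mutex_unlock(&s->lock);
            ret = bdrv_co_preadv(bs->backing, offset, cur_bytes,
                                 &hd_qiov, 0);
            qemu_co_mutex_lock(&s->lock);
        }
)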



>
>>
>> So, this fleecing-filter should be above fleecing image, the whole
>> picture of fleecing looks like this:
>>
>>      +-------+           +------------+
>>      |       |           |            |
>>      | guest |           | NBD client +<------+
>>      |       |           |            |       |
>>      ++-----++           +------------+       |only read
>>       |     ^                                 |
>>       | IO  |                                 |
>>       v     |                           +-----+------+
>>      ++-----+---------+                 |            |
>>      |                |                 |  internal  |
>>      |  active image  +----+            | NBD server |
>>      |                |    |            |            |
>>      +-+--------------+    |backup      +-+----------+
>>        ^                   |sync=none     ^
>>        |backing            |              |only read
>>        |                   |              |
>>      +-+--------------+    |       +------+----------+
>>      |                |    |       |                 |
>>      | fleecing image +<---+       | fleecing filter |
>>      |                |            |                 |
>>      +--------+-------+            +-----+-----------+
>>               ^                          |
>>               |                          |
>>               +--------------------------+
>>                         file
>
> Can you also show the sequence of QMP commands to set up this 
> structure (or maybe you do in 3/3; which I haven't looked at yet).
>
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   qapi/block-core.json    |  6 ++--
>>   block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/Makefile.objs     |  1 +
>>   3 files changed, 85 insertions(+), 2 deletions(-)
>>   create mode 100644 block/fleecing-filter.c
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 577ce5e999..43872c3d79 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2542,7 +2542,8 @@
>>               'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
>>               'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
>>               'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
>> -            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
>> +            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
>> +            'fleecing-filter' ] }
>
> Missing a 'since 3.0' documentation blurb; also, this enum has been 
> kept sorted, so your new filter needs to come earlier.
>
>>     ##
>>   # @BlockdevOptionsFile:
>> @@ -3594,7 +3595,8 @@
>>         'vmdk':       'BlockdevOptionsGenericCOWFormat',
>>         'vpc':        'BlockdevOptionsGenericFormat',
>>         'vvfat':      'BlockdevOptionsVVFAT',
>> -      'vxhs':       'BlockdevOptionsVxHS'
>> +      'vxhs':       'BlockdevOptionsVxHS',
>> +      'fleecing-filter': 'BlockdevOptionsGenericFormat'
>
> Again, this has been kept sorted.
>
>> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
>> +                                           uint64_t offset, uint64_t bytes,
>> +                                           QEMUIOVector *qiov, int flags)
>> +{
>> +    int ret;
>> +    BlockJob *job = bs->file->bs->backing->bs->job;
>> +    CowRequest req;
>> +
>> +    backup_wait_for_overlapping_requests(job, offset, bytes);
>> +    backup_cow_request_begin(&req, job, offset, bytes);
>> +
>> +    ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
>> +
>> +    backup_cow_request_end(&req);
>> +
>> +    return ret;
>> +}
>
> So the idea here is that you force a serializing request to ensure 
> that there are no other writes to the area in the meantime.
>
>> +
>> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
>> +                                            uint64_t offset, uint64_t bytes,
>> +                                            QEMUIOVector *qiov, int flags)
>> +{
>> +    return -EINVAL;
>
> and you force this to be a read-only interface. (Does the block layer 
> actually require us to provide a pwritev callback, or can we leave it 
> NULL instead?)
>
>> +BlockDriver bdrv_fleecing_filter = {
>> +    .format_name = "fleecing-filter",
>> +    .protocol_name = "fleecing-filter",
>> +    .instance_size = 0,
>> +
>> +    .bdrv_open = fleecing_open,
>> +    .bdrv_close = fleecing_close,
>> +
>> +    .bdrv_getlength = fleecing_getlength,
>> +    .bdrv_co_preadv = fleecing_co_preadv,
>> +    .bdrv_co_pwritev = fleecing_co_pwritev,
>> +
>> +    .is_filter = true,
>> +    .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
>> +    .bdrv_child_perm        = bdrv_filter_default_perms,
>
> No .bdrv_co_block_status callback?  That probably hurts querying for 
> sparse regions.
>

Hm, worth adding. And it possibly needs synchronization with the backup
job too.

-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by John Snow 7 years, 7 months ago

On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> We need to synchronize backup job with reading from fleecing image
> like it was done in block/replication.c.
> 
> Otherwise, the following situation is theoretically possible:
> 
> 1. client start reading
> 2. client understand, that there is no corresponding cluster in
>    fleecing image

I don't think the client refocuses the read, but QEMU does. (the client
has no idea if it is reading from the fleecing node or the backing file.)

... but understood:

> 3. client is going to read from backing file (i.e. active image)
> 4. guest writes to active image

My question here is if QEMU will allow reads and writes to interleave in
general. If the client is reading from the backing file, the active
image, will QEMU allow a write to modify that data before we're done
getting that data?

> 5. this write is stopped by backup(sync=none) and cluster is copied to
>    fleecing image
> 6. guest write continues...
> 7. and client reads _new_ (or partly new) date from active image
> 

I can't answer this for myself one way or the other right now, but I
imagine you observed a failure in practice in your test setups, which
motivated this patch?

A test or some proof would help justification for this patch. It would
also help prove that it solves what it claims to!

> So, this fleecing-filter should be above fleecing image, the whole
> picture of fleecing looks like this:
> 
>     +-------+           +------------+
>     |       |           |            |
>     | guest |           | NBD client +<------+
>     |       |           |            |       |
>     ++-----++           +------------+       |only read
>      |     ^                                 |
>      | IO  |                                 |
>      v     |                           +-----+------+
>     ++-----+---------+                 |            |
>     |                |                 |  internal  |
>     |  active image  +----+            | NBD server |
>     |                |    |            |            |
>     +-+--------------+    |backup      +-+----------+
>       ^                   |sync=none     ^
>       |backing            |              |only read
>       |                   |              |
>     +-+--------------+    |       +------+----------+
>     |                |    |       |                 |
>     | fleecing image +<---+       | fleecing filter |
>     |                |            |                 |
>     +--------+-------+            +-----+-----------+
>              ^                          |
>              |                          |
>              +--------------------------+
>                        file
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  qapi/block-core.json    |  6 ++--
>  block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
>  block/Makefile.objs     |  1 +
>  3 files changed, 85 insertions(+), 2 deletions(-)
>  create mode 100644 block/fleecing-filter.c
> 
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 577ce5e999..43872c3d79 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2542,7 +2542,8 @@
>              'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
>              'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
>              'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
> -            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
> +            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
> +            'fleecing-filter' ] }
>  
>  ##
>  # @BlockdevOptionsFile:
> @@ -3594,7 +3595,8 @@
>        'vmdk':       'BlockdevOptionsGenericCOWFormat',
>        'vpc':        'BlockdevOptionsGenericFormat',
>        'vvfat':      'BlockdevOptionsVVFAT',
> -      'vxhs':       'BlockdevOptionsVxHS'
> +      'vxhs':       'BlockdevOptionsVxHS',
> +      'fleecing-filter': 'BlockdevOptionsGenericFormat'
>    } }
>  
>  ##
> diff --git a/block/fleecing-filter.c b/block/fleecing-filter.c
> new file mode 100644
> index 0000000000..b501887c10
> --- /dev/null
> +++ b/block/fleecing-filter.c
> @@ -0,0 +1,80 @@
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "block/blockjob.h"
> +#include "block/block_int.h"
> +#include "block/block_backup.h"
> +
> +static int64_t fleecing_getlength(BlockDriverState *bs)
> +{
> +    return bdrv_getlength(bs->file->bs);
> +}
> +
> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
> +                                           uint64_t offset, uint64_t bytes,
> +                                           QEMUIOVector *qiov, int flags)
> +{
> +    int ret;
> +    BlockJob *job = bs->file->bs->backing->bs->job;
> +    CowRequest req;
> +
> +    backup_wait_for_overlapping_requests(job, offset, bytes);
> +    backup_cow_request_begin(&req, job, offset, bytes);
> +
> +    ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
> +
> +    backup_cow_request_end(&req);
> +
> +    return ret;
> +}
> +
> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
> +                                            uint64_t offset, uint64_t bytes,
> +                                            QEMUIOVector *qiov, int flags)
> +{
> +    return -EINVAL;
> +}
> +
> +static bool fleecing_recurse_is_first_non_filter(BlockDriverState *bs,
> +                                                 BlockDriverState *candidate)
> +{
> +    return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
> +}
> +
> +static int fleecing_open(BlockDriverState *bs, QDict *options,
> +                         int flags, Error **errp)
> +{
> +    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
> +                               errp);
> +
> +    return bs->file ? 0 : -EINVAL;
> +}
> +
> +static void fleecing_close(BlockDriverState *bs)
> +{
> +    /* Do nothing, we have to add .bdrv_close, because bdrv_close() don't check
> +     * it, just call. */
> +}
> +
> +BlockDriver bdrv_fleecing_filter = {
> +    .format_name = "fleecing-filter",
> +    .protocol_name = "fleecing-filter",
> +    .instance_size = 0,
> +
> +    .bdrv_open = fleecing_open,
> +    .bdrv_close = fleecing_close,
> +
> +    .bdrv_getlength = fleecing_getlength,
> +    .bdrv_co_preadv = fleecing_co_preadv,
> +    .bdrv_co_pwritev = fleecing_co_pwritev,
> +
> +    .is_filter = true,
> +    .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
> +    .bdrv_child_perm        = bdrv_filter_default_perms,
> +};
> +
> +static void bdrv_fleecing_init(void)
> +{
> +    bdrv_register(&bdrv_fleecing_filter);
> +}
> +
> +block_init(bdrv_fleecing_init);
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 899bfb5e2c..aa0a6dd971 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -27,6 +27,7 @@ block-obj-y += write-threshold.o
>  block-obj-y += backup.o
>  block-obj-$(CONFIG_REPLICATION) += replication.o
>  block-obj-y += throttle.o copy-on-read.o
> +block-obj-y += fleecing-filter.o
>  
>  block-obj-y += crypto.o
>  
> 

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Eric Blake 7 years, 7 months ago
On 06/29/2018 12:30 PM, John Snow wrote:
> 
> 
> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>> We need to synchronize backup job with reading from fleecing image
>> like it was done in block/replication.c.
>>
>> Otherwise, the following situation is theoretically possible:
>>
>> 1. client start reading
>> 2. client understand, that there is no corresponding cluster in
>>     fleecing image
> 
> I don't think the client refocuses the read, but QEMU does. (the client
> has no idea if it is reading from the fleecing node or the backing file.)
> 
> ... but understood:
> 
>> 3. client is going to read from backing file (i.e. active image)
>> 4. guest writes to active image
> 
> My question here is if QEMU will allow reads and writes to interleave in
> general. If the client is reading from the backing file, the active
> image, will QEMU allow a write to modify that data before we're done
> getting that data?
> 
>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>     fleecing image
>> 6. guest write continues...
>> 7. and client reads _new_ (or partly new) date from active image
>>
> 
> I can't answer this for myself one way or the other right now, but I
> imagine you observed a failure in practice in your test setups, which
> motivated this patch?
> 
> A test or some proof would help justification for this patch. It would
> also help prove that it solves what it claims to!

In fact, do we really need a new filter, or do we just need to make the 
"sync":"none" blockdev-backup job take the appropriate synchronization 
locks?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
29.06.2018 20:40, Eric Blake wrote:
> On 06/29/2018 12:30 PM, John Snow wrote:
>>
>>
>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> We need to synchronize backup job with reading from fleecing image
>>> like it was done in block/replication.c.
>>>
>>> Otherwise, the following situation is theoretically possible:
>>>
>>> 1. client start reading
>>> 2. client understand, that there is no corresponding cluster in
>>>     fleecing image
>>
>> I don't think the client refocuses the read, but QEMU does. (the client
>> has no idea if it is reading from the fleecing node or the backing 
>> file.)
>>
>> ... but understood:
>>
>>> 3. client is going to read from backing file (i.e. active image)
>>> 4. guest writes to active image
>>
>> My question here is if QEMU will allow reads and writes to interleave in
>> general. If the client is reading from the backing file, the active
>> image, will QEMU allow a write to modify that data before we're done
>> getting that data?
>>
>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>     fleecing image
>>> 6. guest write continues...
>>> 7. and client reads _new_ (or partly new) date from active image
>>>
>>
>> I can't answer this for myself one way or the other right now, but I
>> imagine you observed a failure in practice in your test setups, which
>> motivated this patch?
>>
>> A test or some proof would help justification for this patch. It would
>> also help prove that it solves what it claims to!
>
> In fact, do we really need a new filter, or do we just need to make 
> the "sync":"none" blockdev-backup job take the appropriate 
> synchronization locks?
>

How? We'd need an additional mechanism like serializing requests, or a
way to reuse the existing one. Using backup's internal synchronization
looks simpler, and it is already used in block replication.
On the other hand, I think the best thing would be to refuse
backup(sync=none) entirely and implement the whole external-backup
functionality in a special filter driver, but for now I'd prefer to
start with already existing stuff.

-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Kevin Wolf 7 years, 7 months ago
Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 29.06.2018 20:40, Eric Blake wrote:
> > On 06/29/2018 12:30 PM, John Snow wrote:
> > > 
> > > 
> > > On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > > > We need to synchronize backup job with reading from fleecing image
> > > > like it was done in block/replication.c.
> > > > 
> > > > Otherwise, the following situation is theoretically possible:
> > > > 
> > > > 1. client start reading
> > > > 2. client understand, that there is no corresponding cluster in
> > > >     fleecing image
> > > 
> > > I don't think the client refocuses the read, but QEMU does. (the client
> > > has no idea if it is reading from the fleecing node or the backing
> > > file.)
> > > 
> > > ... but understood:
> > > 
> > > > 3. client is going to read from backing file (i.e. active image)
> > > > 4. guest writes to active image
> > > 
> > > My question here is if QEMU will allow reads and writes to interleave in
> > > general. If the client is reading from the backing file, the active
> > > image, will QEMU allow a write to modify that data before we're done
> > > getting that data?
> > > 
> > > > 5. this write is stopped by backup(sync=none) and cluster is copied to
> > > >     fleecing image
> > > > 6. guest write continues...
> > > > 7. and client reads _new_ (or partly new) date from active image
> > > > 
> > > 
> > > I can't answer this for myself one way or the other right now, but I
> > > imagine you observed a failure in practice in your test setups, which
> > > motivated this patch?
> > > 
> > > A test or some proof would help justification for this patch. It would
> > > also help prove that it solves what it claims to!
> > 
> > In fact, do we really need a new filter, or do we just need to make the
> > "sync":"none" blockdev-backup job take the appropriate synchronization
> > locks?
> > 
> 
> How? We'll need additional mechanism like serializing requests.. Or a way to
> reuse serializing requests. Using backup internal synchronization looks
> simpler, and it is already used in block replication.

But it is also just an ugly hack that fixes one special case and leaves
everything else broken. replication is usually not a good example for
anything. It always gives me bad surprises when I have to look at it.

We'll have to figure out where to fix this problem (or what it really
is, once you look more than just at fleecing), but I think requiring the
user to add a filter driver to work around missing serialisation in
other code, and corrupting their image if they forget to, is not a
reasonable solution.

I see at least two things wrong in this context:

* The permissions don't seem to match reality. The NBD server
  unconditionally shares PERM_WRITE, which is wrong in this case. The
  client wants to see a point-in-time snapshot that never changes. This
  should become an option so that it can be properly reflected in the
  permissions used.

* Once we have proper permissions, the fleecing setup breaks down
  because the guest needs PERM_WRITE on the backing file, but the
  fleecing overlay allows that only if the NBD client allows it (which
  it doesn't for fleecing).

  Now we can implement an exception right into backup that installs a
  backup filter driver between source and target if the source is the
  backing file of the target. The filter driver would be similar to the
  commit filter driver in that it simply promises !PERM_WRITE to its
  parents, but allows PERM_WRITE on the source because it has installed
  the before_write_notifier that guarantees this condition.

  All writes to the target that are made by the backup job in this setup
  (including before_write_notifier writes) need to be marked as
  serialising so that any concurrent reads are completed first.
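
(For comparison, the commit filter's permission callback is tiny; a
backup filter along these lines would presumably look similar. A sketch
modeled on bdrv_commit_top_child_perm() in block/commit.c; the
"backup_top" name is hypothetical:

    static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                      const BdrvChildRole *role,
                                      BlockReopenQueue *reopen_queue,
                                      uint64_t perm, uint64_t shared,
                                      uint64_t *nperm, uint64_t *nshared)
    {
        /* Request nothing on the child for ourselves and share
         * everything, so the filter never conflicts with other users;
         * the !PERM_WRITE promise to the parents is guaranteed by the
         * filter's write interception, not by child permissions. */
        *nperm = 0;
        *nshared = BLK_PERM_ALL;
    }
)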

And if we decide to add a target filter to backup, we should probably at
the same time use a filter driver for intercepting source writes instead
of using before_write_notifier.

Max, I think you intended to make both source and target children of the
same block job node (or at least for mirror). But wouldn't that create
loops in a setup like this? I think we may need two filters that are
only connected through the block job, but not with block graph edges.

Kevin

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
03.07.2018 14:15, Kevin Wolf wrote:
> Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 29.06.2018 20:40, Eric Blake wrote:
>>> On 06/29/2018 12:30 PM, John Snow wrote:
>>>>
>>>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> We need to synchronize backup job with reading from fleecing image
>>>>> like it was done in block/replication.c.
>>>>>
>>>>> Otherwise, the following situation is theoretically possible:
>>>>>
>>>>> 1. client start reading
>>>>> 2. client understand, that there is no corresponding cluster in
>>>>>      fleecing image
>>>> I don't think the client refocuses the read, but QEMU does. (the client
>>>> has no idea if it is reading from the fleecing node or the backing
>>>> file.)
>>>>
>>>> ... but understood:
>>>>
>>>>> 3. client is going to read from backing file (i.e. active image)
>>>>> 4. guest writes to active image
>>>> My question here is if QEMU will allow reads and writes to interleave in
>>>> general. If the client is reading from the backing file, the active
>>>> image, will QEMU allow a write to modify that data before we're done
>>>> getting that data?
>>>>
>>>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>>>      fleecing image
>>>>> 6. guest write continues...
>>>>> 7. and client reads _new_ (or partly new) date from active image
>>>>>
>>>> I can't answer this for myself one way or the other right now, but I
>>>> imagine you observed a failure in practice in your test setups, which
>>>> motivated this patch?
>>>>
>>>> A test or some proof would help justification for this patch. It would
>>>> also help prove that it solves what it claims to!
>>> In fact, do we really need a new filter, or do we just need to make the
>>> "sync":"none" blockdev-backup job take the appropriate synchronization
>>> locks?
>>>
>> How? We'll need additional mechanism like serializing requests.. Or a way to
>> reuse serializing requests. Using backup internal synchronization looks
>> simpler, and it is already used in block replication.
> But it also just an ugly hack

Agreed.

>   that fixes one special case and leaves
> everything else broken. replication is usually not a good example for
> anything. It always gives me bad surprises when I have to look at it.
>
> We'll have to figure out where to fix this problem (or what it really
> is, once you look more than just at fleecing), but I think requiring the
> user to add a filter driver to work around missing serialisation in
> other code, and corrupting their image if they forget to, is not a
> reasonable solution.
>
> I see at least two things wrong in this context:
>
> * The permissions don't seem to match reality. The NBD server
>    unconditionally shares PERM_WRITE, which is wrong in this case. The
>    client wants to see a point-in-time snapshot that never changes. This
>    should become an option so that it can be properly reflected in the
>    permissions used.
>
> * Once we have proper permissions, the fleecing setup breaks down
>    because the guest needs PERM_WRITE on the backing file, but the
>    fleecing overlay allows that only if the NBD client allows it (which
>    it doesn't for fleecing).
>
>    Now we can implement an exception right into backup that installs a
>    backup filter driver between source and target if the source is the
>    backing file of the target. The filter driver would be similar to the
>    commit filter driver in that it simply promises !PERM_WRITE to its
>    parents, but allows PERM_WRITE on the source because it has installed
>    the before_write_notifier that guarantees this condition.
>
>    All writes to the target that are made by the backup job in this setup
>    (including before_write_notifier writes) need to be marked as
>    serialising so that any concurrent reads are completed first.
>
> And if we decide to add a target filter to backup, we should probably at
> the same time use a filter driver for intercepting source writes instead
> of using before_write_notifier.

And this is the point where we can drop the backup job from the fleecing
scheme entirely, since backup(sync=none) is effectively exactly such a
special filter driver.

>
> Max, I think you intended to make both source and target children of the
> same block job node (or at least for mirror). But wouldn't that create
> loops in a setup like this? I think we may need two filters that are
> only connected through the block job, but not with block graph edges.
>
> Kevin

-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
03.07.2018 14:15, Kevin Wolf wrote:
> Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 29.06.2018 20:40, Eric Blake wrote:
>>> On 06/29/2018 12:30 PM, John Snow wrote:
>>>>
>>>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> We need to synchronize backup job with reading from fleecing image
>>>>> like it was done in block/replication.c.
>>>>>
>>>>> Otherwise, the following situation is theoretically possible:
>>>>>
>>>>> 1. client start reading
>>>>> 2. client understand, that there is no corresponding cluster in
>>>>>      fleecing image
>>>> I don't think the client refocuses the read, but QEMU does. (the client
>>>> has no idea if it is reading from the fleecing node or the backing
>>>> file.)
>>>>
>>>> ... but understood:
>>>>
>>>>> 3. client is going to read from backing file (i.e. active image)
>>>>> 4. guest writes to active image
>>>> My question here is if QEMU will allow reads and writes to interleave in
>>>> general. If the client is reading from the backing file, the active
>>>> image, will QEMU allow a write to modify that data before we're done
>>>> getting that data?
>>>>
>>>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>>>      fleecing image
>>>>> 6. guest write continues...
>>>>> 7. and client reads _new_ (or partly new) date from active image
>>>>>
>>>> I can't answer this for myself one way or the other right now, but I
>>>> imagine you observed a failure in practice in your test setups, which
>>>> motivated this patch?
>>>>
>>>> A test or some proof would help justification for this patch. It would
>>>> also help prove that it solves what it claims to!
>>> In fact, do we really need a new filter, or do we just need to make the
>>> "sync":"none" blockdev-backup job take the appropriate synchronization
>>> locks?
>>>
>> How? We'll need additional mechanism like serializing requests.. Or a way to
>> reuse serializing requests. Using backup internal synchronization looks
>> simpler, and it is already used in block replication.
> But it also just an ugly hack that fixes one special case and leaves
> everything else broken. replication is usually not a good example for
> anything. It always gives me bad surprises when I have to look at it.
>
> We'll have to figure out where to fix this problem (or what it really
> is, once you look more than just at fleecing), but I think requiring the
> user to add a filter driver to work around missing serialisation in
> other code, and corrupting their image if they forget to, is not a
> reasonable solution.
>
> I see at least two things wrong in this context:
>
> * The permissions don't seem to match reality. The NBD server
>    unconditionally shares PERM_WRITE, which is wrong in this case. The
>    client wants to see a point-in-time snapshot that never changes. This
>    should become an option so that it can be properly reflected in the
>    permissions used.
>
> * Once we have proper permissions, the fleecing setup breaks down
>    because the guest needs PERM_WRITE on the backing file, but the
>    fleecing overlay allows that only if the NBD client allows it (which
>    it doesn't for fleecing).
>
>    Now we can implement an exception right into backup that installs a
>    backup filter driver between source and target if the source is the
>    backing file of the target. The filter driver would be similar to the
>    commit filter driver in that it simply promises !PERM_WRITE to its
>    parents, but allows PERM_WRITE on the source because it has installed
>    the before_write_notifier that guarantees this condition.
>
>    All writes to the target that are made by the backup job in this setup
>    (including before_write_notifier writes) need to be marked as
>    serialising so that any concurrent reads are completed first.
>
> And if we decide to add a target filter to backup, we should probably at
> the same time use a filter driver for intercepting source writes instead
> of using before_write_notifier.

Hmm, is it possible to do all the stuff in one super filter driver,
which we insert into the tree like this:

top blk        fleecing qcow2
      +           +
      |           |backing
      v     <-----+
    super filter
      +
      |file
      v
    active image


And the super filter does the following:

1. copy-on-write: before forwarding a write to its file child, it does a
serializing write of the old data to the fleecing qcow2
2. fakes .bdrv_child_perm for the fleecing qcow2, like in block commit

and no block job is needed.

>
> Max, I think you intended to make both source and target children of the
> same block job node (or at least for mirror). But wouldn't that create
> loops in a setup like this? I think we may need two filters that are
> only connected through the block job, but not with block graph edges.
>
> Kevin


-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Kevin Wolf 7 years, 7 months ago
Am 03.07.2018 um 18:11 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 03.07.2018 14:15, Kevin Wolf wrote:
> > We'll have to figure out where to fix this problem (or what it really
> > is, once you look more than just at fleecing), but I think requiring the
> > user to add a filter driver to work around missing serialisation in
> > other code, and corrupting their image if they forget to, is not a
> > reasonable solution.
> > 
> > I see at least two things wrong in this context:
> > 
> > * The permissions don't seem to match reality. The NBD server
> >    unconditionally shares PERM_WRITE, which is wrong in this case. The
> >    client wants to see a point-in-time snapshot that never changes. This
> >    should become an option so that it can be properly reflected in the
> >    permissions used.
> > 
> > * Once we have proper permissions, the fleecing setup breaks down
> >    because the guest needs PERM_WRITE on the backing file, but the
> >    fleecing overlay allows that only if the NBD client allows it (which
> >    it doesn't for fleecing).
> > 
> >    Now we can implement an exception right into backup that installs a
> >    backup filter driver between source and target if the source is the
> >    backing file of the target. The filter driver would be similar to the
> >    commit filter driver in that it simply promises !PERM_WRITE to its
> >    parents, but allows PERM_WRITE on the source because it has installed
> >    the before_write_notifier that guarantees this condition.
> > 
> >    All writes to the target that are made by the backup job in this setup
> >    (including before_write_notifier writes) need to be marked as
> >    serialising so that any concurrent reads are completed first.
> > 
> > And if we decide to add a target filter to backup, we should probably at
> > the same time use a filter driver for intercepting source writes instead
> > of using before_write_notifier.
> 
> Hmm, is it possible to do all the staff in one super filter driver, which we
> insert into the tree like this:
> 
> top blk        fleecing qcow2
>      +           +
>      |           |backing
>      v     <-----+
>    super filter
>      +
>      |file
>      v
>    active image
> 
> 
> And super filter do the following:
> 
> 1. copy-on-write, before forwarding write to file, it do serializing write
> to fleecing qcow2

This is where it breaks down. The filter driver in your graph doesn't
know fleecing.qcow2, so it can't write to it. Attaching fleecing.qcow2
as an additional child to the super filter doesn't work either because
you would create a loop then.

I think we need two separate nodes (and probably it's better to have
them managed by a block job so that both together can be checked to
result in a consistent setup).

> 2. fake .bdrv_child_perm for fleecing qcow2, like in block commit
> 
> and no block job is needed.

Kevin

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Max Reitz 7 years, 7 months ago
On 2018-07-03 13:15, Kevin Wolf wrote:
> Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 29.06.2018 20:40, Eric Blake wrote:
>>> On 06/29/2018 12:30 PM, John Snow wrote:
>>>>
>>>>
>>>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> We need to synchronize backup job with reading from fleecing image
>>>>> like it was done in block/replication.c.
>>>>>
>>>>> Otherwise, the following situation is theoretically possible:
>>>>>
>>>>> 1. client start reading
>>>>> 2. client understand, that there is no corresponding cluster in
>>>>>     fleecing image
>>>>
>>>> I don't think the client refocuses the read, but QEMU does. (the client
>>>> has no idea if it is reading from the fleecing node or the backing
>>>> file.)
>>>>
>>>> ... but understood:
>>>>
>>>>> 3. client is going to read from backing file (i.e. active image)
>>>>> 4. guest writes to active image
>>>>
>>>> My question here is if QEMU will allow reads and writes to interleave in
>>>> general. If the client is reading from the backing file, the active
>>>> image, will QEMU allow a write to modify that data before we're done
>>>> getting that data?
>>>>
>>>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>>>     fleecing image
>>>>> 6. guest write continues...
>>>>> 7. and client reads _new_ (or partly new) date from active image
>>>>>
>>>>
>>>> I can't answer this for myself one way or the other right now, but I
>>>> imagine you observed a failure in practice in your test setups, which
>>>> motivated this patch?
>>>>
>>>> A test or some proof would help justification for this patch. It would
>>>> also help prove that it solves what it claims to!
>>>
>>> In fact, do we really need a new filter, or do we just need to make the
>>> "sync":"none" blockdev-backup job take the appropriate synchronization
>>> locks?
>>>
>>
>> How? We'll need additional mechanism like serializing requests.. Or a way to
>> reuse serializing requests. Using backup internal synchronization looks
>> simpler, and it is already used in block replication.
> 
> But it also just an ugly hack that fixes one special case and leaves
> everything else broken. replication is usually not a good example for
> anything. It always gives me bad surprises when I have to look at it.
> 
> We'll have to figure out where to fix this problem (or what it really
> is, once you look more than just at fleecing), but I think requiring the
> user to add a filter driver to work around missing serialisation in
> other code, and corrupting their image if they forget to, is not a
> reasonable solution.
> 
> I see at least two things wrong in this context:
> 
> * The permissions don't seem to match reality. The NBD server
>   unconditionally shares PERM_WRITE, which is wrong in this case. The
>   client wants to see a point-in-time snapshot that never changes. This
>   should become an option so that it can be properly reflected in the
>   permissions used.
> 
> * Once we have proper permissions, the fleecing setup breaks down
>   because the guest needs PERM_WRITE on the backing file, but the
>   fleecing overlay allows that only if the NBD client allows it (which
>   it doesn't for fleecing).
> 
>   Now we can implement an exception right into backup that installs a
>   backup filter driver between source and target if the source is the
>   backing file of the target. The filter driver would be similar to the
>   commit filter driver in that it simply promises !PERM_WRITE to its
>   parents, but allows PERM_WRITE on the source because it has installed
>   the before_write_notifier that guarantees this condition.

I don't quite understand what you mean by "between source and target",
because *the* backup filter driver would need to be on top of both.
Unless you continue to use before-write notifiers, but I don't see why
you would, when a driver that sits on top of the source intercepts all
requests anyway.

And if we want to unify mirror and backup at any time, I think we have
to get away from before-write anyway.

>   All writes to the target that are made by the backup job in this setup
>   (including before_write_notifier writes) need to be marked as
>   serialising so that any concurrent reads are completed first.
> 
> And if we decide to add a target filter to backup, we should probably at
> the same time use a filter driver for intercepting source writes instead
> of using before_write_notifier.
> 
> Max, I think you intended to make both source and target children of the
> same block job node (or at least for mirror). But wouldn't that create
> loops in a setup like this? I think we may need two filters that are
> only connected through the block job, but not with block graph edges.

A node can only provide one data view to all of its parents.  It cannot
distinguish based on the parent.  Therefore, the main backup driver on
top of source and target would necessarily present the current
(changing) disk state, just like the mirror filter does.

Fleecing wants another view, though.  It wants an unchanging view.  It
therefore seems clear to me that we'd need two separate nodes: one
between guest and source (which performs the backup by intercepting
write requests), and one between the fleecing user (e.g. NBD) and the
target.  The former presents the current state, the latter presents the
"unchanging" point-in-time snapshot.

Sure, we can get away without the former (we do so today), but I don't
see why we would want to.  blockdev-copy (as an extension of
blockdev-mirror) would have such a node anyway, and I can see no reason
why we wouldn't attach both source and target to it.
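
To sketch what I mean (topology only; node names made up):

    guest                           NBD client (fleecing user)
      |                               |
      v                               v
    +-----------------+       +------------------+
    |   backup-top    |       |  fleecing view   |
    | (current state) |       |  (point-in-time  |
    +---+---------+---+       |    snapshot)     |
        |         |           +--------+---------+
    file|         |target              |file
        v         |                    v
    active image  +----------->  fleecing image
        ^                              |
        +-----------backing-----------+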

Max

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Vladimir Sementsov-Ogievskiy 7 years, 7 months ago
29.06.2018 20:30, John Snow wrote:
>
> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>> We need to synchronize the backup job with reading from the fleecing
>> image, as was done in block/replication.c.
>>
>> Otherwise, the following situation is theoretically possible:
>>
>> 1. client starts reading
>> 2. client understands that there is no corresponding cluster in the
>>     fleecing image
> I don't think the client refocuses the read, but QEMU does. (The client
> has no idea whether it is reading from the fleecing node or the backing file.)
>
> ... but understood:
>
>> 3. client is going to read from backing file (i.e. active image)
>> 4. guest writes to active image
> My question here is whether QEMU will allow reads and writes to interleave
> in general. If the client is reading from the backing file (the active
> image), will QEMU allow a write to modify that data before we're done
> getting that data?

If I understand correctly: yes. Physical drives allow intersecting 
operations too, and there is no guarantee on the result. We do have 
serializing requests, but in general only unaligned requests are marked 
serializing.
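
What I mean is this path in block/io.c's bdrv_co_pwritev() (condensed,
if I read the code right): only the unaligned case becomes serialising,
because of the read-modify-write of the head/tail:

    if (offset & (align - 1)) {
        /* Unaligned head: the read-modify-write must not race with
         * overlapping requests, so serialize it first. */
        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);
        ...
    }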

>
>> 5. this write is stopped by backup(sync=none) and the cluster is copied
>>     to the fleecing image
>> 6. guest write continues...
>> 7. and client reads _new_ (or partly new) data from active image
>>
> I can't answer this for myself one way or the other right now, but I
> imagine you observed a failure in practice in your test setups, which
> motivated this patch?

No. I just found this code in QEMU: block/replication already uses the 
same scheme with backup(sync=none), and it uses the same synchronization 
as in this patch. I've discussed it on the list a bit:
http://lists.nongnu.org/archive/html/qemu-devel/2017-08/msg01807.html
https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg03025.html
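
The synchronization in question is the CowRequest tracking in
block/backup.c. Condensed (modulo details), it is essentially:

    typedef struct CowRequest {
        int64_t start_byte;
        int64_t end_byte;
        QLIST_ENTRY(CowRequest) list;
        CoQueue wait_queue; /* coroutines blocked on this request */
    } CowRequest;

    /* Block until no in-flight copy-on-write request overlaps
     * [start, end). */
    static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                           int64_t start,
                                                           int64_t end)
    {
        CowRequest *req;
        bool retry;

        do {
            retry = false;
            QLIST_FOREACH(req, &job->inflight_reqs, list) {
                if (end > req->start_byte && start < req->end_byte) {
                    qemu_co_queue_wait(&req->wait_queue, NULL);
                    retry = true;
                    break;
                }
            }
        } while (retry);
    }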

>
> A test or some proof would help justify this patch. It would
> also help prove that it solves what it claims to!

Agreed, a reproducer is better than any theoretical reflection. I'm 
thinking about it, but I don't see a simple solution...

>
>> So, this fleecing-filter should sit above the fleecing image; the whole
>> picture of fleecing looks like this:
>>
>>      +-------+           +------------+
>>      |       |           |            |
>>      | guest |           | NBD client +<------+
>>      |       |           |            |       |
>>      ++-----++           +------------+       |only read
>>       |     ^                                 |
>>       | IO  |                                 |
>>       v     |                           +-----+------+
>>      ++-----+---------+                 |            |
>>      |                |                 |  internal  |
>>      |  active image  +----+            | NBD server |
>>      |                |    |            |            |
>>      +-+--------------+    |backup      +-+----------+
>>        ^                   |sync=none     ^
>>        |backing            |              |only read
>>        |                   |              |
>>      +-+--------------+    |       +------+----------+
>>      |                |    |       |                 |
>>      | fleecing image +<---+       | fleecing filter |
>>      |                |            |                 |
>>      +--------+-------+            +-----+-----------+
>>               ^                          |
>>               |                          |
>>               +--------------------------+
>>                         file
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   qapi/block-core.json    |  6 ++--
>>   block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/Makefile.objs     |  1 +
>>   3 files changed, 85 insertions(+), 2 deletions(-)
>>   create mode 100644 block/fleecing-filter.c
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 577ce5e999..43872c3d79 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2542,7 +2542,8 @@
>>               'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
>>               'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
>>               'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
>> -            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
>> +            'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
>> +            'fleecing-filter' ] }
>>   
>>   ##
>>   # @BlockdevOptionsFile:
>> @@ -3594,7 +3595,8 @@
>>         'vmdk':       'BlockdevOptionsGenericCOWFormat',
>>         'vpc':        'BlockdevOptionsGenericFormat',
>>         'vvfat':      'BlockdevOptionsVVFAT',
>> -      'vxhs':       'BlockdevOptionsVxHS'
>> +      'vxhs':       'BlockdevOptionsVxHS',
>> +      'fleecing-filter': 'BlockdevOptionsGenericFormat'
>>     } }
>>   
>>   ##
>> diff --git a/block/fleecing-filter.c b/block/fleecing-filter.c
>> new file mode 100644
>> index 0000000000..b501887c10
>> --- /dev/null
>> +++ b/block/fleecing-filter.c
>> @@ -0,0 +1,80 @@
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include "block/blockjob.h"
>> +#include "block/block_int.h"
>> +#include "block/block_backup.h"
>> +
>> +static int64_t fleecing_getlength(BlockDriverState *bs)
>> +{
>> +    return bdrv_getlength(bs->file->bs);
>> +}
>> +
>> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
>> +                                           uint64_t offset, uint64_t bytes,
>> +                                           QEMUIOVector *qiov, int flags)
>> +{
>> +    int ret;
>> +    BlockJob *job = bs->file->bs->backing->bs->job;
>> +    CowRequest req;
>> +
>> +    backup_wait_for_overlapping_requests(job, offset, bytes);
>> +    backup_cow_request_begin(&req, job, offset, bytes);
>> +
>> +    ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
>> +
>> +    backup_cow_request_end(&req);
>> +
>> +    return ret;
>> +}
>> +
>> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
>> +                                            uint64_t offset, uint64_t bytes,
>> +                                            QEMUIOVector *qiov, int flags)
>> +{
>> +    return -EINVAL;
>> +}
>> +
>> +static bool fleecing_recurse_is_first_non_filter(BlockDriverState *bs,
>> +                                                 BlockDriverState *candidate)
>> +{
>> +    return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
>> +}
>> +
>> +static int fleecing_open(BlockDriverState *bs, QDict *options,
>> +                         int flags, Error **errp)
>> +{
>> +    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
>> +                               errp);
>> +
>> +    return bs->file ? 0 : -EINVAL;
>> +}
>> +
>> +static void fleecing_close(BlockDriverState *bs)
>> +{
>> +    /* Do nothing; we have to provide .bdrv_close because bdrv_close()
>> +     * calls it unconditionally without checking for NULL. */
>> +}
>> +
>> +BlockDriver bdrv_fleecing_filter = {
>> +    .format_name = "fleecing-filter",
>> +    .protocol_name = "fleecing-filter",
>> +    .instance_size = 0,
>> +
>> +    .bdrv_open = fleecing_open,
>> +    .bdrv_close = fleecing_close,
>> +
>> +    .bdrv_getlength = fleecing_getlength,
>> +    .bdrv_co_preadv = fleecing_co_preadv,
>> +    .bdrv_co_pwritev = fleecing_co_pwritev,
>> +
>> +    .is_filter = true,
>> +    .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
>> +    .bdrv_child_perm        = bdrv_filter_default_perms,
>> +};
>> +
>> +static void bdrv_fleecing_init(void)
>> +{
>> +    bdrv_register(&bdrv_fleecing_filter);
>> +}
>> +
>> +block_init(bdrv_fleecing_init);
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index 899bfb5e2c..aa0a6dd971 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -27,6 +27,7 @@ block-obj-y += write-threshold.o
>>   block-obj-y += backup.o
>>   block-obj-$(CONFIG_REPLICATION) += replication.o
>>   block-obj-y += throttle.o copy-on-read.o
>> +block-obj-y += fleecing-filter.o
>>   
>>   block-obj-y += crypto.o
>>   
>>


-- 
Best regards,
Vladimir

Re: [Qemu-devel] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing
Posted by Kevin Wolf 7 years, 7 months ago
On 02.07.2018 13:57, Vladimir Sementsov-Ogievskiy wrote:
> 29.06.2018 20:30, John Snow wrote:
> > 
> > On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > > We need to synchronize the backup job with reading from the fleecing
> > > image, as was done in block/replication.c.
> > > 
> > > Otherwise, the following situation is theoretically possible:
> > > 
> > > 1. client starts reading
> > > 2. client understands that there is no corresponding cluster in the
> > >     fleecing image
> > I don't think the client refocuses the read, but QEMU does. (The client
> > has no idea whether it is reading from the fleecing node or the backing file.)
> > 
> > ... but understood:
> > 
> > > 3. client is going to read from backing file (i.e. active image)
> > > 4. guest writes to active image
> > My question here is whether QEMU will allow reads and writes to interleave
> > in general. If the client is reading from the backing file (the active
> > image), will QEMU allow a write to modify that data before we're done
> > getting that data?
> 
> If I understand correctly: yes. Physical drives allow intersecting
> operations too, and there is no guarantee on the result. We do have
> serializing requests, but in general only unaligned requests are marked
> serializing.

Copy on read (in the context of image streaming) is the feature that
actually introduced request serialising. It has some similarities with
the backup job, so the idea of using it for backup, too, isn't exactly
revolutionary.
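
The relevant bit is in bdrv_aligned_preadv() in block/io.c; roughly
(condensed, modulo details):

    /* A copy-on-read request is marked serialising at cluster
     * granularity; overlapping requests wait for it before they
     * proceed. */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }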

Kevin