We need to synchronize backup job with reading from fleecing image
like it was done in block/replication.c.
Otherwise, the following situation is theoretically possible:
1. client start reading
2. client understand, that there is no corresponding cluster in
fleecing image
3. client is going to read from backing file (i.e. active image)
4. guest writes to active image
5. this write is stopped by backup(sync=none) and cluster is copied to
fleecing image
6. guest write continues...
7. and client reads _new_ (or partly new) date from active image
So, this fleecing-filter should be above fleecing image, the whole
picture of fleecing looks like this:
+-------+                   +------------+
|       |                   |            |
| guest |                   | NBD client +<------+
|       |                   |            |       |
++-----++                   +------------+       |only read
 |     ^                                         |
 | IO  |                                         |
 v     |                                   +-----+------+
++-----+---------+                         |            |
|                |                         |  internal  |
|  active image  +----+                    | NBD server |
|                |    |                    |            |
+-+--------------+    |backup              +-+----------+
  ^                   |sync=none             ^
  |backing            |                      |only read
  |                   |                      |
+-+--------------+    |             +------+----------+
|                |    |             |                 |
| fleecing image +<---+             | fleecing filter |
|                |                  |                 |
+--------+-------+                  +-----+-----------+
         ^                               |
         |                               |
         +-------------------------------+
                       file
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
qapi/block-core.json | 6 ++--
block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
block/Makefile.objs | 1 +
3 files changed, 85 insertions(+), 2 deletions(-)
create mode 100644 block/fleecing-filter.c
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 577ce5e999..43872c3d79 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2542,7 +2542,8 @@
'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
- 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+ 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
+ 'fleecing-filter' ] }
##
# @BlockdevOptionsFile:
@@ -3594,7 +3595,8 @@
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT',
- 'vxhs': 'BlockdevOptionsVxHS'
+ 'vxhs': 'BlockdevOptionsVxHS',
+ 'fleecing-filter': 'BlockdevOptionsGenericFormat'
} }
##
diff --git a/block/fleecing-filter.c b/block/fleecing-filter.c
new file mode 100644
index 0000000000..b501887c10
--- /dev/null
+++ b/block/fleecing-filter.c
@@ -0,0 +1,80 @@
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "block/block_backup.h"
+
+static int64_t fleecing_getlength(BlockDriverState *bs)
+{
+ return bdrv_getlength(bs->file->bs);
+}
+
+static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
+ uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov, int flags)
+{
+ int ret;
+ BlockJob *job = bs->file->bs->backing->bs->job;
+ CowRequest req;
+
+ backup_wait_for_overlapping_requests(job, offset, bytes);
+ backup_cow_request_begin(&req, job, offset, bytes);
+
+ ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+
+ backup_cow_request_end(&req);
+
+ return ret;
+}
+
+static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
+ uint64_t offset, uint64_t bytes,
+ QEMUIOVector *qiov, int flags)
+{
+ return -EINVAL;
+}
+
+static bool fleecing_recurse_is_first_non_filter(BlockDriverState *bs,
+ BlockDriverState *candidate)
+{
+ return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
+}
+
+static int fleecing_open(BlockDriverState *bs, QDict *options,
+ int flags, Error **errp)
+{
+ bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
+ errp);
+
+ return bs->file ? 0 : -EINVAL;
+}
+
+static void fleecing_close(BlockDriverState *bs)
+{
+ /* Do nothing, we have to add .bdrv_close, because bdrv_close() don't check
+ * it, just call. */
+}
+
+BlockDriver bdrv_fleecing_filter = {
+ .format_name = "fleecing-filter",
+ .protocol_name = "fleecing-filter",
+ .instance_size = 0,
+
+ .bdrv_open = fleecing_open,
+ .bdrv_close = fleecing_close,
+
+ .bdrv_getlength = fleecing_getlength,
+ .bdrv_co_preadv = fleecing_co_preadv,
+ .bdrv_co_pwritev = fleecing_co_pwritev,
+
+ .is_filter = true,
+ .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
+ .bdrv_child_perm = bdrv_filter_default_perms,
+};
+
+static void bdrv_fleecing_init(void)
+{
+ bdrv_register(&bdrv_fleecing_filter);
+}
+
+block_init(bdrv_fleecing_init);
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 899bfb5e2c..aa0a6dd971 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -27,6 +27,7 @@ block-obj-y += write-threshold.o
block-obj-y += backup.o
block-obj-$(CONFIG_REPLICATION) += replication.o
block-obj-y += throttle.o copy-on-read.o
+block-obj-y += fleecing-filter.o
block-obj-y += crypto.o
--
2.11.1
On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> We need to synchronize backup job with reading from fleecing image
> like it was done in block/replication.c.
>
> Otherwise, the following situation is theoretically possible:
>
Grammar suggestions:
> 1. client start reading
client starts reading
> 2. client understand, that there is no corresponding cluster in
> fleecing image
> 3. client is going to read from backing file (i.e. active image)
client sees that no corresponding cluster has been allocated in the
fleecing image, so the request is forwarded to the backing file
> 4. guest writes to active image
> 5. this write is stopped by backup(sync=none) and cluster is copied to
> fleecing image
> 6. guest write continues...
> 7. and client reads _new_ (or partly new) date from active image
Interesting race. Can it actually happen, or does our read code already
serialize writes to the same area while a read is underway?
In short, I see what problem you are claiming exists: the moment the
client starts reading from the backing file, that portion of the backing
file must remain unchanged until after the client is done reading. But
I don't know enough details of the block layer to know if this is
actually a problem, or if adding the new filter is just overhead.
>
> So, this fleecing-filter should be above fleecing image, the whole
> picture of fleecing looks like this:
>
> +-------+ +------------+
> | | | |
> | guest | | NBD client +<------+
> | | | | |
> ++-----++ +------------+ |only read
> | ^ |
> | IO | |
> v | +-----+------+
> ++-----+---------+ | |
> | | | internal |
> | active image +----+ | NBD server |
> | | | | |
> +-+--------------+ |backup +-+----------+
> ^ |sync=none ^
> |backing | |only read
> | | |
> +-+--------------+ | +------+----------+
> | | | | |
> | fleecing image +<---+ | fleecing filter |
> | | | |
> +--------+-------+ +-----+-----------+
> ^ |
> | |
> +--------------------------+
> file
Can you also show the sequence of QMP commands to set up this structure
(or maybe you do in 3/3; which I haven't looked at yet).
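For readers unfamiliar with the fleecing setup, one possible QMP sequence to
build the graph above is roughly the following (node names are hypothetical,
the fleecing qcow2 file is assumed to already exist on disk, and patch 3/3
may do it differently):

{ "execute": "blockdev-add",
  "arguments": { "driver": "qcow2", "node-name": "fleece",
                 "file": { "driver": "file", "filename": "fleece.qcow2" },
                 "backing": "drive0" } }
{ "execute": "blockdev-backup",
  "arguments": { "device": "drive0", "target": "fleece", "sync": "none" } }
{ "execute": "nbd-server-start",
  "arguments": { "addr": { "type": "unix",
                           "data": { "path": "/tmp/fleece.sock" } } } }
{ "execute": "nbd-server-add",
  "arguments": { "device": "fleece" } }

With this patch applied, the NBD export would presumably be added on top of a
fleecing-filter node whose file child is "fleece", instead of exporting
"fleece" directly.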
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> qapi/block-core.json | 6 ++--
> block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
> block/Makefile.objs | 1 +
> 3 files changed, 85 insertions(+), 2 deletions(-)
> create mode 100644 block/fleecing-filter.c
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 577ce5e999..43872c3d79 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2542,7 +2542,8 @@
> 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
> 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
> 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
> - 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
> + 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
> + 'fleecing-filter' ] }
Missing a 'since 3.0' documentation blurb; also, this enum has been kept
sorted, so your new filter needs to come earlier.
>
> ##
> # @BlockdevOptionsFile:
> @@ -3594,7 +3595,8 @@
> 'vmdk': 'BlockdevOptionsGenericCOWFormat',
> 'vpc': 'BlockdevOptionsGenericFormat',
> 'vvfat': 'BlockdevOptionsVVFAT',
> - 'vxhs': 'BlockdevOptionsVxHS'
> + 'vxhs': 'BlockdevOptionsVxHS',
> + 'fleecing-filter': 'BlockdevOptionsGenericFormat'
Again, this has been kept sorted.
> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
> + uint64_t offset, uint64_t bytes,
> + QEMUIOVector *qiov, int flags)
> +{
> + int ret;
> + BlockJob *job = bs->file->bs->backing->bs->job;
> + CowRequest req;
> +
> + backup_wait_for_overlapping_requests(job, offset, bytes);
> + backup_cow_request_begin(&req, job, offset, bytes);
> +
> + ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
> +
> + backup_cow_request_end(&req);
> +
> + return ret;
> +}
So the idea here is that you force a serializing request to ensure that
there are no other writes to the area in the meantime.
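For context, the CowRequest machinery these helpers expose lives in
block/backup.c. Roughly (a simplified sketch, not the verbatim code), it is an
in-flight request list plus a coroutine queue:

/* Simplified sketch of the request tracking in block/backup.c; the real
 * code differs in details. */
static void coroutine_fn
wait_for_overlapping_requests(BackupBlockJob *job, int64_t start, int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start_byte && start < req->end_byte) {
                /* Wait until the conflicting in-flight request completes */
                qemu_co_queue_wait(&req->wait_queue, NULL);
                retry = true;
                break;
            }
        }
    } while (retry);
}

static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                              int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

So the filter's read queues up behind any copy-before-write operation already
in flight for the same range, and then registers its own range so that a
copy-before-write triggered by a guest write waits until the read has
finished.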
> +
> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
> + uint64_t offset, uint64_t bytes,
> + QEMUIOVector *qiov, int flags)
> +{
> + return -EINVAL;
and you force this to be a read-only interface. (Does the block layer
actually require us to provide a pwritev callback, or can we leave it
NULL instead?)
> +BlockDriver bdrv_fleecing_filter = {
> + .format_name = "fleecing-filter",
> + .protocol_name = "fleecing-filter",
> + .instance_size = 0,
> +
> + .bdrv_open = fleecing_open,
> + .bdrv_close = fleecing_close,
> +
> + .bdrv_getlength = fleecing_getlength,
> + .bdrv_co_preadv = fleecing_co_preadv,
> + .bdrv_co_pwritev = fleecing_co_pwritev,
> +
> + .is_filter = true,
> + .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
> + .bdrv_child_perm = bdrv_filter_default_perms,
No .bdrv_co_block_status callback? That probably hurts querying for
sparse regions.
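A block-status callback could presumably reuse the generic forwarding helper
under the same CoW lock as the read path; an untested sketch in the style of
the patch (with the same caveat about digging the job out of the graph):

static int coroutine_fn fleecing_co_block_status(BlockDriverState *bs,
                                                 bool want_zero,
                                                 int64_t offset, int64_t bytes,
                                                 int64_t *pnum, int64_t *map,
                                                 BlockDriverState **file)
{
    int ret;
    BlockJob *job = bs->file->bs->backing->bs->job;
    CowRequest req;

    /* Keep the answer stable against a concurrent copy-on-write by the job */
    backup_wait_for_overlapping_requests(job, offset, bytes);
    backup_cow_request_begin(&req, job, offset, bytes);

    ret = bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
                                         pnum, map, file);

    backup_cow_request_end(&req);

    return ret;
}

wired up as .bdrv_co_block_status = fleecing_co_block_status in the
BlockDriver definition.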
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
On Fri, 06/29 12:24, Eric Blake wrote:
> On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > We need to synchronize backup job with reading from fleecing image
> > like it was done in block/replication.c.
> >
> > Otherwise, the following situation is theoretically possible:
> >
>
> Grammar suggestions:
>
> > 1. client start reading
>
> client starts reading
>
> > 2. client understand, that there is no corresponding cluster in
> > fleecing image
> > 3. client is going to read from backing file (i.e. active image)
>
> client sees that no corresponding cluster has been allocated in the fleecing
> image, so the request is forwarded to the backing file
>
> > 4. guest writes to active image
> > 5. this write is stopped by backup(sync=none) and cluster is copied to
> > fleecing image
> > 6. guest write continues...
> > 7. and client reads _new_ (or partly new) date from active image
>
> Interesting race. Can it actually happen, or does our read code already
> serialize writes to the same area while a read is underway?

Yes, I wonder why wait_serialising_requests() is not enough. If it's possible,
can we have a test case (with help of blkdebug, for example)?

>
> In short, I see what problem you are claiming exists: the moment the client
> starts reading from the backing file, that portion of the backing file must
> remain unchanged until after the client is done reading. But I don't know
> enough details of the block layer to know if this is actually a problem, or
> if adding the new filter is just overhead.

Fam
02.07.2018 09:35, Fam Zheng wrote:
> On Fri, 06/29 12:24, Eric Blake wrote:
>> On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> We need to synchronize backup job with reading from fleecing image
>>> like it was done in block/replication.c.
>>>
>>> Otherwise, the following situation is theoretically possible:
>>>
>> Grammar suggestions:
>>
>>> 1. client start reading
>> client starts reading
>>
>>> 2. client understand, that there is no corresponding cluster in
>>> fleecing image
>>> 3. client is going to read from backing file (i.e. active image)
>> client sees that no corresponding cluster has been allocated in the fleecing
>> image, so the request is forwarded to the backing file
>>
>>> 4. guest writes to active image
>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>> fleecing image
>>> 6. guest write continues...
>>> 7. and client reads _new_ (or partly new) date from active image
>> Interesting race. Can it actually happen, or does our read code already
>> serialize writes to the same area while a read is underway?
> Yes, I wonder why wait_serialising_requests() is not enough. If it's possible,
> can we have a test case (with help of blkdebug, for example)?

Hmm, only unaligned and COPY_ON_READ requests are marked serializing.. So If I
understand correctly, nothing special prevents intersection, it's a problem of
the guest. But in backup case, it's our problem.

>
>> In short, I see what problem you are claiming exists: the moment the client
>> starts reading from the backing file, that portion of the backing file must
>> remain unchanged until after the client is done reading. But I don't know
>> enough details of the block layer to know if this is actually a problem, or
>> if adding the new filter is just overhead.
> Fam

--
Best regards,
Vladimir
29.06.2018 20:24, Eric Blake wrote:
> On 06/29/2018 10:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>> We need to synchronize backup job with reading from fleecing image
>> like it was done in block/replication.c.
>>
>> Otherwise, the following situation is theoretically possible:
>>
>
> Grammar suggestions:
>
>> 1. client start reading
>
> client starts reading
>
>> 2. client understand, that there is no corresponding cluster in
>> fleecing image
>> 3. client is going to read from backing file (i.e. active image)
>
> client sees that no corresponding cluster has been allocated in the
> fleecing image, so the request is forwarded to the backing file
>
>> 4. guest writes to active image
>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>> fleecing image
>> 6. guest write continues...
>> 7. and client reads _new_ (or partly new) date from active image
>
> Interesting race. Can it actually happen, or does our read code
> already serialize writes to the same area while a read is underway?
>
> In short, I see what problem you are claiming exists: the moment the
> client starts reading from the backing file, that portion of the
> backing file must remain unchanged until after the client is done
> reading. But I don't know enough details of the block layer to know
> if this is actually a problem, or if adding the new filter is just
> overhead.
Looking at the code, here is a more realistic example (but I still have no reproducer):
1. the client starts reading, takes the qcow2 mutex in qcow2_co_preadv, and
gets as far as loading the L2 table (assume a cache miss)
2) guest write => backup COW => qcow2 write => tries to take the qcow2 mutex
=> waits
3. the L2 table is loaded, we see that the cluster is UNALLOCATED, go to "case
QCOW2_CLUSTER_UNALLOCATED" and unlock the mutex before
bdrv_co_preadv(bs->backing, ...)
4) aha, the mutex is unlocked, so the backup COW continues, and we finally
finish the guest write and change the cluster in our active disk
5. only now actually do bdrv_co_preadv(bs->backing, ...) and read the _new,
updated_ data.
>
>>
>> So, this fleecing-filter should be above fleecing image, the whole
>> picture of fleecing looks like this:
>>
>> +-------+ +------------+
>> | | | |
>> | guest | | NBD client +<------+
>> | | | | |
>> ++-----++ +------------+ |only read
>> | ^ |
>> | IO | |
>> v | +-----+------+
>> ++-----+---------+ | |
>> | | | internal |
>> | active image +----+ | NBD server |
>> | | | | |
>> +-+--------------+ |backup +-+----------+
>> ^ |sync=none ^
>> |backing | |only read
>> | | |
>> +-+--------------+ | +------+----------+
>> | | | | |
>> | fleecing image +<---+ | fleecing filter |
>> | | | |
>> +--------+-------+ +-----+-----------+
>> ^ |
>> | |
>> +--------------------------+
>> file
>
> Can you also show the sequence of QMP commands to set up this
> structure (or maybe you do in 3/3; which I haven't looked at yet).
>
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>> qapi/block-core.json | 6 ++--
>> block/fleecing-filter.c | 80
>> +++++++++++++++++++++++++++++++++++++++++++++++++
>> block/Makefile.objs | 1 +
>> 3 files changed, 85 insertions(+), 2 deletions(-)
>> create mode 100644 block/fleecing-filter.c
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 577ce5e999..43872c3d79 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2542,7 +2542,8 @@
>> 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd',
>> 'nfs',
>> 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
>> 'qcow2', 'qed',
>> 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
>> - 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat',
>> 'vxhs' ] }
>> + 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
>> + 'fleecing-filter' ] }
>
> Missing a 'since 3.0' documentation blurb; also, this enum has been
> kept sorted, so your new filter needs to come earlier.
>
>> ##
>> # @BlockdevOptionsFile:
>> @@ -3594,7 +3595,8 @@
>> 'vmdk': 'BlockdevOptionsGenericCOWFormat',
>> 'vpc': 'BlockdevOptionsGenericFormat',
>> 'vvfat': 'BlockdevOptionsVVFAT',
>> - 'vxhs': 'BlockdevOptionsVxHS'
>> + 'vxhs': 'BlockdevOptionsVxHS',
>> + 'fleecing-filter': 'BlockdevOptionsGenericFormat'
>
> Again, this has been kept sorted.
>
>> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
>> + uint64_t offset, uint64_t
>> bytes,
>> + QEMUIOVector *qiov, int flags)
>> +{
>> + int ret;
>> + BlockJob *job = bs->file->bs->backing->bs->job;
>> + CowRequest req;
>> +
>> + backup_wait_for_overlapping_requests(job, offset, bytes);
>> + backup_cow_request_begin(&req, job, offset, bytes);
>> +
>> + ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
>> +
>> + backup_cow_request_end(&req);
>> +
>> + return ret;
>> +}
>
> So the idea here is that you force a serializing request to ensure
> that there are no other writes to the area in the meantime.
>
>> +
>> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
>> + uint64_t offset,
>> uint64_t bytes,
>> + QEMUIOVector *qiov, int flags)
>> +{
>> + return -EINVAL;
>
> and you force this to be a read-only interface. (Does the block layer
> actually require us to provide a pwritev callback, or can we leave it
> NULL instead?)
>
>> +BlockDriver bdrv_fleecing_filter = {
>> + .format_name = "fleecing-filter",
>> + .protocol_name = "fleecing-filter",
>> + .instance_size = 0,
>> +
>> + .bdrv_open = fleecing_open,
>> + .bdrv_close = fleecing_close,
>> +
>> + .bdrv_getlength = fleecing_getlength,
>> + .bdrv_co_preadv = fleecing_co_preadv,
>> + .bdrv_co_pwritev = fleecing_co_pwritev,
>> +
>> + .is_filter = true,
>> + .bdrv_recurse_is_first_non_filter =
>> fleecing_recurse_is_first_non_filter,
>> + .bdrv_child_perm = bdrv_filter_default_perms,
>
> No .bdrv_co_block_status callback? That probably hurts querying for
> sparse regions.
>
hm, worth adding... and it possibly needs synchronization with backup too.
--
Best regards,
Vladimir
On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> We need to synchronize backup job with reading from fleecing image
> like it was done in block/replication.c.
>
> Otherwise, the following situation is theoretically possible:
>
> 1. client start reading
> 2. client understand, that there is no corresponding cluster in
> fleecing image
I don't think the client refocuses the read, but QEMU does. (the client
has no idea if it is reading from the fleecing node or the backing file.)
... but understood:
> 3. client is going to read from backing file (i.e. active image)
> 4. guest writes to active image
My question here is if QEMU will allow reads and writes to interleave in
general. If the client is reading from the backing file, the active
image, will QEMU allow a write to modify that data before we're done
getting that data?
> 5. this write is stopped by backup(sync=none) and cluster is copied to
> fleecing image
> 6. guest write continues...
> 7. and client reads _new_ (or partly new) date from active image
>
I can't answer this for myself one way or the other right now, but I
imagine you observed a failure in practice in your test setups, which
motivated this patch?
A test or some proof would help justification for this patch. It would
also help prove that it solves what it claims to!
> So, this fleecing-filter should be above fleecing image, the whole
> picture of fleecing looks like this:
>
> +-------+ +------------+
> | | | |
> | guest | | NBD client +<------+
> | | | | |
> ++-----++ +------------+ |only read
> | ^ |
> | IO | |
> v | +-----+------+
> ++-----+---------+ | |
> | | | internal |
> | active image +----+ | NBD server |
> | | | | |
> +-+--------------+ |backup +-+----------+
> ^ |sync=none ^
> |backing | |only read
> | | |
> +-+--------------+ | +------+----------+
> | | | | |
> | fleecing image +<---+ | fleecing filter |
> | | | |
> +--------+-------+ +-----+-----------+
> ^ |
> | |
> +--------------------------+
> file
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> qapi/block-core.json | 6 ++--
> block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
> block/Makefile.objs | 1 +
> 3 files changed, 85 insertions(+), 2 deletions(-)
> create mode 100644 block/fleecing-filter.c
>
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 577ce5e999..43872c3d79 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -2542,7 +2542,8 @@
> 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
> 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
> 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
> - 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
> + 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
> + 'fleecing-filter' ] }
>
> ##
> # @BlockdevOptionsFile:
> @@ -3594,7 +3595,8 @@
> 'vmdk': 'BlockdevOptionsGenericCOWFormat',
> 'vpc': 'BlockdevOptionsGenericFormat',
> 'vvfat': 'BlockdevOptionsVVFAT',
> - 'vxhs': 'BlockdevOptionsVxHS'
> + 'vxhs': 'BlockdevOptionsVxHS',
> + 'fleecing-filter': 'BlockdevOptionsGenericFormat'
> } }
>
> ##
> diff --git a/block/fleecing-filter.c b/block/fleecing-filter.c
> new file mode 100644
> index 0000000000..b501887c10
> --- /dev/null
> +++ b/block/fleecing-filter.c
> @@ -0,0 +1,80 @@
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "block/blockjob.h"
> +#include "block/block_int.h"
> +#include "block/block_backup.h"
> +
> +static int64_t fleecing_getlength(BlockDriverState *bs)
> +{
> + return bdrv_getlength(bs->file->bs);
> +}
> +
> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
> + uint64_t offset, uint64_t bytes,
> + QEMUIOVector *qiov, int flags)
> +{
> + int ret;
> + BlockJob *job = bs->file->bs->backing->bs->job;
> + CowRequest req;
> +
> + backup_wait_for_overlapping_requests(job, offset, bytes);
> + backup_cow_request_begin(&req, job, offset, bytes);
> +
> + ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
> +
> + backup_cow_request_end(&req);
> +
> + return ret;
> +}
> +
> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
> + uint64_t offset, uint64_t bytes,
> + QEMUIOVector *qiov, int flags)
> +{
> + return -EINVAL;
> +}
> +
> +static bool fleecing_recurse_is_first_non_filter(BlockDriverState *bs,
> + BlockDriverState *candidate)
> +{
> + return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
> +}
> +
> +static int fleecing_open(BlockDriverState *bs, QDict *options,
> + int flags, Error **errp)
> +{
> + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
> + errp);
> +
> + return bs->file ? 0 : -EINVAL;
> +}
> +
> +static void fleecing_close(BlockDriverState *bs)
> +{
> + /* Do nothing, we have to add .bdrv_close, because bdrv_close() don't check
> + * it, just call. */
> +}
> +
> +BlockDriver bdrv_fleecing_filter = {
> + .format_name = "fleecing-filter",
> + .protocol_name = "fleecing-filter",
> + .instance_size = 0,
> +
> + .bdrv_open = fleecing_open,
> + .bdrv_close = fleecing_close,
> +
> + .bdrv_getlength = fleecing_getlength,
> + .bdrv_co_preadv = fleecing_co_preadv,
> + .bdrv_co_pwritev = fleecing_co_pwritev,
> +
> + .is_filter = true,
> + .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
> + .bdrv_child_perm = bdrv_filter_default_perms,
> +};
> +
> +static void bdrv_fleecing_init(void)
> +{
> + bdrv_register(&bdrv_fleecing_filter);
> +}
> +
> +block_init(bdrv_fleecing_init);
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 899bfb5e2c..aa0a6dd971 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -27,6 +27,7 @@ block-obj-y += write-threshold.o
> block-obj-y += backup.o
> block-obj-$(CONFIG_REPLICATION) += replication.o
> block-obj-y += throttle.o copy-on-read.o
> +block-obj-y += fleecing-filter.o
>
> block-obj-y += crypto.o
>
>
On 06/29/2018 12:30 PM, John Snow wrote:
>
>
> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>> We need to synchronize backup job with reading from fleecing image
>> like it was done in block/replication.c.
>>
>> Otherwise, the following situation is theoretically possible:
>>
>> 1. client start reading
>> 2. client understand, that there is no corresponding cluster in
>> fleecing image
>
> I don't think the client refocuses the read, but QEMU does. (the client
> has no idea if it is reading from the fleecing node or the backing file.)
>
> ... but understood:
>
>> 3. client is going to read from backing file (i.e. active image)
>> 4. guest writes to active image
>
> My question here is if QEMU will allow reads and writes to interleave in
> general. If the client is reading from the backing file, the active
> image, will QEMU allow a write to modify that data before we're done
> getting that data?
>
>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>> fleecing image
>> 6. guest write continues...
>> 7. and client reads _new_ (or partly new) date from active image
>>
>
> I can't answer this for myself one way or the other right now, but I
> imagine you observed a failure in practice in your test setups, which
> motivated this patch?
>
> A test or some proof would help justification for this patch. It would
> also help prove that it solves what it claims to!

In fact, do we really need a new filter, or do we just need to make the
"sync":"none" blockdev-backup job take the appropriate synchronization
locks?

--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org
29.06.2018 20:40, Eric Blake wrote:
> On 06/29/2018 12:30 PM, John Snow wrote:
>>
>>
>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> We need to synchronize backup job with reading from fleecing image
>>> like it was done in block/replication.c.
>>>
>>> Otherwise, the following situation is theoretically possible:
>>>
>>> 1. client start reading
>>> 2. client understand, that there is no corresponding cluster in
>>> fleecing image
>>
>> I don't think the client refocuses the read, but QEMU does. (the client
>> has no idea if it is reading from the fleecing node or the backing
>> file.)
>>
>> ... but understood:
>>
>>> 3. client is going to read from backing file (i.e. active image)
>>> 4. guest writes to active image
>>
>> My question here is if QEMU will allow reads and writes to interleave in
>> general. If the client is reading from the backing file, the active
>> image, will QEMU allow a write to modify that data before we're done
>> getting that data?
>>
>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>> fleecing image
>>> 6. guest write continues...
>>> 7. and client reads _new_ (or partly new) date from active image
>>>
>>
>> I can't answer this for myself one way or the other right now, but I
>> imagine you observed a failure in practice in your test setups, which
>> motivated this patch?
>>
>> A test or some proof would help justification for this patch. It would
>> also help prove that it solves what it claims to!
>
> In fact, do we really need a new filter, or do we just need to make the
> "sync":"none" blockdev-backup job take the appropriate synchronization
> locks?
>

How? We'll need additional mechanism like serializing requests.. Or a way to
reuse serializing requests. Using backup internal synchronization looks
simpler, and it is already used in block replication.

On the other hand, I think the best thing is instead refuse backup(sync=none),
and do the whole functionality for external backup in the special filter
driver, but for now I'd prefer to start with already existing staff.

--
Best regards,
Vladimir
Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 29.06.2018 20:40, Eric Blake wrote:
> > On 06/29/2018 12:30 PM, John Snow wrote:
> > >
> > >
> > > On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > > > We need to synchronize backup job with reading from fleecing image
> > > > like it was done in block/replication.c.
> > > >
> > > > Otherwise, the following situation is theoretically possible:
> > > >
> > > > 1. client start reading
> > > > 2. client understand, that there is no corresponding cluster in
> > > > fleecing image
> > > I don't think the client refocuses the read, but QEMU does. (the client
> > > has no idea if it is reading from the fleecing node or the backing
> > > file.)
> > >
> > > ... but understood:
> > >
> > > > 3. client is going to read from backing file (i.e. active image)
> > > > 4. guest writes to active image
> > > My question here is if QEMU will allow reads and writes to interleave in
> > > general. If the client is reading from the backing file, the active
> > > image, will QEMU allow a write to modify that data before we're done
> > > getting that data?
> > >
> > > > 5. this write is stopped by backup(sync=none) and cluster is copied to
> > > > fleecing image
> > > > 6. guest write continues...
> > > > 7. and client reads _new_ (or partly new) date from active image
> > > >
> > >
> > > I can't answer this for myself one way or the other right now, but I
> > > imagine you observed a failure in practice in your test setups, which
> > > motivated this patch?
> > >
> > > A test or some proof would help justification for this patch. It would
> > > also help prove that it solves what it claims to!
> >
> > In fact, do we really need a new filter, or do we just need to make the
> > "sync":"none" blockdev-backup job take the appropriate synchronization
> > locks?
> >
>
> How? We'll need additional mechanism like serializing requests.. Or a way to
> reuse serializing requests. Using backup internal synchronization looks
> simpler, and it is already used in block replication.

But it also just an ugly hack that fixes one special case and leaves
everything else broken. replication is usually not a good example for
anything. It always gives me bad surprises when I have to look at it.

We'll have to figure out where to fix this problem (or what it really
is, once you look more than just at fleecing), but I think requiring the
user to add a filter driver to work around missing serialisation in
other code, and corrupting their image if they forget to, is not a
reasonable solution.

I see at least two things wrong in this context:

* The permissions don't seem to match reality. The NBD server
  unconditionally shares PERM_WRITE, which is wrong in this case. The
  client wants to see a point-in-time snapshot that never changes. This
  should become an option so that it can be properly reflected in the
  permissions used.

* Once we have proper permissions, the fleecing setup breaks down
  because the guest needs PERM_WRITE on the backing file, but the
  fleecing overlay allows that only if the NBD client allows it (which
  it doesn't for fleecing).

Now we can implement an exception right into backup that installs a
backup filter driver between source and target if the source is the
backing file of the target. The filter driver would be similar to the
commit filter driver in that it simply promises !PERM_WRITE to its
parents, but allows PERM_WRITE on the source because it has installed
the before_write_notifier that guarantees this condition.

All writes to the target that are made by the backup job in this setup
(including before_write_notifier writes) need to be marked as
serialising so that any concurrent reads are completed first.

And if we decide to add a target filter to backup, we should probably at
the same time use a filter driver for intercepting source writes instead
of using before_write_notifier.

Max, I think you intended to make both source and target children of the
same block job node (or at least for mirror). But wouldn't that create
loops in a setup like this? I think we may need two filters that are
only connected through the block job, but not with block graph edges.

Kevin
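To illustrate the kind of filter Kevin describes (a sketch of the idea only,
not code from any posted series): such a node would roughly mirror what the
commit filter's .bdrv_child_perm callback does in block/commit.c, taking no
permissions of its own on the image and not restricting the other parents,
because the copy-before-write it performs is what actually guarantees the
unchanging view:

/* Hypothetical .bdrv_child_perm for a backup/fleecing filter node */
static void backup_filter_child_perm(BlockDriverState *bs, BdrvChild *c,
                                     const BdrvChildRole *role,
                                     BlockReopenQueue *reopen_queue,
                                     uint64_t perm, uint64_t shared,
                                     uint64_t *nperm, uint64_t *nshared)
{
    /* The filter itself needs no permissions on its child ... */
    *nperm = 0;
    /* ... and does not restrict other users, in particular BLK_PERM_WRITE
     * for the guest device: writes are intercepted and the old data copied
     * away before they reach the image. */
    *nshared = BLK_PERM_ALL;
}

The real permission arithmetic for the fleecing case would need more care
than this (it has to interact with what the NBD export requests), but this is
the commit-filter pattern Kevin refers to.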
03.07.2018 14:15, Kevin Wolf wrote:
> Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 29.06.2018 20:40, Eric Blake wrote:
>>> On 06/29/2018 12:30 PM, John Snow wrote:
>>>>
>>>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> We need to synchronize backup job with reading from fleecing image
>>>>> like it was done in block/replication.c.
>>>>>
>>>>> Otherwise, the following situation is theoretically possible:
>>>>>
>>>>> 1. client start reading
>>>>> 2. client understand, that there is no corresponding cluster in
>>>>> fleecing image
>>>> I don't think the client refocuses the read, but QEMU does. (the client
>>>> has no idea if it is reading from the fleecing node or the backing
>>>> file.)
>>>>
>>>> ... but understood:
>>>>
>>>>> 3. client is going to read from backing file (i.e. active image)
>>>>> 4. guest writes to active image
>>>> My question here is if QEMU will allow reads and writes to interleave in
>>>> general. If the client is reading from the backing file, the active
>>>> image, will QEMU allow a write to modify that data before we're done
>>>> getting that data?
>>>>
>>>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>>> fleecing image
>>>>> 6. guest write continues...
>>>>> 7. and client reads _new_ (or partly new) date from active image
>>>>>
>>>> I can't answer this for myself one way or the other right now, but I
>>>> imagine you observed a failure in practice in your test setups, which
>>>> motivated this patch?
>>>>
>>>> A test or some proof would help justification for this patch. It would
>>>> also help prove that it solves what it claims to!
>>> In fact, do we really need a new filter, or do we just need to make the
>>> "sync":"none" blockdev-backup job take the appropriate synchronization
>>> locks?
>>>
>> How? We'll need additional mechanism like serializing requests.. Or a way to
>> reuse serializing requests. Using backup internal synchronization looks
>> simpler, and it is already used in block replication.
> But it also just an ugly hack

agree.

> that fixes one special case and leaves
> everything else broken. replication is usually not a good example for
> anything. It always gives me bad surprises when I have to look at it.
>
> We'll have to figure out where to fix this problem (or what it really
> is, once you look more than just at fleecing), but I think requiring the
> user to add a filter driver to work around missing serialisation in
> other code, and corrupting their image if they forget to, is not a
> reasonable solution.
>
> I see at least two things wrong in this context:
>
> * The permissions don't seem to match reality. The NBD server
>   unconditionally shares PERM_WRITE, which is wrong in this case. The
>   client wants to see a point-in-time snapshot that never changes. This
>   should become an option so that it can be properly reflected in the
>   permissions used.
>
> * Once we have proper permissions, the fleecing setup breaks down
>   because the guest needs PERM_WRITE on the backing file, but the
>   fleecing overlay allows that only if the NBD client allows it (which
>   it doesn't for fleecing).
>
> Now we can implement an exception right into backup that installs a
> backup filter driver between source and target if the source is the
> backing file of the target. The filter driver would be similar to the
> commit filter driver in that it simply promises !PERM_WRITE to its
> parents, but allows PERM_WRITE on the source because it has installed
> the before_write_notifier that guarantees this condition.
>
> All writes to the target that are made by the backup job in this setup
> (including before_write_notifier writes) need to be marked as
> serialising so that any concurrent reads are completed first.
>
> And if we decide to add a target filter to backup, we should probably at
> the same time use a filter driver for intercepting source writes instead
> of using before_write_notifier.

and this is the point, where we can drop backup job at all from fleecing
scheme, as actually backup(sync=none) == such special filter driver

>
> Max, I think you intended to make both source and target children of the
> same block job node (or at least for mirror). But wouldn't that create
> loops in a setup like this? I think we may need two filters that are
> only connected through the block job, but not with block graph edges.
>
> Kevin

--
Best regards,
Vladimir
03.07.2018 14:15, Kevin Wolf wrote:
> Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 29.06.2018 20:40, Eric Blake wrote:
>>> On 06/29/2018 12:30 PM, John Snow wrote:
>>>>
>>>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> We need to synchronize backup job with reading from fleecing image
>>>>> like it was done in block/replication.c.
>>>>>
>>>>> Otherwise, the following situation is theoretically possible:
>>>>>
>>>>> 1. client start reading
>>>>> 2. client understand, that there is no corresponding cluster in
>>>>> fleecing image
>>>> I don't think the client refocuses the read, but QEMU does. (the client
>>>> has no idea if it is reading from the fleecing node or the backing
>>>> file.)
>>>>
>>>> ... but understood:
>>>>
>>>>> 3. client is going to read from backing file (i.e. active image)
>>>>> 4. guest writes to active image
>>>> My question here is if QEMU will allow reads and writes to interleave in
>>>> general. If the client is reading from the backing file, the active
>>>> image, will QEMU allow a write to modify that data before we're done
>>>> getting that data?
>>>>
>>>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>>> fleecing image
>>>>> 6. guest write continues...
>>>>> 7. and client reads _new_ (or partly new) date from active image
>>>>>
>>>> I can't answer this for myself one way or the other right now, but I
>>>> imagine you observed a failure in practice in your test setups, which
>>>> motivated this patch?
>>>>
>>>> A test or some proof would help justification for this patch. It would
>>>> also help prove that it solves what it claims to!
>>> In fact, do we really need a new filter, or do we just need to make the
>>> "sync":"none" blockdev-backup job take the appropriate synchronization
>>> locks?
>>>
>> How? We'll need additional mechanism like serializing requests.. Or a way to
>> reuse serializing requests. Using backup internal synchronization looks
>> simpler, and it is already used in block replication.
> But it also just an ugly hack that fixes one special case and leaves
> everything else broken. replication is usually not a good example for
> anything. It always gives me bad surprises when I have to look at it.
>
> We'll have to figure out where to fix this problem (or what it really
> is, once you look more than just at fleecing), but I think requiring the
> user to add a filter driver to work around missing serialisation in
> other code, and corrupting their image if they forget to, is not a
> reasonable solution.
>
> I see at least two things wrong in this context:
>
> * The permissions don't seem to match reality. The NBD server
>   unconditionally shares PERM_WRITE, which is wrong in this case. The
>   client wants to see a point-in-time snapshot that never changes. This
>   should become an option so that it can be properly reflected in the
>   permissions used.
>
> * Once we have proper permissions, the fleecing setup breaks down
>   because the guest needs PERM_WRITE on the backing file, but the
>   fleecing overlay allows that only if the NBD client allows it (which
>   it doesn't for fleecing).
>
> Now we can implement an exception right into backup that installs a
> backup filter driver between source and target if the source is the
> backing file of the target. The filter driver would be similar to the
> commit filter driver in that it simply promises !PERM_WRITE to its
> parents, but allows PERM_WRITE on the source because it has installed
> the before_write_notifier that guarantees this condition.
>
> All writes to the target that are made by the backup job in this setup
> (including before_write_notifier writes) need to be marked as
> serialising so that any concurrent reads are completed first.
>
> And if we decide to add a target filter to backup, we should probably at
> the same time use a filter driver for intercepting source writes instead
> of using before_write_notifier.

Hmm, is it possible to do all the staff in one super filter driver, which we
insert into the tree like this:

top blk    fleecing qcow2
   +           +
   |           |backing
   v     <-----+
super filter
   +
   |file
   v
active image

And super filter do the following:

1. copy-on-write, before forwarding write to file, it do serializing write
to fleecing qcow2
2. fake .bdrv_child_perm for fleecing qcow2, like in block commit

and no block job is needed.

>
> Max, I think you intended to make both source and target children of the
> same block job node (or at least for mirror). But wouldn't that create
> loops in a setup like this? I think we may need two filters that are
> only connected through the block job, but not with block graph edges.
>
> Kevin

--
Best regards,
Vladimir
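As a sketch of what point 1 above could mean in code (hypothetical only: as
Kevin points out in the next message, the node as drawn has no child edge to
the fleecing qcow2, so this simply assumes the filter somehow has a second
child s->fleecing it can write to):

static coroutine_fn int super_filter_co_pwritev(BlockDriverState *bs,
                                                uint64_t offset, uint64_t bytes,
                                                QEMUIOVector *qiov, int flags)
{
    BDRVSuperFilterState *s = bs->opaque;    /* hypothetical driver state */
    int64_t pnum;
    int ret;

    /* Copy-before-write: save the old contents into the fleecing image
     * before the guest write goes through, but only once per area --
     * otherwise a second write would overwrite the point-in-time data.
     * Cluster alignment and partially copied ranges are glossed over. */
    ret = bdrv_is_allocated(s->fleecing->bs, offset, bytes, &pnum);
    if (ret < 0) {
        return ret;
    }
    if (ret == 0) {
        void *buf = qemu_blockalign(bs, bytes);
        QEMUIOVector old_data;

        qemu_iovec_init(&old_data, 1);
        qemu_iovec_add(&old_data, buf, bytes);

        ret = bdrv_co_preadv(bs->file, offset, bytes, &old_data, 0);
        if (ret >= 0) {
            /* This write has to be serializing w.r.t. fleecing readers */
            ret = bdrv_co_pwritev(s->fleecing, offset, bytes, &old_data, 0);
        }
        qemu_vfree(buf);
        qemu_iovec_destroy(&old_data);
        if (ret < 0) {
            return ret;
        }
    }

    /* Only now let the guest write reach the active image */
    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
}

This is essentially what backup_do_cow() in block/backup.c already does via
the before_write_notifier, which is why backup(sync=none) and such a filter
are interchangeable in this scheme.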
Am 03.07.2018 um 18:11 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 03.07.2018 14:15, Kevin Wolf wrote:
> > We'll have to figure out where to fix this problem (or what it really
> > is, once you look more than just at fleecing), but I think requiring the
> > user to add a filter driver to work around missing serialisation in
> > other code, and corrupting their image if they forget to, is not a
> > reasonable solution.
> >
> > I see at least two things wrong in this context:
> >
> > * The permissions don't seem to match reality. The NBD server
> >   unconditionally shares PERM_WRITE, which is wrong in this case. The
> >   client wants to see a point-in-time snapshot that never changes. This
> >   should become an option so that it can be properly reflected in the
> >   permissions used.
> >
> > * Once we have proper permissions, the fleecing setup breaks down
> >   because the guest needs PERM_WRITE on the backing file, but the
> >   fleecing overlay allows that only if the NBD client allows it (which
> >   it doesn't for fleecing).
> >
> > Now we can implement an exception right into backup that installs a
> > backup filter driver between source and target if the source is the
> > backing file of the target. The filter driver would be similar to the
> > commit filter driver in that it simply promises !PERM_WRITE to its
> > parents, but allows PERM_WRITE on the source because it has installed
> > the before_write_notifier that guarantees this condition.
> >
> > All writes to the target that are made by the backup job in this setup
> > (including before_write_notifier writes) need to be marked as
> > serialising so that any concurrent reads are completed first.
> >
> > And if we decide to add a target filter to backup, we should probably at
> > the same time use a filter driver for intercepting source writes instead
> > of using before_write_notifier.
>
> Hmm, is it possible to do all the staff in one super filter driver, which we
> insert into the tree like this:
>
> top blk    fleecing qcow2
>    +           +
>    |           |backing
>    v     <-----+
> super filter
>    +
>    |file
>    v
> active image
>
> And super filter do the following:
>
> 1. copy-on-write, before forwarding write to file, it do serializing write
> to fleecing qcow2

This is where it breaks down. The filter driver in your graph doesn't know
fleecing.qcow2, so it can't write to it. Attaching fleecing.qcow2 as an
additional child to the super filter doesn't work either because you would
create a loop then.

I think we need two separate nodes (and probably it's better to have them
managed by a block job so that both together can be checked to result in a
consistent setup).

> 2. fake .bdrv_child_perm for fleecing qcow2, like in block commit
>
> and no block job is needed.

Kevin
On 2018-07-03 13:15, Kevin Wolf wrote:
> Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 29.06.2018 20:40, Eric Blake wrote:
>>> On 06/29/2018 12:30 PM, John Snow wrote:
>>>>
>>>>
>>>> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> We need to synchronize backup job with reading from fleecing image
>>>>> like it was done in block/replication.c.
>>>>>
>>>>> Otherwise, the following situation is theoretically possible:
>>>>>
>>>>> 1. client start reading
>>>>> 2. client understand, that there is no corresponding cluster in
>>>>> fleecing image
>>>>
>>>> I don't think the client refocuses the read, but QEMU does. (the client
>>>> has no idea if it is reading from the fleecing node or the backing
>>>> file.)
>>>>
>>>> ... but understood:
>>>>
>>>>> 3. client is going to read from backing file (i.e. active image)
>>>>> 4. guest writes to active image
>>>>
>>>> My question here is if QEMU will allow reads and writes to interleave in
>>>> general. If the client is reading from the backing file, the active
>>>> image, will QEMU allow a write to modify that data before we're done
>>>> getting that data?
>>>>
>>>>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>>>>> fleecing image
>>>>> 6. guest write continues...
>>>>> 7. and client reads _new_ (or partly new) date from active image
>>>>>
>>>>
>>>> I can't answer this for myself one way or the other right now, but I
>>>> imagine you observed a failure in practice in your test setups, which
>>>> motivated this patch?
>>>>
>>>> A test or some proof would help justification for this patch. It would
>>>> also help prove that it solves what it claims to!
>>>
>>> In fact, do we really need a new filter, or do we just need to make the
>>> "sync":"none" blockdev-backup job take the appropriate synchronization
>>> locks?
>>>
>>
>> How? We'll need additional mechanism like serializing requests.. Or a way to
>> reuse serializing requests. Using backup internal synchronization looks
>> simpler, and it is already used in block replication.
>
> But it also just an ugly hack that fixes one special case and leaves
> everything else broken. replication is usually not a good example for
> anything. It always gives me bad surprises when I have to look at it.
>
> We'll have to figure out where to fix this problem (or what it really
> is, once you look more than just at fleecing), but I think requiring the
> user to add a filter driver to work around missing serialisation in
> other code, and corrupting their image if they forget to, is not a
> reasonable solution.
>
> I see at least two things wrong in this context:
>
> * The permissions don't seem to match reality. The NBD server
>   unconditionally shares PERM_WRITE, which is wrong in this case. The
>   client wants to see a point-in-time snapshot that never changes. This
>   should become an option so that it can be properly reflected in the
>   permissions used.
>
> * Once we have proper permissions, the fleecing setup breaks down
>   because the guest needs PERM_WRITE on the backing file, but the
>   fleecing overlay allows that only if the NBD client allows it (which
>   it doesn't for fleecing).
>
> Now we can implement an exception right into backup that installs a
> backup filter driver between source and target if the source is the
> backing file of the target. The filter driver would be similar to the
> commit filter driver in that it simply promises !PERM_WRITE to its
> parents, but allows PERM_WRITE on the source because it has installed
> the before_write_notifier that guarantees this condition.

I don't quite understand what you mean "between source and target", because
*the* backup filter driver would need to be on top of both. Unless you
continue to use before-write notifiers, but I don't see why when a driver
that sits on top of source would intercept all requests anyway.

And if we want to unify mirror and backup at any time, I think we have to
get away from before-write anyway.

> All writes to the target that are made by the backup job in this setup
> (including before_write_notifier writes) need to be marked as
> serialising so that any concurrent reads are completed first.
>
> And if we decide to add a target filter to backup, we should probably at
> the same time use a filter driver for intercepting source writes instead
> of using before_write_notifier.
>
> Max, I think you intended to make both source and target children of the
> same block job node (or at least for mirror). But wouldn't that create
> loops in a setup like this? I think we may need two filters that are
> only connected through the block job, but not with block graph edges.

A node can only provide one data view to all of its parents. It cannot
distinguish based on the parent. Therefore, the main backup driver on top
of source and target would necessarily present the current disk (changing)
state, just like the mirror filter does.

Fleecing wants another view, though. It wants an unchanging view. It
therefore seems clear to me that we'd need to separate nodes: One between
guest and source (which performs the backup by intercepting write
requests), and one between the fleecing user (e.g. NBD) and the target.
The former presents the current state, the latter presents the
"unchanging" point-in-time snapshot.

Sure, we can get away without the former (we do so today), but I don't see
why we would want to. blockdev-copy (as an extension of blockdev-mirror)
would have such a node anyway, and I can see no reason why we wouldn't
attach both source and target to it.

Max
29.06.2018 20:30, John Snow wrote:
>
> On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
>> We need to synchronize backup job with reading from fleecing image
>> like it was done in block/replication.c.
>>
>> Otherwise, the following situation is theoretically possible:
>>
>> 1. client start reading
>> 2. client understand, that there is no corresponding cluster in
>> fleecing image
> I don't think the client refocuses the read, but QEMU does. (the client
> has no idea if it is reading from the fleecing node or the backing file.)
>
> ... but understood:
>
>> 3. client is going to read from backing file (i.e. active image)
>> 4. guest writes to active image
> My question here is if QEMU will allow reads and writes to interleave in
> general. If the client is reading from the backing file, the active
> image, will QEMU allow a write to modify that data before we're done
> getting that data?
If I understand correctly: yes. Physical drives allows intersecting
operations too and there no guarantee on result. We have serializing
requests, but in general only unaligned requests are marked serializing.
>
>> 5. this write is stopped by backup(sync=none) and cluster is copied to
>> fleecing image
>> 6. guest write continues...
>> 7. and client reads _new_ (or partly new) date from active image
>>
> I can't answer this for myself one way or the other right now, but I
> imagine you observed a failure in practice in your test setups, which
> motivated this patch?
No. I just found this code in QEMU: block/replication already uses the
same scheme with backup(sync=none), and it uses the same synchronization
as this patch does. I've discussed it on the list a bit:
http://lists.nongnu.org/archive/html/qemu-devel/2017-08/msg01807.html
https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg03025.html
>
> A test or some proof would help justification for this patch. It would
> also help prove that it solves what it claims to!
Agree, a reproducer is better than any theoretical reflection. I'm
thinking about it, but I don't see a simple solution...
>
>> So, this fleecing-filter should be above fleecing image, the whole
>> picture of fleecing looks like this:
>>
>> +-------+ +------------+
>> | | | |
>> | guest | | NBD client +<------+
>> | | | | |
>> ++-----++ +------------+ |only read
>> | ^ |
>> | IO | |
>> v | +-----+------+
>> ++-----+---------+ | |
>> | | | internal |
>> | active image +----+ | NBD server |
>> | | | | |
>> +-+--------------+ |backup +-+----------+
>> ^ |sync=none ^
>> |backing | |only read
>> | | |
>> +-+--------------+ | +------+----------+
>> | | | | |
>> | fleecing image +<---+ | fleecing filter |
>> | | | |
>> +--------+-------+ +-----+-----------+
>> ^ |
>> | |
>> +--------------------------+
>> file
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>> qapi/block-core.json | 6 ++--
>> block/fleecing-filter.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
>> block/Makefile.objs | 1 +
>> 3 files changed, 85 insertions(+), 2 deletions(-)
>> create mode 100644 block/fleecing-filter.c
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index 577ce5e999..43872c3d79 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -2542,7 +2542,8 @@
>> 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
>> 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
>> 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
>> - 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
>> + 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs',
>> + 'fleecing-filter' ] }
>>
>> ##
>> # @BlockdevOptionsFile:
>> @@ -3594,7 +3595,8 @@
>> 'vmdk': 'BlockdevOptionsGenericCOWFormat',
>> 'vpc': 'BlockdevOptionsGenericFormat',
>> 'vvfat': 'BlockdevOptionsVVFAT',
>> - 'vxhs': 'BlockdevOptionsVxHS'
>> + 'vxhs': 'BlockdevOptionsVxHS',
>> + 'fleecing-filter': 'BlockdevOptionsGenericFormat'
>> } }
>>
>> ##
>> diff --git a/block/fleecing-filter.c b/block/fleecing-filter.c
>> new file mode 100644
>> index 0000000000..b501887c10
>> --- /dev/null
>> +++ b/block/fleecing-filter.c
>> @@ -0,0 +1,80 @@
>> +#include "qemu/osdep.h"
>> +#include "qemu-common.h"
>> +#include "block/blockjob.h"
>> +#include "block/block_int.h"
>> +#include "block/block_backup.h"
>> +
>> +static int64_t fleecing_getlength(BlockDriverState *bs)
>> +{
>> + return bdrv_getlength(bs->file->bs);
>> +}
>> +
>> +static coroutine_fn int fleecing_co_preadv(BlockDriverState *bs,
>> + uint64_t offset, uint64_t bytes,
>> + QEMUIOVector *qiov, int flags)
>> +{
>> + int ret;
>> + BlockJob *job = bs->file->bs->backing->bs->job;
>> + CowRequest req;
>> +
>> + backup_wait_for_overlapping_requests(job, offset, bytes);
>> + backup_cow_request_begin(&req, job, offset, bytes);
>> +
>> + ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
>> +
>> + backup_cow_request_end(&req);
>> +
>> + return ret;
>> +}
>> +
>> +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs,
>> + uint64_t offset, uint64_t bytes,
>> + QEMUIOVector *qiov, int flags)
>> +{
>> + return -EINVAL;
>> +}
>> +
>> +static bool fleecing_recurse_is_first_non_filter(BlockDriverState *bs,
>> + BlockDriverState *candidate)
>> +{
>> + return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate);
>> +}
>> +
>> +static int fleecing_open(BlockDriverState *bs, QDict *options,
>> + int flags, Error **errp)
>> +{
>> + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
>> + errp);
>> +
>> + return bs->file ? 0 : -EINVAL;
>> +}
>> +
>> +static void fleecing_close(BlockDriverState *bs)
>> +{
>> + /* Do nothing, we have to add .bdrv_close, because bdrv_close() don't check
>> + * it, just call. */
>> +}
>> +
>> +BlockDriver bdrv_fleecing_filter = {
>> + .format_name = "fleecing-filter",
>> + .protocol_name = "fleecing-filter",
>> + .instance_size = 0,
>> +
>> + .bdrv_open = fleecing_open,
>> + .bdrv_close = fleecing_close,
>> +
>> + .bdrv_getlength = fleecing_getlength,
>> + .bdrv_co_preadv = fleecing_co_preadv,
>> + .bdrv_co_pwritev = fleecing_co_pwritev,
>> +
>> + .is_filter = true,
>> + .bdrv_recurse_is_first_non_filter = fleecing_recurse_is_first_non_filter,
>> + .bdrv_child_perm = bdrv_filter_default_perms,
>> +};
>> +
>> +static void bdrv_fleecing_init(void)
>> +{
>> + bdrv_register(&bdrv_fleecing_filter);
>> +}
>> +
>> +block_init(bdrv_fleecing_init);
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index 899bfb5e2c..aa0a6dd971 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -27,6 +27,7 @@ block-obj-y += write-threshold.o
>> block-obj-y += backup.o
>> block-obj-$(CONFIG_REPLICATION) += replication.o
>> block-obj-y += throttle.o copy-on-read.o
>> +block-obj-y += fleecing-filter.o
>>
>> block-obj-y += crypto.o
>>
>>
--
Best regards,
Vladimir
Am 02.07.2018 um 13:57 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 29.06.2018 20:30, John Snow wrote:
> >
> > On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > > We need to synchronize backup job with reading from fleecing image
> > > like it was done in block/replication.c.
> > >
> > > Otherwise, the following situation is theoretically possible:
> > >
> > > 1. client start reading
> > > 2. client understand, that there is no corresponding cluster in
> > > fleecing image
> > I don't think the client refocuses the read, but QEMU does. (the client
> > has no idea if it is reading from the fleecing node or the backing file.)
> >
> > ... but understood:
> >
> > > 3. client is going to read from backing file (i.e. active image)
> > > 4. guest writes to active image
> > My question here is if QEMU will allow reads and writes to interleave in
> > general. If the client is reading from the backing file, the active
> > image, will QEMU allow a write to modify that data before we're done
> > getting that data?
>
> If I understand correctly: yes. Physical drives allows intersecting
> operations too and there no guarantee on result. We have serializing
> requests, but in general only unaligned requests are marked serializing.

Copy on read (in the context of image streaming) is the feature which
actually introduced it. It has some similarities with the backup job, so
the idea of using it for backup, too, isn't exactly revolutionary.

Kevin