[Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible

Vladimir Sementsov-Ogievskiy posted 1 patch 6 years, 9 months ago
Test docker-clang@ubuntu passed
Test asan passed
Test docker-mingw@fedora passed
Test checkpatch passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20190125142138.76710-1-vsementsov@virtuozzo.com
Maintainers: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <fam@euphon.net>, Max Reitz <mreitz@redhat.com>, Stefan Hajnoczi <stefanha@redhat.com>
block/qcow2.h             |  1 +
include/block/block_int.h |  7 +++++++
block/io.c                |  3 ++-
block/qcow2-refcount.c    | 36 ++++++++++++++++++++++++++++++++++++
block/qcow2.c             |  7 +++++++
5 files changed, 53 insertions(+), 1 deletion(-)
[Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Vladimir Sementsov-Ogievskiy 6 years, 9 months ago
drv_co_block_status digs bs->file for additional, more accurate search
for hole inside region, reported as DATA by bs since 5daa74a6ebc.

This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
knows, where are holes and where is data. But every block_status
request calls lseek additionally. Assume a big disk, full of
data, in any iterative copying block job (or img convert) we'll call
lseek(HOLE) on every iteration, and each of these lseeks will have to
iterate through all metadata up to the end of file. It's obviously
ineffective behavior. And for many scenarios we don't need this lseek
at all.

However, lseek is needed when we have metadata-preallocated image.

So, let's detect metadata-preallocation case and don't dig qcow2's
protocol file in other cases.

The idea is to compare allocation size in POV of filesystem with
allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
significantly lower, consider it as metadata-preallocation case.

Suggested-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---

Hi!

So, to continue talk about lseek/no lseek when qcow2 block_status reports
DATA.

Results on tmpfs:
cached is lseek cache by Kevin
detect is this patch
no lseek is just remove block_status query on bs->file->bs in
         bdrv_co_block_status

    +---------------------+--------+--------+--------+----------+
    |                     | master | cached | detect | no lseek |
    +---------------------+--------+--------+--------+----------+
    | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
    +---------------------+--------+--------+--------+----------+
    | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
    +---------------------+--------+--------+--------+----------+
    | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
    +---------------------+--------+--------+--------+----------+

 block/qcow2.h             |  1 +
 include/block/block_int.h |  7 +++++++
 block/io.c                |  3 ++-
 block/qcow2-refcount.c    | 36 ++++++++++++++++++++++++++++++++++++
 block/qcow2.c             |  7 +++++++
 5 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 438a1dee9e..d7113ed44c 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -610,6 +610,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                 void *cb_opaque, Error **errp);
 int qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
+int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
 
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f605622216..c895ca7169 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -59,6 +59,12 @@
 
 #define BLOCK_PROBE_BUF_SIZE        512
 
+typedef enum BdrvYesNoUnknown {
+    BDRV_UNKNOWN = 0,
+    BDRV_NO,
+    BDRV_YES,
+} BdrvYesNoUnknown;
+
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
     BDRV_TRACKED_WRITE,
@@ -682,6 +688,7 @@ struct BlockDriverState {
     bool probed;    /* if true, format was probed rather than specified */
     bool force_share; /* if true, always allow all shared permissions */
     bool implicit;  /* if true, this filter node was automatically inserted */
+    BdrvYesNoUnknown metadata_preallocation;
 
     BlockDriver *drv; /* NULL means no media */
     void *opaque;
diff --git a/block/io.c b/block/io.c
index bd9d688f8b..815661750a 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2186,7 +2186,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
         }
     }
 
-    if (want_zero && local_file && local_file != bs &&
+    if (want_zero && bs->metadata_preallocation != BDRV_NO &&
+        local_file && local_file != bs &&
         (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
         (ret & BDRV_BLOCK_OFFSET_VALID)) {
         int64_t file_pnum;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 1c63ac244a..008196d849 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -3379,3 +3379,39 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
                             "There are no references in the refcount table.");
     return -EIO;
 }
+
+int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int64_t i, end_cluster, cluster_count = 0;
+    int64_t file_length, real_allocation, metadata_allocation, file_tail;
+    uint64_t refcount;
+
+    file_length = bdrv_getlength(bs->file->bs);
+    if (file_length < 0) {
+        return file_length;
+    }
+    file_tail = offset_into_cluster(s, file_length);
+
+    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
+    if (real_allocation < 0) {
+        return real_allocation;
+    }
+
+    end_cluster = size_to_clusters(s, file_length);
+    for (i = 0; i < end_cluster; i++) {
+        int ret = qcow2_get_refcount(bs, i, &refcount);
+        if (ret < 0) {
+            return ret;
+        }
+        cluster_count += !!refcount;
+    }
+
+    metadata_allocation = cluster_count * s->cluster_size;
+    if (!!refcount && file_tail) {
+        metadata_allocation -= s->cluster_size - file_tail;
+    }
+
+    return real_allocation < 0.9 * metadata_allocation &&
+        real_allocation + s->cluster_size < metadata_allocation;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index 4897abae5e..adc9cdcb27 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1800,6 +1800,13 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
     unsigned int bytes;
     int status = 0;
 
+    if (bs->metadata_preallocation == BDRV_UNKNOWN) {
+        ret = qcow2_detect_metadata_preallocation(bs);
+        if (ret >= 0) {
+            bs->metadata_preallocation = ret ? BDRV_YES : BDRV_NO;
+        }
+    }
+
     bytes = MIN(INT_MAX, count);
     qemu_co_mutex_lock(&s->lock);
     ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
-- 
2.18.0


Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Vladimir Sementsov-Ogievskiy 6 years, 9 months ago
25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
> drv_co_block_status digs bs->file for additional, more accurate search
> for hole inside region, reported as DATA by bs since 5daa74a6ebc.
> 
> This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
> knows, where are holes and where is data. But every block_status
> request calls lseek additionally. Assume a big disk, full of
> data, in any iterative copying block job (or img convert) we'll call
> lseek(HOLE) on every iteration, and each of these lseeks will have to
> iterate through all metadata up to the end of file. It's obviously
> ineffective behavior. And for many scenarios we don't need this lseek
> at all.
> 
> However, lseek is needed when we have metadata-preallocated image.
> 
> So, let's detect metadata-preallocation case and don't dig qcow2's
> protocol file in other cases.
> 
> The idea is to compare allocation size in POV of filesystem with
> allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
> significantly lower, consider it as metadata-preallocation case.
> 
> Suggested-by: Denis V. Lunev <den@openvz.org>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> 
> Hi!
> 
> So, to continue talk about lseek/no lseek when qcow2 block_status reports
> DATA.
> 
> Results on tmpfs:
> cached is lseek cache by Kevin
> detect is this patch
> no lseek is just remove block_status query on bs->file->bs in
>           bdrv_co_block_status
> 
>      +---------------------+--------+--------+--------+----------+
>      |                     | master | cached | detect | no lseek |
>      +---------------------+--------+--------+--------+----------+
>      | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
>      +---------------------+--------+--------+--------+----------+
>      | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
>      +---------------------+--------+--------+--------+----------+
>      | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
>      +---------------------+--------+--------+--------+----------+

Forgot to say, tests by Kevin from branch
https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg05463.html

Hmm. Don't we have something like tests/qemu-iotests, but for performance?
So, all these small pretty tests we have in mailing list may go as git patches?



-- 
Best regards,
Vladimir
Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Kevin Wolf 6 years, 7 months ago
Am 25.01.2019 um 15:36 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
> > Results on tmpfs:
> > cached is lseek cache by Kevin
> > detect is this patch
> > no lseek is just remove block_status query on bs->file->bs in
> >           bdrv_co_block_status
> > 
> >      +---------------------+--------+--------+--------+----------+
> >      |                     | master | cached | detect | no lseek |
> >      +---------------------+--------+--------+--------+----------+
> >      | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
> >      +---------------------+--------+--------+--------+----------+
> >      | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
> >      +---------------------+--------+--------+--------+----------+
> >      | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
> >      +---------------------+--------+--------+--------+----------+
> 
> Forgot to say, tests by Kevin from branch
> https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg05463.html
> 
> Hmm. Don't we have something like tests/qemu-iotests, but for performance?
> So, all these small pretty tests we have in mailing list may go as git patches?

Sounds like a good idea. Maybe we can just create a new subdirectory
qemu-iotests/perf/ and put some benchmark scripts there?

Of course, they wouldn't be able to tell PASS/FAIL like normal
qemu-iotests and so they wouldn't be integrated into the normal
qemu-iotests suite, but just return numbers that can be compared with
different setups or revisions on the same machine.

Kevin

Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Vladimir Sementsov-Ogievskiy 6 years, 7 months ago
25.03.2019 17:56, Kevin Wolf wrote:
> Am 25.01.2019 um 15:36 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
>>> Results on tmpfs:
>>> cached is lseek cache by Kevin
>>> detect is this patch
>>> no lseek is just remove block_status query on bs->file->bs in
>>>            bdrv_co_block_status
>>>
>>>       +---------------------+--------+--------+--------+----------+
>>>       |                     | master | cached | detect | no lseek |
>>>       +---------------------+--------+--------+--------+----------+
>>>       | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
>>>       +---------------------+--------+--------+--------+----------+
>>>       | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
>>>       +---------------------+--------+--------+--------+----------+
>>>       | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
>>>       +---------------------+--------+--------+--------+----------+
>>
>> Forgot to say, tests by Kevin from branch
>> https://lists.gnu.org/archive/html/qemu-devel/2019-01/msg05463.html
>>
>> Hmm. Don't we have something like tests/qemu-iotests, but for performance?
>> So, all these small pretty tests we have in mailing list may go as git patches?
> 
> Sounds like a good idea. Maybe we can just create a new subdirectory
> qemu-iotests/perf/ and put some benchmark scripts there?
> 
> Of course, they wouldn't be able to tell PASS/FAIL like normal
> qemu-iotests and so they wouldn't be integrated into the normal
> qemu-iotests suite, but just return numbers that can be compared with
> different setups or revisions on the same machine.
> 

I found a framework which can print nice ASCII comparison tables for performance,
it's pip package named perf (https://pypi.org/project/perf/). But the problem with it,
that in RHEL there is rpm package which conflicts with this name: python-perf, which
do absolutely another thing.. So, to install pip install perf, you should first
yum remove python-perf..

-- 
Best regards,
Vladimir
Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Vladimir Sementsov-Ogievskiy 6 years, 9 months ago
ping.

Finally, what about this?

25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
> drv_co_block_status digs bs->file for additional, more accurate search
> for hole inside region, reported as DATA by bs since 5daa74a6ebc.
> 
> This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
> knows, where are holes and where is data. But every block_status
> request calls lseek additionally. Assume a big disk, full of
> data, in any iterative copying block job (or img convert) we'll call
> lseek(HOLE) on every iteration, and each of these lseeks will have to
> iterate through all metadata up to the end of file. It's obviously
> ineffective behavior. And for many scenarios we don't need this lseek
> at all.
> 
> However, lseek is needed when we have metadata-preallocated image.
> 
> So, let's detect metadata-preallocation case and don't dig qcow2's
> protocol file in other cases.
> 
> The idea is to compare allocation size in POV of filesystem with
> allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
> significantly lower, consider it as metadata-preallocation case.
> 
> Suggested-by: Denis V. Lunev <den@openvz.org>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> 
> Hi!
> 
> So, to continue talk about lseek/no lseek when qcow2 block_status reports
> DATA.
> 
> Results on tmpfs:
> cached is lseek cache by Kevin
> detect is this patch
> no lseek is just remove block_status query on bs->file->bs in
>           bdrv_co_block_status
> 
>      +---------------------+--------+--------+--------+----------+
>      |                     | master | cached | detect | no lseek |
>      +---------------------+--------+--------+--------+----------+
>      | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
>      +---------------------+--------+--------+--------+----------+
>      | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
>      +---------------------+--------+--------+--------+----------+
>      | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
>      +---------------------+--------+--------+--------+----------+
> 
>   block/qcow2.h             |  1 +
>   include/block/block_int.h |  7 +++++++
>   block/io.c                |  3 ++-
>   block/qcow2-refcount.c    | 36 ++++++++++++++++++++++++++++++++++++
>   block/qcow2.c             |  7 +++++++
>   5 files changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 438a1dee9e..d7113ed44c 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -610,6 +610,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
>                                   void *cb_opaque, Error **errp);
>   int qcow2_shrink_reftable(BlockDriverState *bs);
>   int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
> +int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
>   
>   /* qcow2-cluster.c functions */
>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index f605622216..c895ca7169 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -59,6 +59,12 @@
>   
>   #define BLOCK_PROBE_BUF_SIZE        512
>   
> +typedef enum BdrvYesNoUnknown {
> +    BDRV_UNKNOWN = 0,
> +    BDRV_NO,
> +    BDRV_YES,
> +} BdrvYesNoUnknown;
> +
>   enum BdrvTrackedRequestType {
>       BDRV_TRACKED_READ,
>       BDRV_TRACKED_WRITE,
> @@ -682,6 +688,7 @@ struct BlockDriverState {
>       bool probed;    /* if true, format was probed rather than specified */
>       bool force_share; /* if true, always allow all shared permissions */
>       bool implicit;  /* if true, this filter node was automatically inserted */
> +    BdrvYesNoUnknown metadata_preallocation;
>   
>       BlockDriver *drv; /* NULL means no media */
>       void *opaque;
> diff --git a/block/io.c b/block/io.c
> index bd9d688f8b..815661750a 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -2186,7 +2186,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
>           }
>       }
>   
> -    if (want_zero && local_file && local_file != bs &&
> +    if (want_zero && bs->metadata_preallocation != BDRV_NO &&
> +        local_file && local_file != bs &&
>           (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
>           (ret & BDRV_BLOCK_OFFSET_VALID)) {
>           int64_t file_pnum;
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index 1c63ac244a..008196d849 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -3379,3 +3379,39 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
>                               "There are no references in the refcount table.");
>       return -EIO;
>   }
> +
> +int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int64_t i, end_cluster, cluster_count = 0;
> +    int64_t file_length, real_allocation, metadata_allocation, file_tail;
> +    uint64_t refcount;
> +
> +    file_length = bdrv_getlength(bs->file->bs);
> +    if (file_length < 0) {
> +        return file_length;
> +    }
> +    file_tail = offset_into_cluster(s, file_length);
> +
> +    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
> +    if (real_allocation < 0) {
> +        return real_allocation;
> +    }
> +
> +    end_cluster = size_to_clusters(s, file_length);
> +    for (i = 0; i < end_cluster; i++) {
> +        int ret = qcow2_get_refcount(bs, i, &refcount);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        cluster_count += !!refcount;
> +    }
> +
> +    metadata_allocation = cluster_count * s->cluster_size;
> +    if (!!refcount && file_tail) {
> +        metadata_allocation -= s->cluster_size - file_tail;
> +    }
> +
> +    return real_allocation < 0.9 * metadata_allocation &&
> +        real_allocation + s->cluster_size < metadata_allocation;
> +}
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 4897abae5e..adc9cdcb27 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -1800,6 +1800,13 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
>       unsigned int bytes;
>       int status = 0;
>   
> +    if (bs->metadata_preallocation == BDRV_UNKNOWN) {
> +        ret = qcow2_detect_metadata_preallocation(bs);
> +        if (ret >= 0) {
> +            bs->metadata_preallocation = ret ? BDRV_YES : BDRV_NO;
> +        }
> +    }
> +
>       bytes = MIN(INT_MAX, count);
>       qemu_co_mutex_lock(&s->lock);
>       ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
> 


-- 
Best regards,
Vladimir
Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Vladimir Sementsov-Ogievskiy 6 years, 8 months ago
ping

06.02.2019 15:30, Vladimir Sementsov-Ogievskiy wrote:
> ping.
> 
> Finally, what about this?
> 
> 25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
>> drv_co_block_status digs bs->file for additional, more accurate search
>> for hole inside region, reported as DATA by bs since 5daa74a6ebc.
>>
>> This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
>> knows, where are holes and where is data. But every block_status
>> request calls lseek additionally. Assume a big disk, full of
>> data, in any iterative copying block job (or img convert) we'll call
>> lseek(HOLE) on every iteration, and each of these lseeks will have to
>> iterate through all metadata up to the end of file. It's obviously
>> ineffective behavior. And for many scenarios we don't need this lseek
>> at all.
>>
>> However, lseek is needed when we have metadata-preallocated image.
>>
>> So, let's detect metadata-preallocation case and don't dig qcow2's
>> protocol file in other cases.
>>
>> The idea is to compare allocation size in POV of filesystem with
>> allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
>> significantly lower, consider it as metadata-preallocation case.
>>
>> Suggested-by: Denis V. Lunev <den@openvz.org>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>
>> Hi!
>>
>> So, to continue talk about lseek/no lseek when qcow2 block_status reports
>> DATA.
>>
>> Results on tmpfs:
>> cached is lseek cache by Kevin
>> detect is this patch
>> no lseek is just remove block_status query on bs->file->bs in
>>           bdrv_co_block_status
>>
>>      +---------------------+--------+--------+--------+----------+
>>      |                     | master | cached | detect | no lseek |
>>      +---------------------+--------+--------+--------+----------+
>>      | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
>>      +---------------------+--------+--------+--------+----------+
>>      | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
>>      +---------------------+--------+--------+--------+----------+
>>      | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
>>      +---------------------+--------+--------+--------+----------+
>>
>>   block/qcow2.h             |  1 +
>>   include/block/block_int.h |  7 +++++++
>>   block/io.c                |  3 ++-
>>   block/qcow2-refcount.c    | 36 ++++++++++++++++++++++++++++++++++++
>>   block/qcow2.c             |  7 +++++++
>>   5 files changed, 53 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index 438a1dee9e..d7113ed44c 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -610,6 +610,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
>>                                   void *cb_opaque, Error **errp);
>>   int qcow2_shrink_reftable(BlockDriverState *bs);
>>   int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
>> +int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
>>   /* qcow2-cluster.c functions */
>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>> diff --git a/include/block/block_int.h b/include/block/block_int.h
>> index f605622216..c895ca7169 100644
>> --- a/include/block/block_int.h
>> +++ b/include/block/block_int.h
>> @@ -59,6 +59,12 @@
>>   #define BLOCK_PROBE_BUF_SIZE        512
>> +typedef enum BdrvYesNoUnknown {
>> +    BDRV_UNKNOWN = 0,
>> +    BDRV_NO,
>> +    BDRV_YES,
>> +} BdrvYesNoUnknown;
>> +
>>   enum BdrvTrackedRequestType {
>>       BDRV_TRACKED_READ,
>>       BDRV_TRACKED_WRITE,
>> @@ -682,6 +688,7 @@ struct BlockDriverState {
>>       bool probed;    /* if true, format was probed rather than specified */
>>       bool force_share; /* if true, always allow all shared permissions */
>>       bool implicit;  /* if true, this filter node was automatically inserted */
>> +    BdrvYesNoUnknown metadata_preallocation;
>>       BlockDriver *drv; /* NULL means no media */
>>       void *opaque;
>> diff --git a/block/io.c b/block/io.c
>> index bd9d688f8b..815661750a 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -2186,7 +2186,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
>>           }
>>       }
>> -    if (want_zero && local_file && local_file != bs &&
>> +    if (want_zero && bs->metadata_preallocation != BDRV_NO &&
>> +        local_file && local_file != bs &&
>>           (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
>>           (ret & BDRV_BLOCK_OFFSET_VALID)) {
>>           int64_t file_pnum;
>> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
>> index 1c63ac244a..008196d849 100644
>> --- a/block/qcow2-refcount.c
>> +++ b/block/qcow2-refcount.c
>> @@ -3379,3 +3379,39 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
>>                               "There are no references in the refcount table.");
>>       return -EIO;
>>   }
>> +
>> +int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    int64_t i, end_cluster, cluster_count = 0;
>> +    int64_t file_length, real_allocation, metadata_allocation, file_tail;
>> +    uint64_t refcount;
>> +
>> +    file_length = bdrv_getlength(bs->file->bs);
>> +    if (file_length < 0) {
>> +        return file_length;
>> +    }
>> +    file_tail = offset_into_cluster(s, file_length);
>> +
>> +    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
>> +    if (real_allocation < 0) {
>> +        return real_allocation;
>> +    }
>> +
>> +    end_cluster = size_to_clusters(s, file_length);
>> +    for (i = 0; i < end_cluster; i++) {
>> +        int ret = qcow2_get_refcount(bs, i, &refcount);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +        cluster_count += !!refcount;
>> +    }
>> +
>> +    metadata_allocation = cluster_count * s->cluster_size;
>> +    if (!!refcount && file_tail) {
>> +        metadata_allocation -= s->cluster_size - file_tail;
>> +    }
>> +
>> +    return real_allocation < 0.9 * metadata_allocation &&
>> +        real_allocation + s->cluster_size < metadata_allocation;
>> +}
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 4897abae5e..adc9cdcb27 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -1800,6 +1800,13 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
>>       unsigned int bytes;
>>       int status = 0;
>> +    if (bs->metadata_preallocation == BDRV_UNKNOWN) {
>> +        ret = qcow2_detect_metadata_preallocation(bs);
>> +        if (ret >= 0) {
>> +            bs->metadata_preallocation = ret ? BDRV_YES : BDRV_NO;
>> +        }
>> +    }
>> +
>>       bytes = MIN(INT_MAX, count);
>>       qemu_co_mutex_lock(&s->lock);
>>       ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
>>
> 
> 


-- 
Best regards,
Vladimir
Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Vladimir Sementsov-Ogievskiy 6 years, 7 months ago
25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
> drv_co_block_status digs bs->file for additional, more accurate search
> for hole inside region, reported as DATA by bs since 5daa74a6ebc.
> 
> This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
> knows, where are holes and where is data. But every block_status
> request calls lseek additionally. Assume a big disk, full of
> data, in any iterative copying block job (or img convert) we'll call
> lseek(HOLE) on every iteration, and each of these lseeks will have to
> iterate through all metadata up to the end of file. It's obviously
> ineffective behavior. And for many scenarios we don't need this lseek
> at all.
> 
> However, lseek is needed when we have metadata-preallocated image.
> 
> So, let's detect metadata-preallocation case and don't dig qcow2's
> protocol file in other cases.
> 
> The idea is to compare allocation size in POV of filesystem with
> allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
> significantly lower, consider it as metadata-preallocation case.
> 
> Suggested-by: Denis V. Lunev <den@openvz.org>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> 
> Hi!
> 
> So, to continue talk about lseek/no lseek when qcow2 block_status reports
> DATA.
> 
> Results on tmpfs:
> cached is lseek cache by Kevin
> detect is this patch
> no lseek is just remove block_status query on bs->file->bs in
>           bdrv_co_block_status
> 
>      +---------------------+--------+--------+--------+----------+
>      |                     | master | cached | detect | no lseek |
>      +---------------------+--------+--------+--------+----------+
>      | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
>      +---------------------+--------+--------+--------+----------+
>      | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
>      +---------------------+--------+--------+--------+----------+
>      | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
>      +---------------------+--------+--------+--------+----------+
> 
>   block/qcow2.h             |  1 +
>   include/block/block_int.h |  7 +++++++
>   block/io.c                |  3 ++-
>   block/qcow2-refcount.c    | 36 ++++++++++++++++++++++++++++++++++++
>   block/qcow2.c             |  7 +++++++
>   5 files changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 438a1dee9e..d7113ed44c 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -610,6 +610,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
>                                   void *cb_opaque, Error **errp);
>   int qcow2_shrink_reftable(BlockDriverState *bs);
>   int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
> +int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
>   
>   /* qcow2-cluster.c functions */
>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index f605622216..c895ca7169 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -59,6 +59,12 @@
>   
>   #define BLOCK_PROBE_BUF_SIZE        512
>   
> +typedef enum BdrvYesNoUnknown {
> +    BDRV_UNKNOWN = 0,
> +    BDRV_NO,
> +    BDRV_YES,
> +} BdrvYesNoUnknown;
> +
>   enum BdrvTrackedRequestType {
>       BDRV_TRACKED_READ,
>       BDRV_TRACKED_WRITE,
> @@ -682,6 +688,7 @@ struct BlockDriverState {
>       bool probed;    /* if true, format was probed rather than specified */
>       bool force_share; /* if true, always allow all shared permissions */
>       bool implicit;  /* if true, this filter node was automatically inserted */
> +    BdrvYesNoUnknown metadata_preallocation;
>   
>       BlockDriver *drv; /* NULL means no media */
>       void *opaque;
> diff --git a/block/io.c b/block/io.c
> index bd9d688f8b..815661750a 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -2186,7 +2186,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
>           }
>       }
>   
> -    if (want_zero && local_file && local_file != bs &&
> +    if (want_zero && bs->metadata_preallocation != BDRV_NO &&
> +        local_file && local_file != bs &&
>           (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
>           (ret & BDRV_BLOCK_OFFSET_VALID)) {
>           int64_t file_pnum;
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index 1c63ac244a..008196d849 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -3379,3 +3379,39 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
>                               "There are no references in the refcount table.");
>       return -EIO;
>   }
> +
> +int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int64_t i, end_cluster, cluster_count = 0;
> +    int64_t file_length, real_allocation, metadata_allocation, file_tail;
> +    uint64_t refcount;
> +
> +    file_length = bdrv_getlength(bs->file->bs);
> +    if (file_length < 0) {
> +        return file_length;
> +    }
> +    file_tail = offset_into_cluster(s, file_length);
> +
> +    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
> +    if (real_allocation < 0) {
> +        return real_allocation;
> +    }
> +
> +    end_cluster = size_to_clusters(s, file_length);
> +    for (i = 0; i < end_cluster; i++) {
> +        int ret = qcow2_get_refcount(bs, i, &refcount);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        cluster_count += !!refcount;
> +    }
> +
> +    metadata_allocation = cluster_count * s->cluster_size;
> +    if (!!refcount && file_tail) {
> +        metadata_allocation -= s->cluster_size - file_tail;
> +    }
> +
> +    return real_allocation < 0.9 * metadata_allocation &&
> +        real_allocation + s->cluster_size < metadata_allocation;
> +}
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 4897abae5e..adc9cdcb27 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -1800,6 +1800,13 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
>       unsigned int bytes;
>       int status = 0;
>   
> +    if (bs->metadata_preallocation == BDRV_UNKNOWN) {

Without locks the following leads to image corruption. Assume that refcounts metadata
read without lock may pollute refcounts cache. So:

qemu_co_mutex_lock(&s->lock);

> +        ret = qcow2_detect_metadata_preallocation(bs);

qemu_co_mutex_unlock(&s->lock);

> +        if (ret >= 0) {
> +            bs->metadata_preallocation = ret ? BDRV_YES : BDRV_NO;
> +        }
> +    }
> +
>       bytes = MIN(INT_MAX, count);
>       qemu_co_mutex_lock(&s->lock);
>       ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
> 


-- 
Best regards,
Vladimir
Re: [Qemu-devel] [PATCH] qcow2: avoid lseek on block_status if possible
Posted by Kevin Wolf 6 years, 7 months ago
Am 22.03.2019 um 17:20 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 25.01.2019 17:21, Vladimir Sementsov-Ogievskiy wrote:
> > drv_co_block_status digs bs->file for additional, more accurate search
> > for hole inside region, reported as DATA by bs since 5daa74a6ebc.
> > 
> > This accuracy is not free: assume we have qcow2 disk. Actually, qcow2
> > knows, where are holes and where is data. But every block_status
> > request calls lseek additionally. Assume a big disk, full of
> > data, in any iterative copying block job (or img convert) we'll call
> > lseek(HOLE) on every iteration, and each of these lseeks will have to
> > iterate through all metadata up to the end of file. It's obviously
> > ineffective behavior. And for many scenarios we don't need this lseek
> > at all.
> > 
> > However, lseek is needed when we have metadata-preallocated image.
> > 
> > So, let's detect metadata-preallocation case and don't dig qcow2's
> > protocol file in other cases.
> > 
> > The idea is to compare allocation size in POV of filesystem with
> > allocations size in POV of Qcow2 (by refcounts). If allocation in fs is
> > significantly lower, consider it as metadata-preallocation case.
> > 
> > Suggested-by: Denis V. Lunev <den@openvz.org>
> > Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> > ---
> > 
> > Hi!
> > 
> > So, to continue talk about lseek/no lseek when qcow2 block_status reports
> > DATA.
> > 
> > Results on tmpfs:
> > cached is lseek cache by Kevin
> > detect is this patch
> > no lseek is just remove block_status query on bs->file->bs in
> >           bdrv_co_block_status
> > 
> >      +---------------------+--------+--------+--------+----------+
> >      |                     | master | cached | detect | no lseek |
> >      +---------------------+--------+--------+--------+----------+
> >      | test.qcow2          | 80     | 40     | 0.169  | 0.162    |
> >      +---------------------+--------+--------+--------+----------+
> >      | test_forward.qcow2  | 79     | 0.171  | 0.169  | 0.163    |
> >      +---------------------+--------+--------+--------+----------+
> >      | test_prealloc.qcow2 | 0.054  | 0.053  | 0.055  | 0.263    |
> >      +---------------------+--------+--------+--------+----------+
> > 
> >   block/qcow2.h             |  1 +
> >   include/block/block_int.h |  7 +++++++
> >   block/io.c                |  3 ++-
> >   block/qcow2-refcount.c    | 36 ++++++++++++++++++++++++++++++++++++
> >   block/qcow2.c             |  7 +++++++
> >   5 files changed, 53 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/qcow2.h b/block/qcow2.h
> > index 438a1dee9e..d7113ed44c 100644
> > --- a/block/qcow2.h
> > +++ b/block/qcow2.h
> > @@ -610,6 +610,7 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
> >                                   void *cb_opaque, Error **errp);
> >   int qcow2_shrink_reftable(BlockDriverState *bs);
> >   int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size);
> > +int qcow2_detect_metadata_preallocation(BlockDriverState *bs);
> >   
> >   /* qcow2-cluster.c functions */
> >   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> > diff --git a/include/block/block_int.h b/include/block/block_int.h
> > index f605622216..c895ca7169 100644
> > --- a/include/block/block_int.h
> > +++ b/include/block/block_int.h
> > @@ -59,6 +59,12 @@
> >   
> >   #define BLOCK_PROBE_BUF_SIZE        512
> >   
> > +typedef enum BdrvYesNoUnknown {
> > +    BDRV_UNKNOWN = 0,
> > +    BDRV_NO,
> > +    BDRV_YES,
> > +} BdrvYesNoUnknown;
> > +
> >   enum BdrvTrackedRequestType {
> >       BDRV_TRACKED_READ,
> >       BDRV_TRACKED_WRITE,
> > @@ -682,6 +688,7 @@ struct BlockDriverState {
> >       bool probed;    /* if true, format was probed rather than specified */
> >       bool force_share; /* if true, always allow all shared permissions */
> >       bool implicit;  /* if true, this filter node was automatically inserted */
> > +    BdrvYesNoUnknown metadata_preallocation;
> >   
> >       BlockDriver *drv; /* NULL means no media */
> >       void *opaque;

I'm not sure if adding a new field to BlockDriverState is justified for
this. It's already a huge struct.

Wouldn't returning a new flag like BDRV_BLOCK_RECURSE from
.bdrv_co_block_status be nicer?

> > diff --git a/block/io.c b/block/io.c
> > index bd9d688f8b..815661750a 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -2186,7 +2186,8 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
> >           }
> >       }
> >   
> > -    if (want_zero && local_file && local_file != bs &&
> > +    if (want_zero && bs->metadata_preallocation != BDRV_NO &&
> > +        local_file && local_file != bs &&
> >           (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
> >           (ret & BDRV_BLOCK_OFFSET_VALID)) {
> >           int64_t file_pnum;
> > diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> > index 1c63ac244a..008196d849 100644
> > --- a/block/qcow2-refcount.c
> > +++ b/block/qcow2-refcount.c
> > @@ -3379,3 +3379,39 @@ int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
> >                               "There are no references in the refcount table.");
> >       return -EIO;
> >   }
> > +
> > +int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
> > +{
> > +    BDRVQcow2State *s = bs->opaque;
> > +    int64_t i, end_cluster, cluster_count = 0;
> > +    int64_t file_length, real_allocation, metadata_allocation, file_tail;
> > +    uint64_t refcount;
> > +
> > +    file_length = bdrv_getlength(bs->file->bs);
> > +    if (file_length < 0) {
> > +        return file_length;
> > +    }
> > +    file_tail = offset_into_cluster(s, file_length);
> > +
> > +    real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
> > +    if (real_allocation < 0) {
> > +        return real_allocation;
> > +    }
> > +
> > +    end_cluster = size_to_clusters(s, file_length);
> > +    for (i = 0; i < end_cluster; i++) {
> > +        int ret = qcow2_get_refcount(bs, i, &refcount);
> > +        if (ret < 0) {
> > +            return ret;
> > +        }
> > +        cluster_count += !!refcount;
> > +    }

As an optimisation, we could stop counting as soon as cluster_count gets
higher than real_allocation / 0.9 (which can be calculated once before
the loop).

> > +    metadata_allocation = cluster_count * s->cluster_size;
> > +    if (!!refcount && file_tail) {
> > +        metadata_allocation -= s->cluster_size - file_tail;
> > +    }
> > +
> > +    return real_allocation < 0.9 * metadata_allocation &&
> > +        real_allocation + s->cluster_size < metadata_allocation;
> > +}
> > diff --git a/block/qcow2.c b/block/qcow2.c
> > index 4897abae5e..adc9cdcb27 100644
> > --- a/block/qcow2.c
> > +++ b/block/qcow2.c
> > @@ -1800,6 +1800,13 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
> >       unsigned int bytes;
> >       int status = 0;
> >   
> > +    if (bs->metadata_preallocation == BDRV_UNKNOWN) {
> 
> Without locks the following leads to image corruption. Assume that refcounts metadata
> read without lock may pollute refcounts cache. So:
> 
> qemu_co_mutex_lock(&s->lock);
> 
> > +        ret = qcow2_detect_metadata_preallocation(bs);
> 
> qemu_co_mutex_unlock(&s->lock);

Well, just take the lock earlier (before this if block) instead 

> > +        if (ret >= 0) {
> > +            bs->metadata_preallocation = ret ? BDRV_YES : BDRV_NO;
> > +        }
> > +    }

At least I would make sure that for ret < 0, the function isn't called
again and again because it probably means that the protocol layer simply
doesn't support bdrv_get_allocated_file_size().

> > +
> >       bytes = MIN(INT_MAX, count);
> >       qemu_co_mutex_lock(&s->lock);
> >       ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);

Kevin