backup_cow_with_offload can transfer more than on cluster. Let
backup_cow_with_bounce_buffer behave similarly. It reduces number
of IO and there are no needs to copy cluster by cluster.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
block/backup.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
diff --git a/block/backup.c b/block/backup.c
index d482d93458..155e21d0a3 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -104,22 +104,25 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
int64_t start,
int64_t end,
bool is_write_notifier,
- bool *error_is_read,
- void **bounce_buffer)
+ bool *error_is_read)
{
int ret;
BlockBackend *blk = job->common.blk;
int nbytes;
int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
+ void *bounce_buffer;
assert(QEMU_IS_ALIGNED(start, job->cluster_size));
- bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
- nbytes = MIN(job->cluster_size, job->len - start);
- if (!*bounce_buffer) {
- *bounce_buffer = blk_blockalign(blk, job->cluster_size);
+
+ nbytes = MIN(end - start, job->len - start);
+ bounce_buffer = blk_try_blockalign(blk, nbytes);
+ if (!bounce_buffer) {
+ return -ENOMEM;
}
- ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
+ bdrv_reset_dirty_bitmap(job->copy_bitmap, start, end - start);
+
+ ret = blk_co_pread(blk, start, nbytes, bounce_buffer, read_flags);
if (ret < 0) {
trace_backup_do_cow_read_fail(job, start, ret);
if (error_is_read) {
@@ -128,7 +131,7 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
goto fail;
}
- ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
+ ret = blk_co_pwrite(job->target, start, nbytes, bounce_buffer,
job->write_flags);
if (ret < 0) {
trace_backup_do_cow_write_fail(job, start, ret);
@@ -138,9 +141,12 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
goto fail;
}
+ qemu_vfree(bounce_buffer);
return nbytes;
+
fail:
bdrv_set_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
+ qemu_vfree(bounce_buffer);
return ret;
}
@@ -254,7 +260,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
CowRequest cow_request;
int ret = 0;
int64_t start, end; /* bytes */
- void *bounce_buffer = NULL;
int64_t skip_bytes;
qemu_co_rwlock_rdlock(&job->flush_rwlock);
@@ -303,7 +308,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
if (!job->use_copy_range) {
ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
is_write_notifier,
- error_is_read, &bounce_buffer);
+ error_is_read);
}
if (ret < 0) {
break;
@@ -318,10 +323,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
ret = 0;
}
- if (bounce_buffer) {
- qemu_vfree(bounce_buffer);
- }
-
cow_request_end(&cow_request);
trace_backup_do_cow_return(job, offset, bytes, ret);
--
2.18.0
On 8/9/19 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:
> backup_cow_with_offload can transfer more than on cluster. Let
s/on/one/
> backup_cow_with_bounce_buffer behave similarly. It reduces number
> of IO and there are no needs to copy cluster by cluster.
It reduces the number of IO requests, since there is no need to copy
cluster by cluster.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
> block/backup.c | 29 +++++++++++++++--------------
> 1 file changed, 15 insertions(+), 14 deletions(-)
>
> diff --git a/block/backup.c b/block/backup.c
> index d482d93458..155e21d0a3 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -104,22 +104,25 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
> int64_t start,
> int64_t end,
> bool is_write_notifier,
> - bool *error_is_read,
> - void **bounce_buffer)
> + bool *error_is_read)
Why is this signature changing?
> {
> int ret;
> BlockBackend *blk = job->common.blk;
> int nbytes;
> int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
> + void *bounce_buffer;
>
> assert(QEMU_IS_ALIGNED(start, job->cluster_size));
> - bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
> - nbytes = MIN(job->cluster_size, job->len - start);
> - if (!*bounce_buffer) {
> - *bounce_buffer = blk_blockalign(blk, job->cluster_size);
Pre-patch, you allocate the bounce_buffer at most once (but limited to a
cluster size), post-patch, you are now allocating and freeing a bounce
buffer every iteration through. There may be fewer calls because you
are allocating something bigger, but still, isn't it a good goal to try
and allocate the bounce buffer as few times as possible and reuse it,
rather than getting a fresh one each iteration?
> +
> + nbytes = MIN(end - start, job->len - start);
What is the largest this will be? Will it exhaust memory on a 32-bit or
otherwise memory-constrained system, where you should have some maximum
cap for how large the bounce buffer will be while still getting better
performance than one cluster at a time and not slowing things down by
over-sizing malloc? 64M, maybe?
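Something along these lines is what I'm thinking of (purely an illustrative sketch,
not a demand for this exact code; MAX_BOUNCE_SIZE is an invented name, it assumes
the caller advances by the number of bytes this function returns, as the offload
path already does, and the bdrv_reset_dirty_bitmap() call would then also have to
use nbytes rather than end - start):

/* Illustrative only: cap how much is read into the bounce buffer in one
 * go.  MAX_BOUNCE_SIZE is a made-up constant, not existing QEMU code. */
#define MAX_BOUNCE_SIZE (64 * 1024 * 1024)

    nbytes = MIN(end - start, job->len - start);
    nbytes = MIN(nbytes, MAX_BOUNCE_SIZE);    /* bound the allocation */
    bounce_buffer = blk_try_blockalign(blk, nbytes);
    if (!bounce_buffer) {
        return -ENOMEM;
    }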
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3226
Virtualization: qemu.org | libvirt.org
09.08.2019 18:59, Eric Blake wrote:
> On 8/9/19 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:
>> backup_cow_with_offload can transfer more than on cluster. Let
>
> s/on/one/
>
>> backup_cow_with_bounce_buffer behave similarly. It reduces number
>> of IO and there are no needs to copy cluster by cluster.
>
> It reduces the number of IO requests, since there is no need to copy
> cluster by cluster.
>
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>> block/backup.c | 29 +++++++++++++++--------------
>> 1 file changed, 15 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/backup.c b/block/backup.c
>> index d482d93458..155e21d0a3 100644
>> --- a/block/backup.c
>> +++ b/block/backup.c
>> @@ -104,22 +104,25 @@ static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
>> int64_t start,
>> int64_t end,
>> bool is_write_notifier,
>> - bool *error_is_read,
>> - void **bounce_buffer)
>> + bool *error_is_read)
>
> Why is this signature changing?
>
>> {
>> int ret;
>> BlockBackend *blk = job->common.blk;
>> int nbytes;
>> int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
>> + void *bounce_buffer;
>>
>> assert(QEMU_IS_ALIGNED(start, job->cluster_size));
>> - bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
>> - nbytes = MIN(job->cluster_size, job->len - start);
>> - if (!*bounce_buffer) {
>> - *bounce_buffer = blk_blockalign(blk, job->cluster_size);
>
> Pre-patch, you allocate the bounce_buffer at most once (but limited to a
> cluster size), post-patch, you are now allocating and freeing a bounce
> buffer every iteration through. There may be fewer calls because you
> are allocating something bigger, but still, isn't it a good goal to try
> and allocate the bounce buffer as few times as possible and reuse it,
> rather than getting a fresh one each iteration?
Yes, it's a "degradation" introduced by this patch; I was afraid of this question.
However, I doubt that it should be optimized:

1. I'm going to run several copy requests in coroutines to parallelize the copying
loop and improve performance (the series for qcow2 is here:
https://lists.gnu.org/archive/html/qemu-devel/2019-07/msg06654.html), so we'll need
several buffers for parallel copy requests, and it's much simpler to allocate a
buffer when it's needed and free it afterwards than to do the kind of allocated-memory
sharing mirror does.

2. Actually it is a question about the memory allocator: is it fast and optimized
enough that we don't need to care, or is it bad enough that we should work around
it like mirror does? In my opinion (backed by a bit of benchmarking), memalign
is fast enough not to care. I answered a similar question in more detail,
with some numbers, here:
https://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00087.html

So, I'd prefer not to care and keep the code simpler. But if you don't agree,
I can keep the shared buffer here, at least until parallel requests are introduced.
Then it would be something like qemu_try_blockalign(MIN(bytes, 64M))..
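To be concrete, roughly the shape I have in mind if the shared buffer stays (just a
sketch, not code from this patch; it uses blk_try_blockalign() as the patch already
does, and for simplicity allocates the full cap up front instead of MIN(bytes, 64M),
so the buffer never has to grow):

/* Sketch only: keep a caller-owned shared buffer, bounded by a cap.
 * MAX_BOUNCE_SIZE is an invented name; 64 MiB is the cap suggested in
 * this thread. */
#define MAX_BOUNCE_SIZE (64 * 1024 * 1024)

    if (!*bounce_buffer) {
        *bounce_buffer = blk_try_blockalign(blk, MAX_BOUNCE_SIZE);
        if (!*bounce_buffer) {
            return -ENOMEM;
        }
    }
    nbytes = MIN(nbytes, MAX_BOUNCE_SIZE);   /* never read past the buffer */
    ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);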
>
>> +
>> + nbytes = MIN(end - start, job->len - start);
>
> What is the largest this will be? Will it exhaust memory on a 32-bit or
> otherwise memory-constrained system, where you should have some maximum
> cap for how large the bounce buffer will be while still getting better
> performance than one cluster at a time and not slowing things down by
> over-sizing malloc? 64M, maybe?
>
Hmm, good point. I thought it was safe to allocate a buffer for the full request:
since such a buffer is already allocated for the guest request itself, it shouldn't
hurt to allocate another one of the same size. But I forgot about write-zeroes and
discard operations, which may be large without any guest-side allocation, so here
we need to allocate anyway.

Hmm2: we also have parallel guest writes (and discards), and therefore parallel
copy-before-write operations. So the total allocation is not bounded anyway, and
that should be fixed somehow too..
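Just to illustrate the kind of bound I mean (nothing like this exists in the code
today; the field and constant names below are invented):

/* Hypothetical sketch: bound the total bounce-buffer memory used by
 * parallel copy-before-write requests.  in_flight_bounce_bytes,
 * bounce_queue and MAX_TOTAL_BOUNCE do not exist in BackupBlockJob;
 * they are invented names for illustration only. */
    while (job->in_flight_bounce_bytes + nbytes > MAX_TOTAL_BOUNCE) {
        qemu_co_queue_wait(&job->bounce_queue, NULL);   /* wait for memory */
    }
    job->in_flight_bounce_bytes += nbytes;

    /* ... do the read and write into the bounce buffer as before ... */

    job->in_flight_bounce_bytes -= nbytes;
    qemu_co_queue_restart_all(&job->bounce_queue);      /* wake up waiters */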
--
Best regards,
Vladimir
On 8/10/19 7:12 AM, Vladimir Sementsov-Ogievskiy wrote:
> 09.08.2019 18:59, Eric Blake wrote:
>> On 8/9/19 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> backup_cow_with_offload can transfer more than on cluster. Let
>>
>> s/on/one/
>>
>>> backup_cow_with_bounce_buffer behave similarly. It reduces number
>>> of IO and there are no needs to copy cluster by cluster.
>>
>> It reduces the number of IO requests, since there is no need to copy
>> cluster by cluster.
>
>>> - bool *error_is_read,
>>> - void **bounce_buffer)
>>> + bool *error_is_read)
>>
>> Why is this signature changing?
>>
>
> 2. Actually it is a question about the memory allocator: is it fast and optimized
> enough that we don't need to care, or is it bad enough that we should work around
> it like mirror does? In my opinion (backed by a bit of benchmarking), memalign
> is fast enough not to care. I answered a similar question in more detail,
> with some numbers, here:
> https://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00087.html
>
> So, I'd prefer not to care and keep the code simpler. But if you don't agree,
> I can keep the shared buffer here, at least until parallel requests are introduced.
> Then it would be something like qemu_try_blockalign(MIN(bytes, 64M))..

It may still be worth capping at 64M. I'm not opposed to a change to
per-request allocation rather than trying to reuse a buffer, if it is
going to make parallelization efforts easier; but I am worried about the
maximum memory usage.

I'm more worried that you are trying to cram two distinct changes into
one patch, and didn't even mention the change from buffer reuse to
per-request allocations in the commit message. If you make that sort of
change, it should be called out in the commit message as intentional, or
maybe even split to a separate patch.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3226
Virtualization: qemu.org | libvirt.org
10.08.2019 15:47, Eric Blake wrote:
> On 8/10/19 7:12 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 09.08.2019 18:59, Eric Blake wrote:
>>> On 8/9/19 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> backup_cow_with_offload can transfer more than on cluster. Let
>>>
>>> s/on/one/
>>>
>>>> backup_cow_with_bounce_buffer behave similarly. It reduces number
>>>> of IO and there are no needs to copy cluster by cluster.
>>>
>>> It reduces the number of IO requests, since there is no need to copy
>>> cluster by cluster.
>
>>>> - bool *error_is_read,
>>>> - void **bounce_buffer)
>>>> + bool *error_is_read)
>>>
>>> Why is this signature changing?
>>>
>
>> 2. Actually it is a question about the memory allocator: is it fast and optimized
>> enough that we don't need to care, or is it bad enough that we should work around
>> it like mirror does? In my opinion (backed by a bit of benchmarking), memalign
>> is fast enough not to care. I answered a similar question in more detail,
>> with some numbers, here:
>> https://lists.gnu.org/archive/html/qemu-devel/2018-11/msg00087.html
>>
>> So, I'd prefer not to care and keep the code simpler. But if you don't agree,
>> I can keep the shared buffer here, at least until parallel requests are introduced.
>> Then it would be something like qemu_try_blockalign(MIN(bytes, 64M))..
>
> It may still be worth capping at 64M. I'm not opposed to a change to
> per-request allocation rather than trying to reuse a buffer, if it is
> going to make parallelization efforts easier; but I am worried about the
> maximum memory usage.
>
> I'm more worried that you are trying to cram two distinct changes into
> one patch, and didn't even mention the change from buffer reuse to
> per-request allocations in the commit message. If you make that sort of
> change, it should be called out in the commit message as intentional, or
> maybe even split to a separate patch.
>

OK, I failed to hide it :) Will split it out and add the 64M limit.

--
Best regards,
Vladimir