[Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Vladimir Sementsov-Ogievskiy 7 years, 6 months ago
Memory allocation may become less efficient for the encrypted case. This is
the price we pay for the upcoming asynchronous scheme.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 64 insertions(+), 50 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 65e3ca00e2..5e7f2ee318 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1808,6 +1808,67 @@ out:
     return ret;
 }
 
+static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
+                                               uint64_t file_cluster_offset,
+                                               uint64_t offset,
+                                               uint64_t bytes,
+                                               QEMUIOVector *qiov,
+                                               uint64_t qiov_offset)
+{
+    int ret;
+    BDRVQcow2State *s = bs->opaque;
+    void *crypt_buf = NULL;
+    QEMUIOVector hd_qiov;
+    int offset_in_cluster = offset_into_cluster(s, offset);
+
+    if ((file_cluster_offset & 511) != 0) {
+        return -EIO;
+    }
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+
+    if (bs->encrypted) {
+        assert(s->crypto);
+        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+
+        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
+        qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
+    } else {
+        qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+    ret = bdrv_co_preadv(bs->file,
+                         file_cluster_offset + offset_in_cluster,
+                         bytes, &hd_qiov, 0);
+    if (ret < 0) {
+        goto out;
+    }
+
+    if (bs->encrypted) {
+        assert(s->crypto);
+        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+        if (qcrypto_block_decrypt(s->crypto,
+                                  (s->crypt_physical_offset ?
+                                   file_cluster_offset + offset_in_cluster :
+                                   offset),
+                                  crypt_buf,
+                                  bytes,
+                                  NULL) < 0) {
+            ret = -EIO;
+            goto out;
+        }
+        qemu_iovec_from_buf(qiov, qiov_offset, crypt_buf, bytes);
+    }
+
+out:
+    qemu_vfree(crypt_buf);
+    qemu_iovec_destroy(&hd_qiov);
+
+    return ret;
+}
+
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
@@ -1819,7 +1880,6 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
     uint64_t cluster_offset = 0;
     uint64_t bytes_done = 0;
     QEMUIOVector hd_qiov;
-    uint8_t *cluster_data = NULL;
 
     qemu_iovec_init(&hd_qiov, qiov->niov);
 
@@ -1882,57 +1942,12 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
         case QCOW2_CLUSTER_NORMAL:
             qemu_co_mutex_unlock(&s->lock);
 
-            if ((cluster_offset & 511) != 0) {
-                ret = -EIO;
-                goto fail_nolock;
-            }
-
-            if (bs->encrypted) {
-                assert(s->crypto);
-
-                /*
-                 * For encrypted images, read everything into a temporary
-                 * contiguous buffer on which the AES functions can work.
-                 */
-                if (!cluster_data) {
-                    cluster_data =
-                        qemu_try_blockalign(bs->file->bs,
-                                            QCOW_MAX_CRYPT_CLUSTERS
-                                            * s->cluster_size);
-                    if (cluster_data == NULL) {
-                        ret = -ENOMEM;
-                        goto fail_nolock;
-                    }
-                }
-
-                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
-                qemu_iovec_reset(&hd_qiov);
-                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
-            }
-
-            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-            ret = bdrv_co_preadv(bs->file,
-                                 cluster_offset + offset_in_cluster,
-                                 cur_bytes, &hd_qiov, 0);
+            ret = qcow2_co_preadv_normal(bs, cluster_offset,
+                                         offset, cur_bytes, qiov, bytes_done);
             if (ret < 0) {
                 goto fail_nolock;
             }
-            if (bs->encrypted) {
-                assert(s->crypto);
-                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-                if (qcrypto_block_decrypt(s->crypto,
-                                          (s->crypt_physical_offset ?
-                                           cluster_offset + offset_in_cluster :
-                                           offset),
-                                          cluster_data,
-                                          cur_bytes,
-                                          NULL) < 0) {
-                    ret = -EIO;
-                    goto fail_nolock;
-                }
-                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
-            }
+
             qemu_co_mutex_lock(&s->lock);
             break;
 
@@ -1953,7 +1968,6 @@ fail:
 
 fail_nolock:
     qemu_iovec_destroy(&hd_qiov);
-    qemu_vfree(cluster_data);
 
     return ret;
 }
-- 
2.11.1


Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Max Reitz 7 years, 4 months ago
On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
> Memory allocation may become less efficient for encrypted case. It's a
> payment for further asynchronous scheme.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>  1 file changed, 64 insertions(+), 50 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 65e3ca00e2..5e7f2ee318 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -1808,6 +1808,67 @@ out:
>      return ret;
>  }
>  
> +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
> +                                               uint64_t file_cluster_offset,
> +                                               uint64_t offset,
> +                                               uint64_t bytes,
> +                                               QEMUIOVector *qiov,
> +                                               uint64_t qiov_offset)
> +{
> +    int ret;
> +    BDRVQcow2State *s = bs->opaque;
> +    void *crypt_buf = NULL;
> +    QEMUIOVector hd_qiov;
> +    int offset_in_cluster = offset_into_cluster(s, offset);
> +
> +    if ((file_cluster_offset & 511) != 0) {
> +        return -EIO;
> +    }
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);

So you're not just re-allocating the bounce buffer for every single
call, but also this I/O vector.  Hm.

> +    if (bs->encrypted) {
> +        assert(s->crypto);
> +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
> +
> +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);

1. Why did you remove the comment?

2. The check for whether crypt_buf was successfully allocated is missing.

3. Do you have any benchmarks for encrypted images?  Having to allocate
the bounce buffer for potentially every single cluster sounds rather bad
to me.
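
For point 2, just as a sketch of the missing check, reusing the helper's
existing out: label (this mirrors what the old loop in qcow2_co_preadv() did):

    crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
    if (crypt_buf == NULL) {
        /* same behaviour as the removed code in qcow2_co_preadv() */
        ret = -ENOMEM;
        goto out;
    }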

> +        qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
> +    } else {
> +        qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);

qcow2_co_preadv() continues to do this as well.  That's superfluous now.

> +    }
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
> +    ret = bdrv_co_preadv(bs->file,
> +                         file_cluster_offset + offset_in_cluster,
> +                         bytes, &hd_qiov, 0);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    if (bs->encrypted) {
> +        assert(s->crypto);
> +        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
> +        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
> +        if (qcrypto_block_decrypt(s->crypto,
> +                                  (s->crypt_physical_offset ?
> +                                   file_cluster_offset + offset_in_cluster :
> +                                   offset),
> +                                  crypt_buf,
> +                                  bytes,
> +                                  NULL) < 0) {

What's the reason against decrypting this in-place in qiov so we could
save the bounce buffer?  We allow offsets in clusters, so why can't we
just call this function once per involved I/O vector entry?
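
Roughly what I have in mind, only as a sketch (assuming qiov_offset is zero,
the data was read directly into qiov, and every iovec entry is a multiple of
BDRV_SECTOR_SIZE; none of which the current code guarantees):

    uint64_t done = 0;
    int i;

    for (i = 0; i < qiov->niov && done < bytes; i++) {
        uint64_t len = MIN(qiov->iov[i].iov_len, bytes - done);
        uint64_t off = s->crypt_physical_offset
                       ? file_cluster_offset + offset_in_cluster + done
                       : offset + done;

        /* decrypt this entry in place, no bounce buffer involved */
        if (qcrypto_block_decrypt(s->crypto, off, qiov->iov[i].iov_base,
                                  len, NULL) < 0) {
            ret = -EIO;
            goto out;
        }
        done += len;
    }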

Max

> +            ret = -EIO;
> +            goto out;
> +        }
> +        qemu_iovec_from_buf(qiov, qiov_offset, crypt_buf, bytes);
> +    }
> +
> +out:
> +    qemu_vfree(crypt_buf);
> +    qemu_iovec_destroy(&hd_qiov);
> +
> +    return ret;
> +}
> +
>  static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
>                                          uint64_t bytes, QEMUIOVector *qiov,
>                                          int flags)
> @@ -1819,7 +1880,6 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
>      uint64_t cluster_offset = 0;
>      uint64_t bytes_done = 0;
>      QEMUIOVector hd_qiov;
> -    uint8_t *cluster_data = NULL;
>  
>      qemu_iovec_init(&hd_qiov, qiov->niov);
>  
> @@ -1882,57 +1942,12 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
>          case QCOW2_CLUSTER_NORMAL:
>              qemu_co_mutex_unlock(&s->lock);
>  
> -            if ((cluster_offset & 511) != 0) {
> -                ret = -EIO;
> -                goto fail_nolock;
> -            }
> -
> -            if (bs->encrypted) {
> -                assert(s->crypto);
> -
> -                /*
> -                 * For encrypted images, read everything into a temporary
> -                 * contiguous buffer on which the AES functions can work.
> -                 */
> -                if (!cluster_data) {
> -                    cluster_data =
> -                        qemu_try_blockalign(bs->file->bs,
> -                                            QCOW_MAX_CRYPT_CLUSTERS
> -                                            * s->cluster_size);
> -                    if (cluster_data == NULL) {
> -                        ret = -ENOMEM;
> -                        goto fail_nolock;
> -                    }
> -                }
> -
> -                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
> -                qemu_iovec_reset(&hd_qiov);
> -                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
> -            }
> -
> -            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
> -            ret = bdrv_co_preadv(bs->file,
> -                                 cluster_offset + offset_in_cluster,
> -                                 cur_bytes, &hd_qiov, 0);
> +            ret = qcow2_co_preadv_normal(bs, cluster_offset,
> +                                         offset, cur_bytes, qiov, bytes_done);
>              if (ret < 0) {
>                  goto fail_nolock;
>              }
> -            if (bs->encrypted) {
> -                assert(s->crypto);
> -                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
> -                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
> -                if (qcrypto_block_decrypt(s->crypto,
> -                                          (s->crypt_physical_offset ?
> -                                           cluster_offset + offset_in_cluster :
> -                                           offset),
> -                                          cluster_data,
> -                                          cur_bytes,
> -                                          NULL) < 0) {
> -                    ret = -EIO;
> -                    goto fail_nolock;
> -                }
> -                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
> -            }
> +
>              qemu_co_mutex_lock(&s->lock);
>              break;
>  
> @@ -1953,7 +1968,6 @@ fail:
>  
>  fail_nolock:
>      qemu_iovec_destroy(&hd_qiov);
> -    qemu_vfree(cluster_data);
>  
>      return ret;
>  }
> 


Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Vladimir Sementsov-Ogievskiy 7 years, 4 months ago
27.09.2018 20:35, Max Reitz wrote:
> On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
>> Memory allocation may become less efficient for encrypted case. It's a
>> payment for further asynchronous scheme.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>>   1 file changed, 64 insertions(+), 50 deletions(-)
>>
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 65e3ca00e2..5e7f2ee318 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -1808,6 +1808,67 @@ out:
>>       return ret;
>>   }
>>   
>> +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>> +                                               uint64_t file_cluster_offset,
>> +                                               uint64_t offset,
>> +                                               uint64_t bytes,
>> +                                               QEMUIOVector *qiov,
>> +                                               uint64_t qiov_offset)
>> +{
>> +    int ret;
>> +    BDRVQcow2State *s = bs->opaque;
>> +    void *crypt_buf = NULL;
>> +    QEMUIOVector hd_qiov;
>> +    int offset_in_cluster = offset_into_cluster(s, offset);
>> +
>> +    if ((file_cluster_offset & 511) != 0) {
>> +        return -EIO;
>> +    }
>> +
>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> So you're not just re-allocating the bounce buffer for every single
> call, but also this I/O vector.  Hm.
>
>> +    if (bs->encrypted) {
>> +        assert(s->crypto);
>> +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>> +
>> +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
> 1. Why did you remove the comment?
>
> 2. The check for whether crypt_buf was successfully allocated is missing.
>
> 3. Do you have any benchmarks for encrypted images?  Having to allocate
> the bounce buffer for potentially every single cluster sounds rather bad
> to me.

Hmm, no excuses. 1- Will fix, 2 - will fix, 3 - will fix or at least 
test the performance.

>
>> +        qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
>> +    } else {
>> +        qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);
> qcow2_co_preadv() continues to do this as well.  That's superfluous now.

hd_qiov is local here.. or what do you mean?

>
>> +    }
>> +
>> +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
>> +    ret = bdrv_co_preadv(bs->file,
>> +                         file_cluster_offset + offset_in_cluster,
>> +                         bytes, &hd_qiov, 0);
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>> +
>> +    if (bs->encrypted) {
>> +        assert(s->crypto);
>> +        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
>> +        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
>> +        if (qcrypto_block_decrypt(s->crypto,
>> +                                  (s->crypt_physical_offset ?
>> +                                   file_cluster_offset + offset_in_cluster :
>> +                                   offset),
>> +                                  crypt_buf,
>> +                                  bytes,
>> +                                  NULL) < 0) {
> What's the reason against decrypting this in-place in qiov so we could
> save the bounce buffer?  We allow offsets in clusters, so why can't we
> just call this function once per involved I/O vector entry?

Isn't it unsafe to do decryption in guest buffers?

>
> Max
>
>> +            ret = -EIO;
>> +            goto out;
>> +        }
>> +        qemu_iovec_from_buf(qiov, qiov_offset, crypt_buf, bytes);
>> +    }
>> +
>> +out:
>> +    qemu_vfree(crypt_buf);
>> +    qemu_iovec_destroy(&hd_qiov);
>> +
>> +    return ret;
>> +}
>> +
>>   static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
>>                                           uint64_t bytes, QEMUIOVector *qiov,
>>                                           int flags)
>> @@ -1819,7 +1880,6 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
>>       uint64_t cluster_offset = 0;
>>       uint64_t bytes_done = 0;
>>       QEMUIOVector hd_qiov;
>> -    uint8_t *cluster_data = NULL;
>>   
>>       qemu_iovec_init(&hd_qiov, qiov->niov);
>>   
>> @@ -1882,57 +1942,12 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
>>           case QCOW2_CLUSTER_NORMAL:
>>               qemu_co_mutex_unlock(&s->lock);
>>   
>> -            if ((cluster_offset & 511) != 0) {
>> -                ret = -EIO;
>> -                goto fail_nolock;
>> -            }
>> -
>> -            if (bs->encrypted) {
>> -                assert(s->crypto);
>> -
>> -                /*
>> -                 * For encrypted images, read everything into a temporary
>> -                 * contiguous buffer on which the AES functions can work.
>> -                 */
>> -                if (!cluster_data) {
>> -                    cluster_data =
>> -                        qemu_try_blockalign(bs->file->bs,
>> -                                            QCOW_MAX_CRYPT_CLUSTERS
>> -                                            * s->cluster_size);
>> -                    if (cluster_data == NULL) {
>> -                        ret = -ENOMEM;
>> -                        goto fail_nolock;
>> -                    }
>> -                }
>> -
>> -                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>> -                qemu_iovec_reset(&hd_qiov);
>> -                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
>> -            }
>> -
>> -            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
>> -            ret = bdrv_co_preadv(bs->file,
>> -                                 cluster_offset + offset_in_cluster,
>> -                                 cur_bytes, &hd_qiov, 0);
>> +            ret = qcow2_co_preadv_normal(bs, cluster_offset,
>> +                                         offset, cur_bytes, qiov, bytes_done);
>>               if (ret < 0) {
>>                   goto fail_nolock;
>>               }
>> -            if (bs->encrypted) {
>> -                assert(s->crypto);
>> -                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
>> -                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
>> -                if (qcrypto_block_decrypt(s->crypto,
>> -                                          (s->crypt_physical_offset ?
>> -                                           cluster_offset + offset_in_cluster :
>> -                                           offset),
>> -                                          cluster_data,
>> -                                          cur_bytes,
>> -                                          NULL) < 0) {
>> -                    ret = -EIO;
>> -                    goto fail_nolock;
>> -                }
>> -                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
>> -            }
>> +
>>               qemu_co_mutex_lock(&s->lock);
>>               break;
>>   
>> @@ -1953,7 +1968,6 @@ fail:
>>   
>>   fail_nolock:
>>       qemu_iovec_destroy(&hd_qiov);
>> -    qemu_vfree(cluster_data);
>>   
>>       return ret;
>>   }
>>
>


-- 
Best regards,
Vladimir

Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Max Reitz 7 years, 4 months ago
On 01.10.18 17:14, Vladimir Sementsov-Ogievskiy wrote:
> 27.09.2018 20:35, Max Reitz wrote:
>> On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
>>> Memory allocation may become less efficient for encrypted case. It's a
>>> payment for further asynchronous scheme.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>>>  block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>>>  1 file changed, 64 insertions(+), 50 deletions(-)
>>>
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index 65e3ca00e2..5e7f2ee318 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>>> @@ -1808,6 +1808,67 @@ out:
>>>      return ret;
>>>  }
>>>  
>>> +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>>> +                                               uint64_t file_cluster_offset,
>>> +                                               uint64_t offset,
>>> +                                               uint64_t bytes,
>>> +                                               QEMUIOVector *qiov,
>>> +                                               uint64_t qiov_offset)
>>> +{
>>> +    int ret;
>>> +    BDRVQcow2State *s = bs->opaque;
>>> +    void *crypt_buf = NULL;
>>> +    QEMUIOVector hd_qiov;
>>> +    int offset_in_cluster = offset_into_cluster(s, offset);
>>> +
>>> +    if ((file_cluster_offset & 511) != 0) {
>>> +        return -EIO;
>>> +    }
>>> +
>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> So you're not just re-allocating the bounce buffer for every single
>> call, but also this I/O vector.  Hm.
>>
>>> +    if (bs->encrypted) {
>>> +        assert(s->crypto);
>>> +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>>> +
>>> +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
>> 1. Why did you remove the comment?
>>
>> 2. The check for whether crypt_buf was successfully allocated is missing.
>>
>> 3. Do you have any benchmarks for encrypted images?  Having to allocate
>> the bounce buffer for potentially every single cluster sounds rather bad
>> to me.
> 
> Hmm, no excuses. 1- Will fix, 2 - will fix, 3 - will fix or at least
> test the performance.
> 
>>> +        qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
>>> +    } else {
>>> +        qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);
>> qcow2_co_preadv() continues to do this as well.  That's superfluous now.
> 
> hd_qiov is local here.. or what do you mean?

qcow2_co_preadv() continues to have its own hd_qiov and appends qiov to
it (just like here), but it's unused for normal clusters.  So it doesn't
have to do that for normal clusters.

>>> +    }
>>> +
>>> +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
>>> +    ret = bdrv_co_preadv(bs->file,
>>> +                         file_cluster_offset + offset_in_cluster,
>>> +                         bytes, &hd_qiov, 0);
>>> +    if (ret < 0) {
>>> +        goto out;
>>> +    }
>>> +
>>> +    if (bs->encrypted) {
>>> +        assert(s->crypto);
>>> +        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
>>> +        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
>>> +        if (qcrypto_block_decrypt(s->crypto,
>>> +                                  (s->crypt_physical_offset ?
>>> +                                   file_cluster_offset + offset_in_cluster :
>>> +                                   offset),
>>> +                                  crypt_buf,
>>> +                                  bytes,
>>> +                                  NULL) < 0) {
>> What's the reason against decrypting this in-place in qiov so we could
>> save the bounce buffer?  We allow offsets in clusters, so why can't we
>> just call this function once per involved I/O vector entry?
> 
> Isn't it unsafe to do decryption in guest buffers?

I don't know.  Why would it be?  Because...

>> Max
>>
>>> +            ret = -EIO;
>>> +            goto out;
>>> +        }
>>> +        qemu_iovec_from_buf(qiov, qiov_offset, crypt_buf, bytes);

...we're putting the decrypted content there anyway.

The only issue I see is that the guest may see encrypted content for a
short duration.  Hm.  Maybe we don't want that.  In which case it
probably has to stay as it is.

Max

Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Vladimir Sementsov-Ogievskiy 7 years, 4 months ago
01.10.2018 18:39, Max Reitz wrote:
> On 01.10.18 17:14, Vladimir Sementsov-Ogievskiy wrote:
>> 27.09.2018 20:35, Max Reitz wrote:
>>> On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
>>>> Memory allocation may become less efficient for encrypted case. It's a
>>>> payment for further asynchronous scheme.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>> ---
>>>>   block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>>>>   1 file changed, 64 insertions(+), 50 deletions(-)
>>>>
>>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>>> index 65e3ca00e2..5e7f2ee318 100644
>>>> --- a/block/qcow2.c
>>>> +++ b/block/qcow2.c
>>>> @@ -1808,6 +1808,67 @@ out:
>>>>       return ret;
>>>>   }
>>>>   
>>>> +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>>>> +                                               uint64_t file_cluster_offset,
>>>> +                                               uint64_t offset,
>>>> +                                               uint64_t bytes,
>>>> +                                               QEMUIOVector *qiov,
>>>> +                                               uint64_t qiov_offset)
>>>> +{
>>>> +    int ret;
>>>> +    BDRVQcow2State *s = bs->opaque;
>>>> +    void *crypt_buf = NULL;
>>>> +    QEMUIOVector hd_qiov;
>>>> +    int offset_in_cluster = offset_into_cluster(s, offset);
>>>> +
>>>> +    if ((file_cluster_offset & 511) != 0) {
>>>> +        return -EIO;
>>>> +    }
>>>> +
>>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>> So you're not just re-allocating the bounce buffer for every single
>>> call, but also this I/O vector.  Hm.
>>>
>>>> +    if (bs->encrypted) {
>>>> +        assert(s->crypto);
>>>> +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>>>> +
>>>> +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
>>> 1. Why did you remove the comment?
>>>
>>> 2. The check for whether crypt_buf was successfully allocated is missing.
>>>
>>> 3. Do you have any benchmarks for encrypted images?  Having to allocate
>>> the bounce buffer for potentially every single cluster sounds rather bad
>>> to me.
>> Hmm, no excuses. 1- Will fix, 2 - will fix, 3 - will fix or at least
>> test the performance.
>>
>>>> +        qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
>>>> +    } else {
>>>> +        qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);
>>> qcow2_co_preadv() continues to do this as well.  That's superfluous now.
>> hd_qiov is local here.. or what do you mean?
> qcow2_co_preadv() continues to have its own hd_qiov and appends qiov to
> it (just like here), but it's unused for normal clusters.  So it doesn't
> have to do that for normal clusters.

Ah, yes, understood. I will think about how to handle it properly.

>
>>>> +    }
>>>> +
>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
>>>> +    ret = bdrv_co_preadv(bs->file,
>>>> +                         file_cluster_offset + offset_in_cluster,
>>>> +                         bytes, &hd_qiov, 0);
>>>> +    if (ret < 0) {
>>>> +        goto out;
>>>> +    }
>>>> +
>>>> +    if (bs->encrypted) {
>>>> +        assert(s->crypto);
>>>> +        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
>>>> +        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
>>>> +        if (qcrypto_block_decrypt(s->crypto,
>>>> +                                  (s->crypt_physical_offset ?
>>>> +                                   file_cluster_offset + offset_in_cluster :
>>>> +                                   offset),
>>>> +                                  crypt_buf,
>>>> +                                  bytes,
>>>> +                                  NULL) < 0) {
>>> What's the reason against decrypting this in-place in qiov so we could
>>> save the bounce buffer?  We allow offsets in clusters, so why can't we
>>> just call this function once per involved I/O vector entry?
>> Isn't it unsafe to do decryption in guest buffers?
> I don't know.  Why would it be?  Because...
>
>>> Max
>>>
>>>> +            ret = -EIO;
>>>> +            goto out;
>>>> +        }
>>>> +        qemu_iovec_from_buf(qiov, qiov_offset, crypt_buf, bytes);
> ...we're putting the decrypted content there anyway.
>
> The only issue I see is that the guest may see encrypted content for a
> short duration.  Hm.  Maybe we don't want that.  In which case it
> probably has to stay as it is.

Yes, I mean exactly this thing.

>
> Max
>


-- 
Best regards,
Vladimir


Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Vladimir Sementsov-Ogievskiy 7 years, 3 months ago
27.09.2018 20:35, Max Reitz wrote:

On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:


Memory allocation may become less efficient for encrypted case. It's a
payment for further asynchronous scheme.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 64 insertions(+), 50 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 65e3ca00e2..5e7f2ee318 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1808,6 +1808,67 @@ out:
     return ret;
 }

+static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
+                                               uint64_t file_cluster_offset,
+                                               uint64_t offset,
+                                               uint64_t bytes,
+                                               QEMUIOVector *qiov,
+                                               uint64_t qiov_offset)
+{
+    int ret;
+    BDRVQcow2State *s = bs->opaque;
+    void *crypt_buf = NULL;
+    QEMUIOVector hd_qiov;
+    int offset_in_cluster = offset_into_cluster(s, offset);
+
+    if ((file_cluster_offset & 511) != 0) {
+        return -EIO;
+    }
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);


So you're not just re-allocating the bounce buffer for every single
call, but also this I/O vector.  Hm.



+    if (bs->encrypted) {
+        assert(s->crypto);
+        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+
+        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);


1. Why did you remove the comment?

2. The check for whether crypt_buf was successfully allocated is missing.

3. Do you have any benchmarks for encrypted images?  Having to allocate
the bounce buffer for potentially every single cluster sounds rather bad
to me.

Hmm, about this.

Now we have compress threads to do several compress operations in parallel, and I plan to do the same for decompression, encryption and decryption. So we'll definitely need several cluster-size buffers for all these operations. How many? The first thought is just as many as the maximum number of threads (or twice as many, to have in and out buffers for compression). But that is not quite right; consider, for example, encryption on write:

1. get a free thread for encryption (this may yield, waiting for a free thread, if the maximum number of threads is reached and all of them are busy)
2. get a buffer (per thread)
3. the thread performs the encryption
a. free the thread here?
4. write the encrypted data to the underlying file
b. or free the thread here?

Of course, we want to free the thread as soon as possible, at "a." rather than at "b.". That means we should free the buffer at "a." too, so we would have to copy the data into a locally allocated buffer, or, better, just use a local buffer from the beginning, so that a thread does not own its own buffer.

So, what are the options we have?

1. the simplest thing: just allocate buffers for each request locally, as is already done for compression.
pros: simple
cons: allocating/freeing every time may hurt performance, as you noticed

2. keep a pool of such buffers, so that a freed buffer is just queued onto a list of free buffers (see the sketch after the test results below)
pros:
   reduces the number of real alloc/free calls
   we can limit the maximum pool size
   we can allocate new buffers on demand instead of the whole pool at start
cons:
   hmm, it looks like a custom allocator. Is it really needed? Is it really faster than just using the system allocator and calling alloc/free every time we need a buffer?

3. shouldn't we use g_slice_alloc instead of reinventing it in "2."?

So, I've decided to do some tests, and here are the results (memcpy is for 32K, everything else allocates 64K in a loop, list_alloc is a simple implementation of "2."):

nothing: 200000000 it / 0.301361 sec = 663655696.202532 it/s
memcpy: 2000000 it / 1.633015 sec = 1224728.554970 it/s
g_malloc: 200000000 it / 5.346707 sec = 37406202.540530 it/s
g_slice_alloc: 200000000 it / 7.812036 sec = 25601520.402792 it/s
list_alloc: 200000000 it / 5.432771 sec = 36813626.268630 it/s
posix_memalign: 20000000 it / 1.025543 sec = 19501864.376084 it/s
aligned_alloc: 20000000 it / 1.035639 sec = 19311752.149739 it/s

So you see that, yes, list_alloc is twice as fast as posix_memalign, but on the other hand a simple memcpy is a lot slower (16 times slower), not to mention real disk I/O, which will be slower still. So, should we care about the allocations at all, what do you think?
The test is attached.
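
For reference, a minimal sketch of the free-list pool from option 2 (presumably roughly what list_alloc in the attached test does; names and sizes here are invented, and a real version would need locking or per-thread lists):

    #include <stdlib.h>

    #define POOL_BUF_SIZE  (64 * 1024)
    #define POOL_ALIGNMENT 4096

    typedef struct PoolBuf {
        struct PoolBuf *next;
    } PoolBuf;

    static PoolBuf *pool_free_list;

    static void *pool_buf_get(void)
    {
        void *p = NULL;

        if (pool_free_list) {
            PoolBuf *buf = pool_free_list;
            pool_free_list = buf->next;
            return buf;
        }
        /* pool is empty: fall back to a real aligned allocation */
        if (posix_memalign(&p, POOL_ALIGNMENT, POOL_BUF_SIZE) != 0) {
            return NULL;
        }
        return p;
    }

    static void pool_buf_put(void *p)
    {
        /* reuse the first bytes of the buffer as the free-list link */
        PoolBuf *buf = p;

        buf->next = pool_free_list;
        pool_free_list = buf;
    }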





+        qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
+    } else {
+        qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);


qcow2_co_preadv() continues to do this as well.  That's superfluous now.



+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+    ret = bdrv_co_preadv(bs->file,
+                         file_cluster_offset + offset_in_cluster,
+                         bytes, &hd_qiov, 0);
+    if (ret < 0) {
+        goto out;
+    }
+
+    if (bs->encrypted) {
+        assert(s->crypto);
+        assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+        assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+        if (qcrypto_block_decrypt(s->crypto,
+                                  (s->crypt_physical_offset ?
+                                   file_cluster_offset + offset_in_cluster :
+                                   offset),
+                                  crypt_buf,
+                                  bytes,
+                                  NULL) < 0) {


What's the reason against decrypting this in-place in qiov so we could
save the bounce buffer?  We allow offsets in clusters, so why can't we
just call this function once per involved I/O vector entry?

Max



+            ret = -EIO;
+            goto out;
+        }
+        qemu_iovec_from_buf(qiov, qiov_offset, crypt_buf, bytes);
+    }
+
+out:
+    qemu_vfree(crypt_buf);
+    qemu_iovec_destroy(&hd_qiov);
+
+    return ret;
+}
+
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
                                         uint64_t bytes, QEMUIOVector *qiov,
                                         int flags)
@@ -1819,7 +1880,6 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
     uint64_t cluster_offset = 0;
     uint64_t bytes_done = 0;
     QEMUIOVector hd_qiov;
-    uint8_t *cluster_data = NULL;

     qemu_iovec_init(&hd_qiov, qiov->niov);

@@ -1882,57 +1942,12 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
         case QCOW2_CLUSTER_NORMAL:
             qemu_co_mutex_unlock(&s->lock);

-            if ((cluster_offset & 511) != 0) {
-                ret = -EIO;
-                goto fail_nolock;
-            }
-
-            if (bs->encrypted) {
-                assert(s->crypto);
-
-                /*
-                 * For encrypted images, read everything into a temporary
-                 * contiguous buffer on which the AES functions can work.
-                 */
-                if (!cluster_data) {
-                    cluster_data =
-                        qemu_try_blockalign(bs->file->bs,
-                                            QCOW_MAX_CRYPT_CLUSTERS
-                                            * s->cluster_size);
-                    if (cluster_data == NULL) {
-                        ret = -ENOMEM;
-                        goto fail_nolock;
-                    }
-                }
-
-                assert(cur_bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
-                qemu_iovec_reset(&hd_qiov);
-                qemu_iovec_add(&hd_qiov, cluster_data, cur_bytes);
-            }
-
-            BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-            ret = bdrv_co_preadv(bs->file,
-                                 cluster_offset + offset_in_cluster,
-                                 cur_bytes, &hd_qiov, 0);
+            ret = qcow2_co_preadv_normal(bs, cluster_offset,
+                                         offset, cur_bytes, qiov, bytes_done);
             if (ret < 0) {
                 goto fail_nolock;
             }
-            if (bs->encrypted) {
-                assert(s->crypto);
-                assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
-                assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-                if (qcrypto_block_decrypt(s->crypto,
-                                          (s->crypt_physical_offset ?
-                                           cluster_offset + offset_in_cluster :
-                                           offset),
-                                          cluster_data,
-                                          cur_bytes,
-                                          NULL) < 0) {
-                    ret = -EIO;
-                    goto fail_nolock;
-                }
-                qemu_iovec_from_buf(qiov, bytes_done, cluster_data, cur_bytes);
-            }
+
             qemu_co_mutex_lock(&s->lock);
             break;

@@ -1953,7 +1968,6 @@ fail:

 fail_nolock:
     qemu_iovec_destroy(&hd_qiov);
-    qemu_vfree(cluster_data);

     return ret;
 }

--
Best regards,
Vladimir
Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Max Reitz 7 years, 3 months ago
On 01.11.18 13:17, Vladimir Sementsov-Ogievskiy wrote:
> 27.09.2018 20:35, Max Reitz wrote:
>> On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
>>> Memory allocation may become less efficient for encrypted case. It's a
>>> payment for further asynchronous scheme.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>>>  block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>>>  1 file changed, 64 insertions(+), 50 deletions(-)
>>>
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index 65e3ca00e2..5e7f2ee318 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>>> @@ -1808,6 +1808,67 @@ out:
>>>      return ret;
>>>  }
>>>  
>>> +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>>> +                                               uint64_t file_cluster_offset,
>>> +                                               uint64_t offset,
>>> +                                               uint64_t bytes,
>>> +                                               QEMUIOVector *qiov,
>>> +                                               uint64_t qiov_offset)
>>> +{
>>> +    int ret;
>>> +    BDRVQcow2State *s = bs->opaque;
>>> +    void *crypt_buf = NULL;
>>> +    QEMUIOVector hd_qiov;
>>> +    int offset_in_cluster = offset_into_cluster(s, offset);
>>> +
>>> +    if ((file_cluster_offset & 511) != 0) {
>>> +        return -EIO;
>>> +    }
>>> +
>>> +    qemu_iovec_init(&hd_qiov, qiov->niov);
>> So you're not just re-allocating the bounce buffer for every single
>> call, but also this I/O vector.  Hm.
>>
>>> +    if (bs->encrypted) {
>>> +        assert(s->crypto);
>>> +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>>> +
>>> +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
>> 1. Why did you remove the comment?
>>
>> 2. The check for whether crypt_buf was successfully allocated is missing.
>>
>> 3. Do you have any benchmarks for encrypted images?  Having to allocate
>> the bounce buffer for potentially every single cluster sounds rather bad
>> to me.
> 
> Hmm, about this.
> 
> Now we have compress threads to do several compress operations in
> parallel. And I plan to do the same thing for decompression, encryption
> and decryption. So, we'll definitely need several cluster-size buffers
> to do all these operations. How many? The first thought is just as many
> as maximum number of threads (or twice as many, to have in and out
> buffers for compression). But it's wrong, consider for example
> encryption on write:
> 
> 1. get free thread for encryption (may be yield if maximum thread number
> achieved, waiting until all threads are busy)
> 2. get buffer (per thread)
> 3. thread handles encryption
> a. free thread here?
> 4. write encrypted data to underlying file
> b. free thread here?
> 
> Of course, we want free thread as soon as possible, in "a." not in "b.".
> And this mean, that we should free buffer in "a." too, so we should copy
> data to locally allocated buffer, or, better, we just use local buffer
> from the beginning, and thread don't own it's own buffer..
> 
> So, what are the options we have?
> 
> 1. the most simple thing: just allocate buffers for each request
> locally, like it is already done for compression.
> pros: simple
> cons: allocate/free every time may influence performance, as you noticed
> 
> 2. keep a pool of such buffers, so when buffer freed, it's just queued
> to the list of free buffers
> pros:
>    reduce number of real alloc/free calls
>    we can limit the pool maximum size
>    we can allocate new buffers on demand, not the whole pool at start
> cons:
>    hmm, so, it looks like custom allocator. Is it really needed? Is it
> really faster than just use system allocator and call alloc/free every
> time we need a buffer?
> 
> 3. should not we use g_slice_alloc instead of inventing it in "2."
> 
> So, I've decided to do some tests, and here are results (memcpy is for
> 32K, all other things allocates 64K in a loop, list_alloc is a simple
> realization of "2.":
> 
> nothing: 200000000 it / 0.301361 sec = 663655696.202532 it/s
> memcpy: 2000000 it / 1.633015 sec = 1224728.554970 it/s
> g_malloc: 200000000 it / 5.346707 sec = 37406202.540530 it/s
> g_slice_alloc: 200000000 it / 7.812036 sec = 25601520.402792 it/s
> list_alloc: 200000000 it / 5.432771 sec = 36813626.268630 it/s
> posix_memalign: 20000000 it / 1.025543 sec = 19501864.376084 it/s
> aligned_alloc: 20000000 it / 1.035639 sec = 19311752.149739 it/s
> 
> So, you see that yes, list_alloc is twice as fast as posix_memalign, but
> on the other hand, simple memcpy is a lot slower (16 times slower) (and
> I don't say about real disk IO which will be even more slower), so,
> should we care about allocation, what do you thing?
> test is attached.

Agreed.  Thanks for testing.

I am a bit worried about the maximum memory usage still.  With 16
workers, having bounce buffers can mean up to 32 MB of memory usage with
the default cluster size (and 1 GB with 2 MB clusters).  Then again, it
should be easy to limit memory usage even without a custom pool (just
some variable that says how much memory there is left).  Also, it'd be
OK to do that on top.
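
Just to illustrate the kind of accounting I mean (names and the budget value are invented; a real version would probably wait on a CoQueue instead of making the caller retry):

    static size_t bounce_mem_left = 32 * 1024 * 1024;   /* assumed budget */

    static void *bounce_buf_get(BlockDriverState *bs, size_t bytes)
    {
        void *buf;

        if (bytes > bounce_mem_left) {
            return NULL;    /* caller waits/retries or falls back */
        }
        bounce_mem_left -= bytes;

        buf = qemu_try_blockalign(bs->file->bs, bytes);
        if (buf == NULL) {
            bounce_mem_left += bytes;   /* give the budget back */
        }
        return buf;
    }

    static void bounce_buf_put(void *buf, size_t bytes)
    {
        qemu_vfree(buf);
        bounce_mem_left += bytes;
    }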

Max

Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Kevin Wolf 7 years, 3 months ago
(Broken quoting in text/plain again)

On 01.11.2018 at 13:17, Vladimir Sementsov-Ogievskiy wrote:
> 27.09.2018 20:35, Max Reitz wrote:
> 
>     On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
> 
>         Memory allocation may become less efficient for encrypted case. It's a
>         payment for further asynchronous scheme.
> 
>         Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>         ---
>          block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>          1 file changed, 64 insertions(+), 50 deletions(-)
> 
>         diff --git a/block/qcow2.c b/block/qcow2.c
>         index 65e3ca00e2..5e7f2ee318 100644
>         --- a/block/qcow2.c
>         +++ b/block/qcow2.c
>         @@ -1808,6 +1808,67 @@ out:
>              return ret;
>          }
> 
>         +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>         +                                               uint64_t file_cluster_offset,
>         +                                               uint64_t offset,
>         +                                               uint64_t bytes,
>         +                                               QEMUIOVector *qiov,
>         +                                               uint64_t qiov_offset)
>         +{
>         +    int ret;
>         +    BDRVQcow2State *s = bs->opaque;
>         +    void *crypt_buf = NULL;
>         +    QEMUIOVector hd_qiov;
>         +    int offset_in_cluster = offset_into_cluster(s, offset);
>         +
>         +    if ((file_cluster_offset & 511) != 0) {
>         +        return -EIO;
>         +    }
>         +
>         +    qemu_iovec_init(&hd_qiov, qiov->niov);
> 
>     So you're not just re-allocating the bounce buffer for every single
>     call, but also this I/O vector.  Hm.

And this one is actually at least a little more serious, I think.

Decryption and decompression (including copying between the original
qiov and the bounce buffer) are already very slow, so allocating a
bounce buffer or not shouldn't make any noticeable difference.

But I'd like to keep allocations out of the fast path as much as we can.

>         +    if (bs->encrypted) {
>         +        assert(s->crypto);
>         +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>         +
>         +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
> 
>     1. Why did you remove the comment?
> 
>     2. The check for whether crypt_buf was successfully allocated is missing.
> 
>     3. Do you have any benchmarks for encrypted images?  Having to allocate
>     the bounce buffer for potentially every single cluster sounds rather bad
>     to me.

Actually, benchmarks for normal, fully allocated images would be a
little more interesting. Though I'm not sure if qcow2 actually performs
well enough that we'll see any difference even there (when we measured
memory allocation overhead etc. for raw images in the context of
comparing coroutines to AIO callback styles, the difference was already
hard to see).

Kevin

Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Vladimir Sementsov-Ogievskiy 7 years, 3 months ago
07.11.2018 21:16, Kevin Wolf wrote:
> (Broken quoting in text/plain again)
>
> Am 01.11.2018 um 13:17 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 27.09.2018 20:35, Max Reitz wrote:
>>
>>      On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
>>
>>          Memory allocation may become less efficient for encrypted case. It's a
>>          payment for further asynchronous scheme.
>>
>>          Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>          ---
>>           block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>>           1 file changed, 64 insertions(+), 50 deletions(-)
>>
>>          diff --git a/block/qcow2.c b/block/qcow2.c
>>          index 65e3ca00e2..5e7f2ee318 100644
>>          --- a/block/qcow2.c
>>          +++ b/block/qcow2.c
>>          @@ -1808,6 +1808,67 @@ out:
>>               return ret;
>>           }
>>
>>          +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>>          +                                               uint64_t file_cluster_offset,
>>          +                                               uint64_t offset,
>>          +                                               uint64_t bytes,
>>          +                                               QEMUIOVector *qiov,
>>          +                                               uint64_t qiov_offset)
>>          +{
>>          +    int ret;
>>          +    BDRVQcow2State *s = bs->opaque;
>>          +    void *crypt_buf = NULL;
>>          +    QEMUIOVector hd_qiov;
>>          +    int offset_in_cluster = offset_into_cluster(s, offset);
>>          +
>>          +    if ((file_cluster_offset & 511) != 0) {
>>          +        return -EIO;
>>          +    }
>>          +
>>          +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>
>>      So you're not just re-allocating the bounce buffer for every single
>>      call, but also this I/O vector.  Hm.
> And this one is actually at least a little more serious, I think.
>
> Decryption and decompression (including copying between the original
> qiov and the bounce buffer) are already very slow, so allocating a
> bounce buffer or not shouldn't make any noticable difference.
>
> But I'd like to keep allocations out of the fast path as much as we can.
>
>>          +    if (bs->encrypted) {
>>          +        assert(s->crypto);
>>          +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>>          +
>>          +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
>>
>>      1. Why did you remove the comment?
>>
>>      2. The check for whether crypt_buf was successfully allocated is missing.
>>
>>      3. Do you have any benchmarks for encrypted images?  Having to allocate
>>      the bounce buffer for potentially every single cluster sounds rather bad
>>      to me.
> Actually, benchmarks for normal, fully allocated images would be a
> little more interesting. Though I'm not sure if qcow2 actually performs
> well enough that we'll see any difference even there (when we measured
> memory allocation overhead etc. for raw images in the context of
> comparing coroutines to AIO callback styes, the difference was already
> hard to see).

Ok, I'll benchmark it and/or improve the fast path (you mean the code path
where we skip coroutine creation and handle the whole request at once, yes?).

Hmm, all this stuff with hd_qiov in many places exists because we don't have
a qiov_offset parameter in .bdrv_co_preadv. Would it be a bad idea to add one?
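
Something like this, purely hypothetical, just to show the idea (this callback does not exist today):

    /* Variant of the callback that takes a qiov_offset, so drivers could
     * address a sub-range of the caller's qiov directly instead of
     * building a temporary hd_qiov with qemu_iovec_concat(): */
    int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            size_t qiov_offset,
                                            int flags);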

Anyway, I now think it's better to start with the decompress-threads series,
as compress-threads are already merged, then bring encryption to threads too,
and then return to this series. That will remove some of the questions about
allocation, and maybe it would be simpler and more consistent to bring more
things to coroutines, not only normal clusters.

>
> Kevin


-- 
Best regards,
Vladimir

Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Kevin Wolf 7 years, 3 months ago
On 08.11.2018 at 11:02, Vladimir Sementsov-Ogievskiy wrote:
> 07.11.2018 21:16, Kevin Wolf wrote:
> > (Broken quoting in text/plain again)
> >
> > Am 01.11.2018 um 13:17 hat Vladimir Sementsov-Ogievskiy geschrieben:
> >> 27.09.2018 20:35, Max Reitz wrote:
> >>
> >>      On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
> >>
> >>          Memory allocation may become less efficient for encrypted case. It's a
> >>          payment for further asynchronous scheme.
> >>
> >>          Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> >>          ---
> >>           block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
> >>           1 file changed, 64 insertions(+), 50 deletions(-)
> >>
> >>          diff --git a/block/qcow2.c b/block/qcow2.c
> >>          index 65e3ca00e2..5e7f2ee318 100644
> >>          --- a/block/qcow2.c
> >>          +++ b/block/qcow2.c
> >>          @@ -1808,6 +1808,67 @@ out:
> >>               return ret;
> >>           }
> >>
> >>          +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
> >>          +                                               uint64_t file_cluster_offset,
> >>          +                                               uint64_t offset,
> >>          +                                               uint64_t bytes,
> >>          +                                               QEMUIOVector *qiov,
> >>          +                                               uint64_t qiov_offset)
> >>          +{
> >>          +    int ret;
> >>          +    BDRVQcow2State *s = bs->opaque;
> >>          +    void *crypt_buf = NULL;
> >>          +    QEMUIOVector hd_qiov;
> >>          +    int offset_in_cluster = offset_into_cluster(s, offset);
> >>          +
> >>          +    if ((file_cluster_offset & 511) != 0) {
> >>          +        return -EIO;
> >>          +    }
> >>          +
> >>          +    qemu_iovec_init(&hd_qiov, qiov->niov);
> >>
> >>      So you're not just re-allocating the bounce buffer for every single
> >>      call, but also this I/O vector.  Hm.
> > And this one is actually at least a little more serious, I think.
> >
> > Decryption and decompression (including copying between the original
> > qiov and the bounce buffer) are already very slow, so allocating a
> > bounce buffer or not shouldn't make any noticable difference.
> >
> > But I'd like to keep allocations out of the fast path as much as we can.
> >
> >>          +    if (bs->encrypted) {
> >>          +        assert(s->crypto);
> >>          +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
> >>          +
> >>          +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
> >>
> >>      1. Why did you remove the comment?
> >>
> >>      2. The check for whether crypt_buf was successfully allocated is missing.
> >>
> >>      3. Do you have any benchmarks for encrypted images?  Having to allocate
> >>      the bounce buffer for potentially every single cluster sounds rather bad
> >>      to me.
> > Actually, benchmarks for normal, fully allocated images would be a
> > little more interesting. Though I'm not sure if qcow2 actually performs
> > well enough that we'll see any difference even there (when we measured
> > memory allocation overhead etc. for raw images in the context of
> > comparing coroutines to AIO callback styes, the difference was already
> > hard to see).
> 
> Ok, I'll benchmark it and/or improve fast path (you mean code path, 
> where we skip coroutines creation, to handle the whole request at once, 
> yes?).

I had not so much the specific code path in mind, but rather the case of
reading unallocated clusters or reading/writing an already allocated area of
normal or zero clusters (no compression, no encryption, no other fancy
stuff). These are the cases that qcow2 images will hit the most in
practice.

> Hmm, all this staff with hd_qiov's in many places is due to we don't
> have qiov_offset parameter in .bdrv_co_preadv.. Is it a bad idea to
> add it?

I actually thought the same yesterday, though I'm not completely sure
yet about it. It makes the interfaces a bit uglier, but it could indeed
save us some work in the I/O path, so unless we find bigger reasons
against it, maybe we should do that.

> Anyway, I now think, it's better to start with decompress-threads 
> series, as compress-threads are already merged, then bring encryption to 
> threads too, and then return to these series. This way will remove some 
> questions about allocation, and may be it would be simpler and more 
> consistent to bring more things to coroutines, not only normal clusters.

Okay, we can do that.

Kevin

Re: [Qemu-devel] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv
Posted by Vladimir Sementsov-Ogievskiy 7 years, 3 months ago
08.11.2018 13:33, Kevin Wolf wrote:
> Am 08.11.2018 um 11:02 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 07.11.2018 21:16, Kevin Wolf wrote:
>>> (Broken quoting in text/plain again)
>>>
>>> Am 01.11.2018 um 13:17 hat Vladimir Sementsov-Ogievskiy geschrieben:
>>>> 27.09.2018 20:35, Max Reitz wrote:
>>>>
>>>>       On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:
>>>>
>>>>           Memory allocation may become less efficient for encrypted case. It's a
>>>>           payment for further asynchronous scheme.
>>>>
>>>>           Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>           ---
>>>>            block/qcow2.c | 114 ++++++++++++++++++++++++++++++++--------------------------
>>>>            1 file changed, 64 insertions(+), 50 deletions(-)
>>>>
>>>>           diff --git a/block/qcow2.c b/block/qcow2.c
>>>>           index 65e3ca00e2..5e7f2ee318 100644
>>>>           --- a/block/qcow2.c
>>>>           +++ b/block/qcow2.c
>>>>           @@ -1808,6 +1808,67 @@ out:
>>>>                return ret;
>>>>            }
>>>>
>>>>           +static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
>>>>           +                                               uint64_t file_cluster_offset,
>>>>           +                                               uint64_t offset,
>>>>           +                                               uint64_t bytes,
>>>>           +                                               QEMUIOVector *qiov,
>>>>           +                                               uint64_t qiov_offset)
>>>>           +{
>>>>           +    int ret;
>>>>           +    BDRVQcow2State *s = bs->opaque;
>>>>           +    void *crypt_buf = NULL;
>>>>           +    QEMUIOVector hd_qiov;
>>>>           +    int offset_in_cluster = offset_into_cluster(s, offset);
>>>>           +
>>>>           +    if ((file_cluster_offset & 511) != 0) {
>>>>           +        return -EIO;
>>>>           +    }
>>>>           +
>>>>           +    qemu_iovec_init(&hd_qiov, qiov->niov);
>>>>
>>>>       So you're not just re-allocating the bounce buffer for every single
>>>>       call, but also this I/O vector.  Hm.
>>> And this one is actually at least a little more serious, I think.
>>>
>>> Decryption and decompression (including copying between the original
>>> qiov and the bounce buffer) are already very slow, so allocating a
>>> bounce buffer or not shouldn't make any noticable difference.
>>>
>>> But I'd like to keep allocations out of the fast path as much as we can.
>>>
>>>>           +    if (bs->encrypted) {
>>>>           +        assert(s->crypto);
>>>>           +        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>>>>           +
>>>>           +        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
>>>>
>>>>       1. Why did you remove the comment?
>>>>
>>>>       2. The check for whether crypt_buf was successfully allocated is missing.
>>>>
>>>>       3. Do you have any benchmarks for encrypted images?  Having to allocate
>>>>       the bounce buffer for potentially every single cluster sounds rather bad
>>>>       to me.
>>> Actually, benchmarks for normal, fully allocated images would be a
>>> little more interesting. Though I'm not sure if qcow2 actually performs
>>> well enough that we'll see any difference even there (when we measured
>>> memory allocation overhead etc. for raw images in the context of
>>> comparing coroutines to AIO callback styes, the difference was already
>>> hard to see).
>>
>> Ok, I'll benchmark it and/or improve fast path (you mean code path,
>> where we skip coroutines creation, to handle the whole request at once,
>> yes?).
> 
> I had less the specific code path in mind, but the case of reading
> unallocated clusters or reading/writing an already allocated area of
> normal or zero clusters (no compression, no encrpytion, no other fancy
> stuff). These are the cases that qcow2 images will hit the most in
> practice.

There are already some benchmarks in the thread started from the cover 
letter of the series.

> 
>> Hmm, all this staff with hd_qiov's in many places is due to we don't
>> have qiov_offset parameter in .bdrv_co_preadv.. Is it a bad idea to
>> add it?
> 
> I actually thought the same yesterday, though I'm not completely sure
> yet about it. It makes the interfaces a bit uglier, but it could indeed
> save us some work in the I/O path, so unless we find bigger reasons
> against it, maybe we should do that.
> 
>> Anyway, I now think, it's better to start with decompress-threads
>> series, as compress-threads are already merged, then bring encryption to
>> threads too, and then return to these series. This way will remove some
>> questions about allocation, and may be it would be simpler and more
>> consistent to bring more things to coroutines, not only normal clusters.
> 
> Okay, we can do that.
> 
> Kevin
> 


-- 
Best regards,
Vladimir