[v1] Add qemu-img checksum command using blkhash

[PATCH 3/3] qemu-img: Speed up checksum

Posted by Nir Soffer 3 years, 5 months ago

Add coroutine based loop inspired by `qemu-img convert` design.

Changes compared to `qemu-img convert`:

- State for the entire image is kept in ImgChecksumState

- State for single worker coroutine is kept in ImgChecksumworker.

- "Writes" are always in-order, ensured using a queue.

- Calling block status once per image extent, when the current extent is
  consumed by the workers.

- Using 1m buffer size - testings shows that this gives best read
  performance both with buffered and direct I/O.

- Number of coroutines is not configurable. Testing does not show
  improvement when using more than 8 coroutines.

- Progress include entire image, not only the allocated state.

Comparing to the simple read loop shows that this version is up to 4.67
times faster when computing a checksum for an image full of zeroes. For
real images it is 1.59 times faster with direct I/O, and with buffered
I/O there is no difference.

Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:

| image    | size | i/o       | before         | after          | change |
|----------|------|-----------|----------------|----------------|--------|
| zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67 |
| zero     |   6g | direct    | 4.684s ±0.093s | 2.211s ±0.009s |  x2.12 |
| real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02 |
| real     |   6g | direct    | 3.094s ±0.079s | 1.947s ±0.017s |  x1.59 |
| nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36 |
| nbd      |   6g | direct    | 3.540s ±0.020s | 1.749s ±0.018s |  x2.02 |

[1] raw image full of zeroes
[2] raw fedora 35 image with additional random data, 50% full
[3] image [2] exported by qemu-nbd via unix socket

Signed-off-by: Nir Soffer <nsoffer@redhat.com>
---
 qemu-img.c | 343 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 270 insertions(+), 73 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 7edcfe4bc8..bfa8e2862f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1613,48 +1613,288 @@ out:
     qemu_vfree(buf2);
     blk_unref(blk2);
 out2:
     blk_unref(blk1);
 out3:
     qemu_progress_end();
     return ret;
 }
 
 #ifdef CONFIG_BLKHASH
+
+#define CHECKSUM_COROUTINES 8
+#define CHECKSUM_BUF_SIZE (1 * MiB)
+#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
+
+typedef struct ImgChecksumState ImgChecksumState;
+
+typedef struct ImgChecksumWorker {
+    QTAILQ_ENTRY(ImgChecksumWorker) entry;
+    ImgChecksumState *state;
+    Coroutine *co;
+    uint8_t *buf;
+
+    /* The current chunk. */
+    int64_t offset;
+    int64_t length;
+    bool zero;
+
+    /* Always true for zero extent, false for data extent. Set to true
+     * when reading the chunk completes. */
+    bool ready;
+} ImgChecksumWorker;
+
+struct ImgChecksumState {
+    const char *filename;
+    BlockBackend *blk;
+    BlockDriverState *bs;
+    int64_t total_size;
+
+    /* Current extent, modified in checksum_co_next. */
+    int64_t offset;
+    int64_t length;
+    bool zero;
+
+    int running_coroutines;
+    CoMutex lock;
+    ImgChecksumWorker workers[CHECKSUM_COROUTINES];
+
+    /* Ensure in-order updates. Update are scheduled at the tail of the
+     * queue and processed from the head of the queue when a worker is
+     * ready. */
+    QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
+
+    struct blkhash *hash;
+    int ret;
+};
+
+static int checksum_block_status(ImgChecksumState *s)
+{
+    int64_t length;
+    int status;
+
+    /* Must be called when current extent is consumed. */
+    assert(s->length == 0);
+
+    status = bdrv_block_status_above(s->bs, NULL, s->offset,
+                                     s->total_size - s->offset, &length, NULL,
+                                     NULL);
+    if (status < 0) {
+        error_report("Error checking status at offset %" PRId64 " for %s",
+                     s->offset, s->filename);
+        s->ret = status;
+        return -1;
+    }
+
+    assert(length > 0);
+
+    s->length = length;
+    s->zero = !!(status & BDRV_BLOCK_ZERO);
+
+    return 0;
+}
+
+/**
+ * Grab the next chunk from the current extent, getting the next extent if
+ * needed, and schecule the next update at the end fo the update queue.
+ *
+ * Retrun true if the worker has work to do, false if the worker has
+ * finished or there was an error getting the next extent.
+ */
+static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+
+    qemu_co_mutex_lock(&s->lock);
+
+    if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
+        qemu_co_mutex_unlock(&s->lock);
+        return false;
+    }
+
+    if (s->length == 0 && checksum_block_status(s)) {
+        qemu_co_mutex_unlock(&s->lock);
+        return false;
+    }
+
+    /* Grab one chunk from current extent. */
+    w->offset = s->offset;
+    w->length = MIN(s->length,
+                    s->zero ?  CHECKSUM_ZERO_SIZE : CHECKSUM_BUF_SIZE);
+    w->zero = s->zero;
+    w->ready = s->zero;
+
+    /* Advance state to the next chunk. */
+    s->offset += w->length;
+    s->length -= w->length;
+
+    /* Schedule this chunk update. */
+    QTAILQ_INSERT_TAIL(&s->update_queue, w, entry);
+
+    qemu_co_mutex_unlock(&s->lock);
+
+    return true;
+}
+
+/* Wait until this chunk is in the head of the update queue. */
+static void coroutine_fn checksum_co_wait_for_update(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+
+    /* Must be called only when we are ready to update the hash. */
+    assert(w->ready);
+
+    while (QTAILQ_FIRST(&s->update_queue) != w) {
+        qemu_coroutine_yield();
+    }
+
+    QTAILQ_REMOVE(&s->update_queue, w, entry);
+}
+
+/* Enter the next worker coroutine if the worker is ready. */
+static void coroutine_fn checksum_co_enter_next(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+    ImgChecksumWorker *next;
+
+    if (!QTAILQ_EMPTY(&s->update_queue)) {
+        next = QTAILQ_FIRST(&s->update_queue);
+        if (next->ready)
+            qemu_coroutine_enter(next->co);
+    }
+}
+
+static int coroutine_fn checksum_co_read(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+    int err;
+
+    err = blk_co_pread(s->blk, w->offset, w->length, w->buf, 0);
+    if (err < 0) {
+        error_report("Error reading at offset %" PRId64 " of %s: %s",
+                     w->offset, s->filename, strerror(-err));
+        /* Unschedule this chunk. */
+        QTAILQ_REMOVE(&s->update_queue, w, entry);
+        s->ret = err;
+        return -1;
+    }
+
+    w->ready = true;
+
+    return 0;
+}
+
+static int checksum_update_hash(ImgChecksumWorker *w)
+{
+    ImgChecksumState *s = w->state;
+    int err;
+
+    if (w->zero) {
+        err = blkhash_zero(s->hash, w->length);
+        if (err) {
+            error_report("Error zeroing hash at offset %" PRId64 " of %s: %s",
+                         w->offset, s->filename, strerror(err));
+            s->ret = -err;
+            return -1;
+        }
+    } else {
+        err = blkhash_update(s->hash, w->buf, w->length);
+        if (err) {
+            error_report("Error updating hash at offset %" PRId64 " of %s: %s",
+                         w->offset, s->filename, strerror(err));
+            s->ret = -err;
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static void coroutine_fn checksum_co_loop(void *opaque)
+{
+    ImgChecksumWorker *w = opaque;
+    ImgChecksumState *s = w->state;
+
+    s->running_coroutines++;
+    w->buf = blk_blockalign(s->blk, CHECKSUM_BUF_SIZE);
+
+    while (checksum_co_next(w)) {
+        if (!w->zero && checksum_co_read(w)) {
+            break;
+        }
+        checksum_co_wait_for_update(w);
+        if (s->ret == -EINPROGRESS) {
+            if (checksum_update_hash(w)) {
+                break;
+            }
+            qemu_progress_print(((float) w->length / s->total_size) * 100, 100);
+        }
+        checksum_co_enter_next(w);
+    }
+
+    qemu_vfree(w->buf);
+    w->co = NULL;
+    s->running_coroutines--;
+
+    if (s->running_coroutines == 0 && s->ret == -EINPROGRESS) {
+        /* the checksum job finished successfully */
+        s->ret = 0;
+    }
+}
+
+static int checksum_loop(ImgChecksumState *s)
+{
+    int i;
+
+    s->ret = -EINPROGRESS;
+
+    qemu_co_mutex_init(&s->lock);
+    QTAILQ_INIT(&s->update_queue);
+
+    for (i = 0; i < CHECKSUM_COROUTINES; i++) {
+        ImgChecksumWorker *w = &s->workers[i];
+        w->state = s;
+        w->co = qemu_coroutine_create(checksum_co_loop, w);
+        qemu_coroutine_enter(w->co);
+    }
+
+    while (s->running_coroutines) {
+        main_loop_wait(false);
+    }
+
+    return s->ret;
+}
+
 /*
  * Compute image checksum.
  */
 static int img_checksum(int argc, char **argv)
 {
     const char *digest_name = "sha256";
     const size_t block_size = 64 * KiB;
 
     const char *format = NULL;
     const char *cache = BDRV_DEFAULT_CACHE;
-    const char *filename;
-    BlockBackend *blk;
-    BlockDriverState *bs;
-    uint8_t *buf = NULL;
-    int64_t pnum;
     bool progress = false;
+    bool image_opts = false;
+
     int flags = 0;
-    bool writethrough;
-    int64_t total_size;
-    int64_t offset = 0;
+    bool writethrough = false;
     int c;
-    bool image_opts = false;
-    struct blkhash *h = NULL;
-    unsigned char digest[64];
-    unsigned int digest_len;
+
     int ret = 1;
     int err;
 
+    ImgChecksumState s = {0};
+    unsigned char digest[64];
+    unsigned int digest_len;
+
     for (;;) {
         static const struct option long_options[] = {
             {"help", no_argument, 0, 'h'},
             {"object", required_argument, 0, OPTION_OBJECT},
             {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
             {0, 0, 0, 0}
         };
         c = getopt_long(argc, argv, ":hf:T:p",
                         long_options, NULL);
         if (c == -1) {
@@ -1697,119 +1937,76 @@ static int img_checksum(int argc, char **argv)
         case OPTION_IMAGE_OPTS:
             image_opts = true;
             break;
         }
     }
 
     if (optind != argc - 1) {
         error_exit("Expecting image file name");
     }
 
-    filename = argv[optind++];
+    s.filename = argv[optind++];
 
     err = bdrv_parse_cache_mode(cache, &flags, &writethrough);
     if (err < 0) {
         error_report("Invalid source cache option: %s", cache);
         return ret;
     }
 
-    blk = img_open(image_opts, filename, format, flags, writethrough, false,
-                   false);
-    if (!blk) {
+    s.blk = img_open(image_opts, s.filename, format, flags, writethrough,
+                     false, false);
+    if (!s.blk) {
         return ret;
     }
 
     /* Initialize before using goto out. */
     qemu_progress_init(progress, 2.0);
 
-    bs = blk_bs(blk);
-    buf = blk_blockalign(blk, IO_BUF_SIZE);
+    s.bs = blk_bs(s.blk);
 
-    total_size = blk_getlength(blk);
-    if (total_size < 0) {
+    s.total_size = blk_getlength(s.blk);
+    if (s.total_size < 0) {
         error_report("Can't get size of %s: %s",
-                     filename, strerror(-total_size));
+                     s.filename, strerror(-s.total_size));
         goto out;
     }
 
-    h = blkhash_new(block_size, digest_name);
-    if (!h) {
+    s.hash = blkhash_new(block_size, digest_name);
+    if (!s.hash) {
         error_report("Can't create blkhash: %s", strerror(errno));
         goto out;
     }
 
     qemu_progress_print(0, 100);
 
-    while (offset < total_size) {
-        int status;
-        int64_t chunk;
-
-        status = bdrv_block_status_above(bs, NULL, offset,
-                                         total_size - offset, &pnum, NULL,
-                                         NULL);
-        if (status < 0) {
-            error_report("Error checking status at offset %" PRId64 " for %s",
-                         offset, filename);
-            goto out;
-        }
-
-        assert(pnum);
-        chunk = pnum;
-
-        if (status & BDRV_BLOCK_ZERO) {
-            chunk = MIN(chunk, SIZE_MAX);
-            err = blkhash_zero(h, chunk);
-            if (err) {
-                error_report("Error zeroing hash at offset %" PRId64
-                             " of %s: %s",
-                             offset, filename, strerror(err));
-                goto out;
-            }
-        } else {
-            chunk = MIN(chunk, IO_BUF_SIZE);
-            err = blk_pread(blk, offset, chunk, buf, 0);
-            if (err < 0) {
-                error_report("Error reading at offset %" PRId64 " of %s: %s",
-                             offset, filename, strerror(-err));
-                goto out;
-            }
-            err = blkhash_update(h, buf, chunk);
-            if (err) {
-                error_report("Error updating hash at offset %" PRId64
-                             " of %s: %s",
-                             offset, filename, strerror(err));
-                goto out;
-            }
-        }
-
-        offset += chunk;
-        qemu_progress_print(((float) chunk / total_size) * 100, 100);
+    err = checksum_loop(&s);
+    if (err) {
+        goto out;
     }
 
-    err = blkhash_final(h, digest, &digest_len);
+    err = blkhash_final(s.hash, digest, &digest_len);
     if (err) {
         error_report("Error finalizing hash of %s: %s",
-                     filename, strerror(err));
+                     s.filename, strerror(err));
         goto out;
     }
 
     for (unsigned i = 0; i < digest_len; i++) {
         printf("%02x", digest[i]);
     }
-    printf("  %s%s", filename, progress ? "" : "\n");
+    printf("  %s%s", s.filename, progress ? "" : "\n");
 
     ret = 0;
 
 out:
-    blkhash_free(h);
-    qemu_vfree(buf);
-    blk_unref(blk);
+    blkhash_free(s.hash);
+    blk_unref(s.blk);
     qemu_progress_end();
 
     return ret;
 }
 #endif /* CONFIG_BLKHASH */
 
 /* Convenience wrapper around qmp_block_dirty_bitmap_merge */
 static void do_dirty_bitmap_merge(const char *dst_node, const char *dst_name,
                                   const char *src_node, const char *src_name,
                                   Error **errp)
-- 
2.37.2

Re: [PATCH 3/3] qemu-img: Speed up checksum

Posted by Hanna Reitz 3 years, 3 months ago

On 01.09.22 16:32, Nir Soffer wrote:
> Add coroutine based loop inspired by `qemu-img convert` design.
>
> Changes compared to `qemu-img convert`:
>
> - State for the entire image is kept in ImgChecksumState
>
> - State for single worker coroutine is kept in ImgChecksumworker.
>
> - "Writes" are always in-order, ensured using a queue.
>
> - Calling block status once per image extent, when the current extent is
>    consumed by the workers.
>
> - Using 1m buffer size - testings shows that this gives best read
>    performance both with buffered and direct I/O.

Why does patch 1 then choose to use 2 MB?

> - Number of coroutines is not configurable. Testing does not show
>    improvement when using more than 8 coroutines.
>
> - Progress include entire image, not only the allocated state.
>
> Comparing to the simple read loop shows that this version is up to 4.67
> times faster when computing a checksum for an image full of zeroes. For
> real images it is 1.59 times faster with direct I/O, and with buffered
> I/O there is no difference.
>
> Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:
>
> | image    | size | i/o       | before         | after          | change |
> |----------|------|-----------|----------------|----------------|--------|
> | zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67 |
> | zero     |   6g | direct    | 4.684s ±0.093s | 2.211s ±0.009s |  x2.12 |
> | real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02 |
> | real     |   6g | direct    | 3.094s ±0.079s | 1.947s ±0.017s |  x1.59 |
> | nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36 |
> | nbd      |   6g | direct    | 3.540s ±0.020s | 1.749s ±0.018s |  x2.02 |
>
> [1] raw image full of zeroes
> [2] raw fedora 35 image with additional random data, 50% full
> [3] image [2] exported by qemu-nbd via unix socket
>
> Signed-off-by: Nir Soffer <nsoffer@redhat.com>
> ---
>   qemu-img.c | 343 +++++++++++++++++++++++++++++++++++++++++------------
>   1 file changed, 270 insertions(+), 73 deletions(-)

Looks good!

Just a couple of style comments below.

> diff --git a/qemu-img.c b/qemu-img.c
> index 7edcfe4bc8..bfa8e2862f 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -1613,48 +1613,288 @@ out:
>       qemu_vfree(buf2);
>       blk_unref(blk2);
>   out2:
>       blk_unref(blk1);
>   out3:
>       qemu_progress_end();
>       return ret;
>   }
>   
>   #ifdef CONFIG_BLKHASH
> +
> +#define CHECKSUM_COROUTINES 8
> +#define CHECKSUM_BUF_SIZE (1 * MiB)
> +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
> +
> +typedef struct ImgChecksumState ImgChecksumState;
> +
> +typedef struct ImgChecksumWorker {
> +    QTAILQ_ENTRY(ImgChecksumWorker) entry;
> +    ImgChecksumState *state;
> +    Coroutine *co;
> +    uint8_t *buf;
> +
> +    /* The current chunk. */
> +    int64_t offset;
> +    int64_t length;
> +    bool zero;
> +
> +    /* Always true for zero extent, false for data extent. Set to true
> +     * when reading the chunk completes. */

Qemu codestyle requires /* and */ to be on separate lines for multi-line 
comments (see checkpatch.pl).

> +    bool ready;
> +} ImgChecksumWorker;
> +
> +struct ImgChecksumState {
> +    const char *filename;
> +    BlockBackend *blk;
> +    BlockDriverState *bs;
> +    int64_t total_size;
> +
> +    /* Current extent, modified in checksum_co_next. */
> +    int64_t offset;
> +    int64_t length;
> +    bool zero;
> +
> +    int running_coroutines;
> +    CoMutex lock;
> +    ImgChecksumWorker workers[CHECKSUM_COROUTINES];
> +
> +    /* Ensure in-order updates. Update are scheduled at the tail of the
> +     * queue and processed from the head of the queue when a worker is
> +     * ready. */

Qemu codestyle requires /* and */ to be on separate lines for multi-line 
comments.

> +    QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
> +
> +    struct blkhash *hash;
> +    int ret;
> +};

[...]

> +static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
> +{
> +    ImgChecksumState *s = w->state;
> +
> +    qemu_co_mutex_lock(&s->lock);
> +
> +    if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
> +        qemu_co_mutex_unlock(&s->lock);
> +        return false;
> +    }
> +
> +    if (s->length == 0 && checksum_block_status(s)) {

I’d prefer `checksum_block_status(s) < 0` so that it is clear that 
negative values indicate errors.  Otherwise, based on this code alone, 
it looks to me more like `checksum_block_status()` returned a boolean, 
where `false` would generally indicate an error, which is confusing.

Same in other places below.

> +        qemu_co_mutex_unlock(&s->lock);
> +        return false;
> +    }

[...]

> +/* Enter the next worker coroutine if the worker is ready. */
> +static void coroutine_fn checksum_co_enter_next(ImgChecksumWorker *w)
> +{
> +    ImgChecksumState *s = w->state;
> +    ImgChecksumWorker *next;
> +
> +    if (!QTAILQ_EMPTY(&s->update_queue)) {
> +        next = QTAILQ_FIRST(&s->update_queue);
> +        if (next->ready)
> +            qemu_coroutine_enter(next->co);

Qemu codestyle requires braces here.

Hanna

Re: [PATCH 3/3] qemu-img: Speed up checksum

Posted by Nir Soffer 3 years, 3 months ago

On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz <hreitz@redhat.com> wrote:

> On 01.09.22 16:32, Nir Soffer wrote:
> > Add coroutine based loop inspired by `qemu-img convert` design.
> >
> > Changes compared to `qemu-img convert`:
> >
> > - State for the entire image is kept in ImgChecksumState
> >
> > - State for single worker coroutine is kept in ImgChecksumworker.
> >
> > - "Writes" are always in-order, ensured using a queue.
> >
> > - Calling block status once per image extent, when the current extent is
> >    consumed by the workers.
> >
> > - Using 1m buffer size - testings shows that this gives best read
> >    performance both with buffered and direct I/O.
>
> Why does patch 1 then choose to use 2 MB?
>

The first patch uses sync I/O, and in this case 2 MB is a little faster.


> > - Number of coroutines is not configurable. Testing does not show
> >    improvement when using more than 8 coroutines.
> >
> > - Progress include entire image, not only the allocated state.
> >
> > Comparing to the simple read loop shows that this version is up to 4.67
> > times faster when computing a checksum for an image full of zeroes. For
> > real images it is 1.59 times faster with direct I/O, and with buffered
> > I/O there is no difference.
> >
> > Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:
> >
> > | image    | size | i/o       | before         | after          | change
> |
> >
> |----------|------|-----------|----------------|----------------|--------|
> > | zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s |  x4.67
> |
> > | zero     |   6g | direct    | 4.684s ±0.093s | 2.211s ±0.009s |  x2.12
> |
> > | real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s |  x1.02
> |
> > | real     |   6g | direct    | 3.094s ±0.079s | 1.947s ±0.017s |  x1.59
> |
> > | nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s |  x1.36
> |
> > | nbd      |   6g | direct    | 3.540s ±0.020s | 1.749s ±0.018s |  x2.02
> |
> >
> > [1] raw image full of zeroes
> > [2] raw fedora 35 image with additional random data, 50% full
> > [3] image [2] exported by qemu-nbd via unix socket
> >
> > Signed-off-by: Nir Soffer <nsoffer@redhat.com>
> > ---
> >   qemu-img.c | 343 +++++++++++++++++++++++++++++++++++++++++------------
> >   1 file changed, 270 insertions(+), 73 deletions(-)
>
> Looks good!
>
> Just a couple of style comments below.
>
> > diff --git a/qemu-img.c b/qemu-img.c
> > index 7edcfe4bc8..bfa8e2862f 100644
> > --- a/qemu-img.c
> > +++ b/qemu-img.c
> > @@ -1613,48 +1613,288 @@ out:
> >       qemu_vfree(buf2);
> >       blk_unref(blk2);
> >   out2:
> >       blk_unref(blk1);
> >   out3:
> >       qemu_progress_end();
> >       return ret;
> >   }
> >
> >   #ifdef CONFIG_BLKHASH
> > +
> > +#define CHECKSUM_COROUTINES 8
> > +#define CHECKSUM_BUF_SIZE (1 * MiB)
> > +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
> > +
> > +typedef struct ImgChecksumState ImgChecksumState;
> > +
> > +typedef struct ImgChecksumWorker {
> > +    QTAILQ_ENTRY(ImgChecksumWorker) entry;
> > +    ImgChecksumState *state;
> > +    Coroutine *co;
> > +    uint8_t *buf;
> > +
> > +    /* The current chunk. */
> > +    int64_t offset;
> > +    int64_t length;
> > +    bool zero;
> > +
> > +    /* Always true for zero extent, false for data extent. Set to true
> > +     * when reading the chunk completes. */
>
> Qemu codestyle requires /* and */ to be on separate lines for multi-line
> comments (see checkpatch.pl).
>

I'll change that. Do we have a good way to run checkpatch.pl when using
git-publish?

Maybe a way to run checkpatch.pl on all patches generated by git publish
automatically?


> > +    bool ready;
> > +} ImgChecksumWorker;
> > +
> > +struct ImgChecksumState {
> > +    const char *filename;
> > +    BlockBackend *blk;
> > +    BlockDriverState *bs;
> > +    int64_t total_size;
> > +
> > +    /* Current extent, modified in checksum_co_next. */
> > +    int64_t offset;
> > +    int64_t length;
> > +    bool zero;
> > +
> > +    int running_coroutines;
> > +    CoMutex lock;
> > +    ImgChecksumWorker workers[CHECKSUM_COROUTINES];
> > +
> > +    /* Ensure in-order updates. Update are scheduled at the tail of the
> > +     * queue and processed from the head of the queue when a worker is
> > +     * ready. */
>
> Qemu codestyle requires /* and */ to be on separate lines for multi-line
> comments.
>
> > +    QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
> > +
> > +    struct blkhash *hash;
> > +    int ret;
> > +};
>
> [...]
>
> > +static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
> > +{
> > +    ImgChecksumState *s = w->state;
> > +
> > +    qemu_co_mutex_lock(&s->lock);
> > +
> > +    if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
> > +        qemu_co_mutex_unlock(&s->lock);
> > +        return false;
> > +    }
> > +
> > +    if (s->length == 0 && checksum_block_status(s)) {
>
> I’d prefer `checksum_block_status(s) < 0` so that it is clear that
> negative values indicate errors.  Otherwise, based on this code alone,
> it looks to me more like `checksum_block_status()` returned a boolean,
> where `false` would generally indicate an error, which is confusing.
>

Sure, will change.


>
> Same in other places below.
>
> > +        qemu_co_mutex_unlock(&s->lock);
> > +        return false;
> > +    }
>
> [...]
>
> > +/* Enter the next worker coroutine if the worker is ready. */
> > +static void coroutine_fn checksum_co_enter_next(ImgChecksumWorker *w)
> > +{
> > +    ImgChecksumState *s = w->state;
> > +    ImgChecksumWorker *next;
> > +
> > +    if (!QTAILQ_EMPTY(&s->update_queue)) {
> > +        next = QTAILQ_FIRST(&s->update_queue);
> > +        if (next->ready)
> > +            qemu_coroutine_enter(next->co);
>
> Qemu codestyle requires braces here.
>

Will add in next version.

Before beautifying the first commit, should I squash this commit into it?

The first commit was an attempt to do the simplest possible implementation,
but since this commit looks good (with some style issues), do we really
need the
first one?

Nir

Re: [PATCH 3/3] qemu-img: Speed up checksum

Posted by Hanna Reitz 3 years, 3 months ago

On 30.10.22 18:38, Nir Soffer wrote:
> On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz <hreitz@redhat.com> wrote:
>
>     On 01.09.22 16:32, Nir Soffer wrote:
>     > Add coroutine based loop inspired by `qemu-img convert` design.
>     >
>     > Changes compared to `qemu-img convert`:
>     >
>     > - State for the entire image is kept in ImgChecksumState
>     >
>     > - State for single worker coroutine is kept in ImgChecksumworker.
>     >
>     > - "Writes" are always in-order, ensured using a queue.
>     >
>     > - Calling block status once per image extent, when the current
>     extent is
>     >    consumed by the workers.
>     >
>     > - Using 1m buffer size - testings shows that this gives best read
>     >    performance both with buffered and direct I/O.
>
>     Why does patch 1 then choose to use 2 MB?
>
>
> The first patch uses sync I/O, and in this case 2 MB is a little faster.

Interesting.

>     > - Number of coroutines is not configurable. Testing does not show
>     >    improvement when using more than 8 coroutines.
>     >
>     > - Progress include entire image, not only the allocated state.
>     >
>     > Comparing to the simple read loop shows that this version is up
>     to 4.67
>     > times faster when computing a checksum for an image full of
>     zeroes. For
>     > real images it is 1.59 times faster with direct I/O, and with
>     buffered
>     > I/O there is no difference.
>     >
>     > Test results on Dell PowerEdge R640 in a CentOS Stream 9 container:
>     >
>     > | image    | size | i/o       | before         | after         |
>     change |
>     >
>     |----------|------|-----------|----------------|----------------|--------|
>     > | zero [1] |   6g | buffered  | 1.600s ±0.014s | 0.342s ±0.016s
>     |  x4.67 |
>     > | zero     |   6g | direct    | 4.684s ±0.093s | 2.211s ±0.009s
>     |  x2.12 |
>     > | real [2] |   6g | buffered  | 1.841s ±0.075s | 1.806s ±0.036s
>     |  x1.02 |
>     > | real     |   6g | direct    | 3.094s ±0.079s | 1.947s ±0.017s
>     |  x1.59 |
>     > | nbd  [3] |   6g | buffered  | 2.455s ±0.183s | 1.808s ±0.016s
>     |  x1.36 |
>     > | nbd      |   6g | direct    | 3.540s ±0.020s | 1.749s ±0.018s
>     |  x2.02 |
>     >
>     > [1] raw image full of zeroes
>     > [2] raw fedora 35 image with additional random data, 50% full
>     > [3] image [2] exported by qemu-nbd via unix socket
>     >
>     > Signed-off-by: Nir Soffer <nsoffer@redhat.com>
>     > ---
>     >   qemu-img.c | 343
>     +++++++++++++++++++++++++++++++++++++++++------------
>     >   1 file changed, 270 insertions(+), 73 deletions(-)
>
>     Looks good!
>
>     Just a couple of style comments below.
>
>     > diff --git a/qemu-img.c b/qemu-img.c
>     > index 7edcfe4bc8..bfa8e2862f 100644
>     > --- a/qemu-img.c
>     > +++ b/qemu-img.c
>     > @@ -1613,48 +1613,288 @@ out:
>     >       qemu_vfree(buf2);
>     >       blk_unref(blk2);
>     >   out2:
>     >       blk_unref(blk1);
>     >   out3:
>     >       qemu_progress_end();
>     >       return ret;
>     >   }
>     >
>     >   #ifdef CONFIG_BLKHASH
>     > +
>     > +#define CHECKSUM_COROUTINES 8
>     > +#define CHECKSUM_BUF_SIZE (1 * MiB)
>     > +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX)
>     > +
>     > +typedef struct ImgChecksumState ImgChecksumState;
>     > +
>     > +typedef struct ImgChecksumWorker {
>     > +    QTAILQ_ENTRY(ImgChecksumWorker) entry;
>     > +    ImgChecksumState *state;
>     > +    Coroutine *co;
>     > +    uint8_t *buf;
>     > +
>     > +    /* The current chunk. */
>     > +    int64_t offset;
>     > +    int64_t length;
>     > +    bool zero;
>     > +
>     > +    /* Always true for zero extent, false for data extent. Set
>     to true
>     > +     * when reading the chunk completes. */
>
>     Qemu codestyle requires /* and */ to be on separate lines for
>     multi-line
>     comments (see checkpatch.pl <http://checkpatch.pl>).
>
>
> I'll change that. Do we have a good way to run checkpatch.pl 
> <http://checkpatch.pl/> when using
> git-publish?
>
> Maybe a way to run checkpatch.pl <http://checkpatch.pl> on all patches 
> generated by git publish
> automatically?

I don’t know myself, I don’t use git-publish.  Stefan should know. ;)

>
>     > +    bool ready;
>     > +} ImgChecksumWorker;
>     > +
>     > +struct ImgChecksumState {
>     > +    const char *filename;
>     > +    BlockBackend *blk;
>     > +    BlockDriverState *bs;
>     > +    int64_t total_size;
>     > +
>     > +    /* Current extent, modified in checksum_co_next. */
>     > +    int64_t offset;
>     > +    int64_t length;
>     > +    bool zero;
>     > +
>     > +    int running_coroutines;
>     > +    CoMutex lock;
>     > +    ImgChecksumWorker workers[CHECKSUM_COROUTINES];
>     > +
>     > +    /* Ensure in-order updates. Update are scheduled at the
>     tail of the
>     > +     * queue and processed from the head of the queue when a
>     worker is
>     > +     * ready. */
>
>     Qemu codestyle requires /* and */ to be on separate lines for
>     multi-line
>     comments.
>
>     > +    QTAILQ_HEAD(, ImgChecksumWorker) update_queue;
>     > +
>     > +    struct blkhash *hash;
>     > +    int ret;
>     > +};
>
>     [...]
>
>     > +static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w)
>     > +{
>     > +    ImgChecksumState *s = w->state;
>     > +
>     > +    qemu_co_mutex_lock(&s->lock);
>     > +
>     > +    if (s->offset == s->total_size || s->ret != -EINPROGRESS) {
>     > +        qemu_co_mutex_unlock(&s->lock);
>     > +        return false;
>     > +    }
>     > +
>     > +    if (s->length == 0 && checksum_block_status(s)) {
>
>     I’d prefer `checksum_block_status(s) < 0` so that it is clear that
>     negative values indicate errors.  Otherwise, based on this code
>     alone,
>     it looks to me more like `checksum_block_status()` returned a
>     boolean,
>     where `false` would generally indicate an error, which is confusing.
>
>
> Sure, will change.
>
>
>     Same in other places below.
>
>     > +        qemu_co_mutex_unlock(&s->lock);
>     > +        return false;
>     > +    }
>
>     [...]
>
>     > +/* Enter the next worker coroutine if the worker is ready. */
>     > +static void coroutine_fn
>     checksum_co_enter_next(ImgChecksumWorker *w)
>     > +{
>     > +    ImgChecksumState *s = w->state;
>     > +    ImgChecksumWorker *next;
>     > +
>     > +    if (!QTAILQ_EMPTY(&s->update_queue)) {
>     > +        next = QTAILQ_FIRST(&s->update_queue);
>     > +        if (next->ready)
>     > +            qemu_coroutine_enter(next->co);
>
>     Qemu codestyle requires braces here.
>
>
> Will add in next version.
>
> Before beautifying the first commit, should I squash this commit into it?
>
> The first commit was an attempt to do the simplest possible 
> implementation,
> but since this commit looks good (with some style issues), do we 
> really need the
> first one?

I wouldn’t mind squashing.  Your choice. :)

Hanna

Re: [PATCH 3/3] qemu-img: Speed up checksum

Posted by Nir Soffer 3 years, 3 months ago

On Sun, Oct 30, 2022 at 7:38 PM Nir Soffer <nsoffer@redhat.com> wrote:

> On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz <hreitz@redhat.com> wrote:
>
>> On 01.09.22 16:32, Nir Soffer wrote:
>>
> [...]

> > +    /* The current chunk. */
>> > +    int64_t offset;
>> > +    int64_t length;
>> > +    bool zero;
>> > +
>> > +    /* Always true for zero extent, false for data extent. Set to true
>> > +     * when reading the chunk completes. */
>>
>> Qemu codestyle requires /* and */ to be on separate lines for multi-line
>> comments (see checkpatch.pl).
>>
>
> I'll change that. Do we have a good way to run checkpatch.pl when using
> git-publish?
>
> Maybe a way to run checkpatch.pl on all patches generated by git publish
> automatically?
>

I found
https://blog.vmsplice.net/2011/03/how-to-automatically-run-checkpatchpl.html
and it seems to work well.

[PATCH 1/3] qemu-img: Add checksum command
[PATCH 2/3] iotests: Test qemu-img checksum
[PATCH 3/3] qemu-img: Speed up checksum