[libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance

Claudio Fontana posted 1 patch 2 years, 1 month ago
Test syntax-check failed
Patches applied successfully
git fetch https://github.com/patchew-project/libvirt tags/patchew/20220312163001.3811-1-cfontana@suse.de
[libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
The first user is the qemu driver.

virsh save/resume would slow to a crawl with a default pipe size (64k).

This improves the situation by 400%.

Going through io_helper still seems to incur some penalty (~15%-ish)
compared with direct qemu migration to an nc socket writing to a file.

Signed-off-by: Claudio Fontana <cfontana@suse.de>
---
 src/qemu/qemu_driver.c    |  6 +++---
 src/qemu/qemu_saveimage.c | 11 ++++++-----
 src/util/virfile.c        | 12 ++++++++++++
 src/util/virfile.h        |  1 +
 4 files changed, 22 insertions(+), 8 deletions(-)

Hello, I initially thought this to be a qemu performance issue,
so you can find the discussion about this in qemu-devel:

"Re: bad virsh save /dev/null performance (600 MiB/s max)"

https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
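
For illustration only (not necessarily the exact commands used for the
numbers below), the /dev/null measurement from that thread can be
reproduced along these lines, with "mydomain" as a placeholder and the
throughput derived from the guest RAM size divided by the elapsed time:

    time virsh save mydomain /dev/null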

RFC since the idea still needs validation, and it is only lightly tested:

save     - about 400% benefit in throughput, getting around 20 Gbps to /dev/null,
           and around 13 Gbps to a ramdisk.
           By comparison, direct qemu migration to an nc socket is around 24 Gbps.

restore  - not tested, _should_ also benefit in the "bypass_cache" case
coredump - not tested, _should_ also benefit, as for save

Thanks for your comments and review,

Claudio


diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index c1b3bd8536..be248c1e92 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -3044,7 +3044,7 @@ doCoreDump(virQEMUDriver *driver,
     virFileWrapperFd *wrapperFd = NULL;
     int directFlag = 0;
     bool needUnlink = false;
-    unsigned int flags = VIR_FILE_WRAPPER_NON_BLOCKING;
+    unsigned int wrapperFlags = VIR_FILE_WRAPPER_NON_BLOCKING | VIR_FILE_WRAPPER_BIG_PIPE;
     const char *memory_dump_format = NULL;
     g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver);
     g_autoptr(virCommand) compressor = NULL;
@@ -3059,7 +3059,7 @@ doCoreDump(virQEMUDriver *driver,
 
     /* Create an empty file with appropriate ownership.  */
     if (dump_flags & VIR_DUMP_BYPASS_CACHE) {
-        flags |= VIR_FILE_WRAPPER_BYPASS_CACHE;
+        wrapperFlags |= VIR_FILE_WRAPPER_BYPASS_CACHE;
         directFlag = virFileDirectFdFlag();
         if (directFlag < 0) {
             virReportError(VIR_ERR_OPERATION_FAILED, "%s",
@@ -3072,7 +3072,7 @@ doCoreDump(virQEMUDriver *driver,
                              &needUnlink)) < 0)
         goto cleanup;
 
-    if (!(wrapperFd = virFileWrapperFdNew(&fd, path, flags)))
+    if (!(wrapperFd = virFileWrapperFdNew(&fd, path, wrapperFlags)))
         goto cleanup;
 
     if (dump_flags & VIR_DUMP_MEMORY_ONLY) {
diff --git a/src/qemu/qemu_saveimage.c b/src/qemu/qemu_saveimage.c
index c0139041eb..1b522a1542 100644
--- a/src/qemu/qemu_saveimage.c
+++ b/src/qemu/qemu_saveimage.c
@@ -267,7 +267,7 @@ qemuSaveImageCreate(virQEMUDriver *driver,
     int fd = -1;
     int directFlag = 0;
     virFileWrapperFd *wrapperFd = NULL;
-    unsigned int wrapperFlags = VIR_FILE_WRAPPER_NON_BLOCKING;
+    unsigned int wrapperFlags = VIR_FILE_WRAPPER_NON_BLOCKING | VIR_FILE_WRAPPER_BIG_PIPE;
 
     /* Obtain the file handle.  */
     if ((flags & VIR_DOMAIN_SAVE_BYPASS_CACHE)) {
@@ -463,10 +463,11 @@ qemuSaveImageOpen(virQEMUDriver *driver,
     if ((fd = qemuDomainOpenFile(cfg, NULL, path, oflags, NULL)) < 0)
         return -1;
 
-    if (bypass_cache &&
-        !(*wrapperFd = virFileWrapperFdNew(&fd, path,
-                                           VIR_FILE_WRAPPER_BYPASS_CACHE)))
-        return -1;
+    if (bypass_cache) {
+        unsigned int wrapperFlags = VIR_FILE_WRAPPER_BYPASS_CACHE | VIR_FILE_WRAPPER_BIG_PIPE;
+        if (!(*wrapperFd = virFileWrapperFdNew(&fd, path, wrapperFlags)))
+            return -1;
+    }
 
     data = g_new0(virQEMUSaveData, 1);
 
diff --git a/src/util/virfile.c b/src/util/virfile.c
index a04f888e06..fdacd17890 100644
--- a/src/util/virfile.c
+++ b/src/util/virfile.c
@@ -282,6 +282,18 @@ virFileWrapperFdNew(int *fd, const char *name, unsigned int flags)
 
     ret->cmd = virCommandNewArgList(iohelper_path, name, NULL);
 
+    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
+        /*
+         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
+         * This improves the situation by 400%, although going through io_helper still incurs
+         * in a performance penalty compared with a direct qemu migration to a socket.
+         */
+        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
+        if (rv != 0) {
+            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */
+        }
+        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
+    }
     if (output) {
         virCommandSetInputFD(ret->cmd, pipefd[0]);
         virCommandSetOutputFD(ret->cmd, fd);
diff --git a/src/util/virfile.h b/src/util/virfile.h
index b04386f6e6..8383c4b069 100644
--- a/src/util/virfile.h
+++ b/src/util/virfile.h
@@ -107,6 +107,7 @@ int virFileDirectFdFlag(void);
 typedef enum {
     VIR_FILE_WRAPPER_BYPASS_CACHE   = (1 << 0),
     VIR_FILE_WRAPPER_NON_BLOCKING   = (1 << 1),
+    VIR_FILE_WRAPPER_BIG_PIPE       = (1 << 2),
 } virFileWrapperFdFlags;
 
 virFileWrapperFd *virFileWrapperFdNew(int *fd,
-- 
2.26.2
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Michal Prívozník 2 years, 1 month ago
On 3/12/22 17:30, Claudio Fontana wrote:
> [...]

Hey, I like this idea, but a couple of points below.

> [...]
> diff --git a/src/util/virfile.c b/src/util/virfile.c
> index a04f888e06..fdacd17890 100644
> --- a/src/util/virfile.c
> +++ b/src/util/virfile.c
> @@ -282,6 +282,18 @@ virFileWrapperFdNew(int *fd, const char *name, unsigned int flags)
>  
>      ret->cmd = virCommandNewArgList(iohelper_path, name, NULL);
>  
> +    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {


I believe we don't need this flag. I mean, the plain fact that
virFileWrapper is used means that the caller wants to avoid VFS because it's
interested in speed. Therefore, this code could be done unconditionally.

> +        /*
> +         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
> +         * This improves the situation by 400%, although going through io_helper still incurs
> +         * in a performance penalty compared with a direct qemu migration to a socket.
> +         */

This belongs into the commit message. This code has no knowledge about
qemu. What you can mention here is the performance benefit.
Also, QEMU migrating straight to a socket is going to have performance
benefit but only in a few cases, because if it's a migration into a file
then VFS (and thus caching) is involved. Thus, if you migrate into a
file and have enough free RAM for caches then yes, it's going to be
faster. But if you don't have free RAM then it's going to be way slower.

> +        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
> +        if (rv != 0) {
> +            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */
> +        }
> +        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);

An alternative implementation would be to call fcntl() only if we know
we've succeeded in reading /proc/.../pipe-max-size, like this:

int pipe_sz;
int rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");

if (rv >= 0)
  fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);

(notice I've declared variables on separate lines, we like it that way)

Now, what can we do about that ternary operator? It doesn't look nice.
But I guess we can hardcode just one end of the pipe, because it's the
actual pipe and not the FD that we are modifying here.

Lastly, let's add some error checking:

int pipe_sz;
int rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");

if (rv == -2) {
  /* file doesn't exist */
} else if (rv < 0) {
  /* Unable to read/parse file */
  goto error;
} else {
  if (fcntl(pipefd[0], F_SETPIPE_SZ, pipe_sz) < 0) {
    virReportSystemError(errno, _("Unable to set pipe size to %d"), pipe_sz);
    goto error;
  }
}


Even better, move this fcntl() or even the whole code into a separate
function (virPipe() lives under src/util/virutil.c) so that it can be
reused.
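
Just as an illustration, such a helper could look something like this
(the name virPipeSetBestSize and the warn-and-continue error handling
are only placeholders, nothing decided here):

static void
virPipeSetBestSize(int pipefd G_GNUC_UNUSED)
{
#ifdef __linux__
    int pipe_sz;

    /* Grow the pipe towards the kernel-configured maximum; failing to
     * read the limit or to resize the pipe is not fatal, the default
     * pipe size is simply kept. */
    if (virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size") < 0)
        return;

    if (fcntl(pipefd, F_SETPIPE_SZ, pipe_sz) < 0)
        VIR_WARN("Unable to set pipe size to %d", pipe_sz);
#endif /* __linux__ */
}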

Michal
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
Hello Michal,

thanks for your answer,

On 3/14/22 12:36 PM, Michal Prívozník wrote:
> On 3/12/22 17:30, Claudio Fontana wrote:
>> [...]
>> +    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
> 
> 
> I believe we don't need this flag. I mean, the plain fact that
> virFileWrapper is used means that caller wants to avoid VFS because it's
> interested in speed. Therefore, this code could be done unconditionally.


Right, I see now that this is called only by the qemu driver for those specific operations.


> 
>> +        /*
>> +         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
>> +         * This improves the situation by 400%, although going through io_helper still incurs
>> +         * in a performance penalty compared with a direct qemu migration to a socket.
>> +         */
> 
> This belongs into the commit message. This code has no knowledge about
> qemu. What you can mention here is the performance benefit.


OK. Just FTR, this code seems to be used by qemu only, but I see how this might change in the future.


> Also, QEMU migrating straight to a socket is going to have performance
> benefit but only in a few cases, because if it's a migration into a file
> then VFS (and thus caching) is involved. Thus, if you migrate into a
> file and have enough free RAM for caches then yes, it's going to be
> faster. But if you don't have free RAM then it's going to be way slower.


Yes, this is from a very specific requirement from the field.


> 
>> +        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
>> +        if (rv != 0) {
>> +            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */
>> +        }
>> +        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
> 
> Alternative implementation would be to call fcntl() only if we know
> we've succeeded in reading /proc/.../pipe-max-size, like this:

My implementation fails gracefully, i.e. if for any reason we cannot read pipe-max-size,
we could still succeed (maybe with a warning) with a reasonable default.

No need to fail hard on this.
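
Something along these lines (sketch only, the warning texts and the
1 MiB fallback value are just examples):

    int pipe_sz;

    if (virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size") < 0) {
        VIR_WARN("Unable to read pipe-max-size, falling back to 1 MiB");
        pipe_sz = 1024 * 1024;
    }
    if (fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz) < 0)
        VIR_WARN("Unable to set pipe size to %d, keeping the default", pipe_sz);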

> [...]
> Even better, move this fcntl() or even the whole code into a separate
> function (virPipe() lives under src/util/virutil.c) so that it can be
> reused.
> 
> Michal
> 

Hmm, I doubt you want this for _every_ pipe, only for the ones you need a wrapper for, as per your initial comment above?

Thanks,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Ani Sinha 2 years, 1 month ago
On Mon, Mar 14, 2022 at 5:06 PM Michal Prívozník <mprivozn@redhat.com> wrote:
>
> [...]
> Lastly, let's add some error checking:
>
> int pipe_sz;
> int rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
>
> if (rv == -2) {
>   /* file doesn't exist */

Yes, here we need to distinguish between an error on Linux vs. a legitimate
failure on non-Linux systems.
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Ani Sinha 2 years, 1 month ago
>
> diff --git a/src/util/virfile.c b/src/util/virfile.c
> index a04f888e06..fdacd17890 100644
> --- a/src/util/virfile.c
> +++ b/src/util/virfile.c
> @@ -282,6 +282,18 @@ virFileWrapperFdNew(int *fd, const char *name, unsigned int flags)
>
>      ret->cmd = virCommandNewArgList(iohelper_path, name, NULL);
>
> +    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
> +        /*
> +         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
> +         * This improves the situation by 400%, although going through io_helper still incurs
> +         * in a performance penalty compared with a direct qemu migration to a socket.
> +         */
> +        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
> +        if (rv != 0) {
> +            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */
> +        }
> +        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
> +    }

I believe this entire hunk of code should be guarded by #ifdef __linux__;
non-Windows does not necessarily mean Linux.
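
Roughly like this (illustrative only, not the posted patch):

#ifdef __linux__
    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
        int pipe_sz;

        /* F_SETPIPE_SZ and /proc/sys/fs/pipe-max-size are Linux-specific */
        if (virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size") != 0)
            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */

        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
    }
#endif /* __linux__ */
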
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Daniel P. Berrangé 2 years, 1 month ago
On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
> [...]
> diff --git a/src/util/virfile.c b/src/util/virfile.c
> index a04f888e06..fdacd17890 100644
> --- a/src/util/virfile.c
> +++ b/src/util/virfile.c
> @@ -282,6 +282,18 @@ virFileWrapperFdNew(int *fd, const char *name, unsigned int flags)
>  
>      ret->cmd = virCommandNewArgList(iohelper_path, name, NULL);
>  
> +    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
> +        /*
> +         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
> +         * This improves the situation by 400%, although going through io_helper still incurs
> +         * in a performance penalty compared with a direct qemu migration to a socket.
> +         */
> +        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");

This is fine as an experiment but I don't think it is that safe
to use in the real world. There could be a variety of reasons why
an admin might enlarge this value, and we shouldn't assume the max
size is sensible for libvirt/QEMU to use.

I very much suspect there are diminishing returns here in terms
of buffer sizes.

64k is obviously too small, but 1 MB may be sufficiently large
that the bottleneck is then elsewhere in our code. IOW, if the
pipe max size is 100 MB, we shouldn't blindly use it. Can you
do a few tests with varying sizes to see where a sensible
tradeoff falls?

> +        if (rv != 0) {
> +            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */
> +        }
> +        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
> +    }

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>> [...]
>> +    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
>> +        /*
>> +         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
>> +         * This improves the situation by 400%, although going through io_helper still incurs
>> +         * in a performance penalty compared with a direct qemu migration to a socket.
>> +         */
>> +        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
> 
> This is fine as an experiment but I don't think it is that safe
> to use in the real world. There could be a variety of reasons why
> an admin can enlarge this value, and we shouldn't assume the max
> size is sensible for libvirt/QEMU to use.
> 
> I very much suspect there are diminishing returns here in terms
> of buffer sizes.
> 
> 64k is obvious too small, but 1 MB, may be sufficiently large
> that the bottleneck is then elsewhere in our code. IOW, If the
> pipe max size is 100 MB, we shouldn't blindly use it. Can you
> do a few tests with varying sizes to see where a sensible
> tradeoff falls ?


Hi Daniel,

this is a very good point. Actually I see sharply diminishing returns after the default pipe-max-size (1MB).

The idea was that, beyond allowing a larger size, the admin could have set a _smaller_ pipe-max-size,
and we want to honor that in that case; otherwise an attempt to use 1MB would result in EPERM if the process does not have CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
I am not sure whether, when used with Kubevirt for example, CAP_SYS_RESOURCE or CAP_SYS_ADMIN would be available...?

So maybe one idea could be to use the minimum between /proc/sys/fs/pipe-max-size and, for example, 1MB, but I will do more testing to see where the actual break point is.
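
As a rough sketch of that clamping idea (the 1 MiB cap is only the example
value from this discussion, not a measured optimum):

    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
        int pipe_sz;

        /* Asking for more than pipe-max-size fails with EPERM without
         * CAP_SYS_RESOURCE, so stay within the admin-configured maximum,
         * and do not grab more than the (example) 1 MiB cap either. */
        if (virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size") != 0)
            pipe_sz = 1024 * 1024;
        pipe_sz = MIN(pipe_sz, 1024 * 1024);

        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
    }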

Wdyt?

Thanks!

Claudio


> 
>> +        if (rv != 0) {
>> +            pipe_sz = 1024 * 1024; /* common default for pipe-max-size */
>> +        }
>> +        fcntl(pipefd[output ? 0 : 1], F_SETPIPE_SZ, pipe_sz);
>> +    }
> 
> Regards,
> Daniel
> 
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Daniel P. Berrangé 2 years, 1 month ago
On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
> [...]
> this is a very good point. Actually I see very diminishing returns after the default pipe-max-size (1MB).
> 
> The idea was that beyond allowing larger size, the admin could have set a _smaller_ pipe-max-size,
> so we want to use that in that case, otherwise an attempt to use 1MB would result in EPERM, if the process does not have CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
> I am not sure if used with Kubevirt, for example, CAP_SYS_RESOURCE or CAP_SYS_ADMIN would be available...?
> 
> So maybe one idea could be to use the minimum between /proc/sys/fs/pipe-max-size and for example 1MB, but will do more testing to see where the actual break point is.

That's reasonable.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>> [...]
>> So maybe one idea could be to use the minimum between /proc/sys/fs/pipe-max-size and for example 1MB, but will do more testing to see where the actual break point is.
> 
> That's reasonable.
> 

Just as an update: I am still running tests with various combinations and larger VMs (to RAM, to slow disk, and now to NVMe).

No clear winner yet. There seems to be a significant benefit already when going from 1MB (my previous default) to 2MB,
but anything more than 16MB does not seem to improve things at all.

But I just need to do more testing, more runs.

Thanks,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/16/22 1:17 PM, Claudio Fontana wrote:
> [...]
>>>>>  
>>>>> +    if (flags & VIR_FILE_WRAPPER_BIG_PIPE) {
>>>>> +        /*
>>>>> +         * virsh save/resume would slow to a crawl with a default pipe size (usually 64k).
>>>>> +         * This improves the situation by 400%, although going through io_helper still incurs
>>>>> +         * in a performance penalty compared with a direct qemu migration to a socket.
>>>>> +         */
>>>>> +        int pipe_sz, rv = virFileReadValueInt(&pipe_sz, "/proc/sys/fs/pipe-max-size");
>>>>
>>>> This is fine as an experiment but I don't think it is that safe
>>>> to use in the real world. There could be a variety of reasons why
>>>> an admin can enlarge this value, and we shouldn't assume the max
>>>> size is sensible for libvirt/QEMU to use.
>>>>
>>>> I very much suspect there are diminishing returns here in terms
>>>> of buffer sizes.
>>>>
>>>> 64k is obvious too small, but 1 MB, may be sufficiently large
>>>> that the bottleneck is then elsewhere in our code. IOW, If the
>>>> pipe max size is 100 MB, we shouldn't blindly use it. Can you
>>>> do a few tests with varying sizes to see where a sensible
>>>> tradeoff falls ?
>>>
>>>
>>> Hi Daniel,
>>>
>>> this is a very good point. Actually I see very diminishing returns after the default pipe-max-size (1MB).
>>>
>>> The idea was that beyond allowing larger size, the admin could have set a _smaller_ pipe-max-size,
>>> so we want to use that in that case, otherwise an attempt to use 1MB would result in EPERM, if the process does not have CAP_SYS_RESOURCE or CAP_SYS_ADMIN.
>>> I am not sure if used with Kubevirt, for example, CAP_SYS_RESOURCE or CAP_SYS_ADMIN would be available...?
>>>
>>> So maybe one idea could be to use the minimum between /proc/sys/fs/pipe-max-size and for example 1MB, but will do more testing to see where the actual break point is.
>>
>> That's reasonable.
>>
> 
> Just as an update: still running tests with various combinations, and larger VMs (to RAM, to slow disk, and now to nvme).
> 
> For now no clear winner yet. There seems to be a significant benefit already going from 1MB (my previous default) to 2MB,
> but anything more than 16MB seems to not improve anything at all.
> 
> But I just need to do more testing, more runs.
> 
> Thanks,
> 
> Claudio
> 

Current results show the experimental average maximum throughput when migrating to /dev/null, for each FdWrapper pipe size (as reported by QEMU QMP "query-migrate"; tests repeated 5 times for each size).
VM size is 60G, with most of the memory effectively touched before migration through a user application that allocates and touches all memory with pseudorandom data.
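
(A trivial sketch, for illustration only, of the kind of memory-touching guest workload described above; this is not the actual tool used in these tests, and the function name is made up:)

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Illustrative only: fill a large allocation with pseudorandom data so
 * that every page is dirty and the migration stream cannot benefit
 * from zero-page optimizations. */
static void touch_memory(size_t bytes)
{
    uint64_t *buf = malloc(bytes);
    uint64_t x = 0x9E3779B97F4A7C15ULL;  /* arbitrary non-zero seed */
    size_t i;

    if (!buf)
        return;
    for (i = 0; i < bytes / sizeof(*buf); i++) {
        x ^= x << 13;   /* xorshift64 step */
        x ^= x >> 7;
        x ^= x << 17;
        buf[i] = x;
    }
    /* the buffer is intentionally kept resident for the whole test */
}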

64K:     5200 Mbps (current situation)
128K:    5800 Mbps
256K:   20900 Mbps
512K:   21600 Mbps
1M:     22800 Mbps
2M:     22800 Mbps
4M:     22400 Mbps
8M:     22500 Mbps
16M:    22800 Mbps
32M:    22900 Mbps
64M:    22900 Mbps
128M:   22800 Mbps

The above is the throughput out of the patched libvirt with multiple pipe sizes for the FdWrapper.

As for the theoretical limit of the libvirt architecture,
I ran a qemu migration directly, issuing the appropriate QMP commands, setting the same migration parameters as libvirt does, and then migrating to a socket netcatted to /dev/null via
{"execute": "migrate", "arguments": { "uri": "unix:///tmp/netcat.sock" } }:

QMP:    37000 Mbps

---

So although the pipe size improves things (in particular, the large jump happens at the 256K size, and 1M seems a very good value),
there is still a second bottleneck somewhere that accounts for a loss of ~14200 Mbps in throughput.

Thanks,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Daniel P. Berrangé 2 years, 1 month ago
On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
> On 3/16/22 1:17 PM, Claudio Fontana wrote:
> > On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
> >> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
> >>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
> >>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
> >>>>> the first user is the qemu driver,
> >>>>>
> >>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
> >>>>>
> >>>>> This improves the situation by 400%.
> >>>>>
> >>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
> >>>>> compared with direct qemu migration to a nc socket to a file.
> >>>>>
> >>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
> >>>>> ---
> >>>>>  src/qemu/qemu_driver.c    |  6 +++---
> >>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
> >>>>>  src/util/virfile.c        | 12 ++++++++++++
> >>>>>  src/util/virfile.h        |  1 +
> >>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
> >>>>>
> >>>>> Hello, I initially thought this to be a qemu performance issue,
> >>>>> so you can find the discussion about this in qemu-devel:
> >>>>>
> >>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
> >>>>>
> >>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html


> Current results show these experimental averages maximum throughput
> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
> "query-migrate", tests repeated 5 times for each).
> VM Size is 60G, most of the memory effectively touched before migration,
> through user application allocating and touching all memory with
> pseudorandom data.
> 
> 64K:     5200 Mbps (current situation)
> 128K:    5800 Mbps
> 256K:   20900 Mbps
> 512K:   21600 Mbps
> 1M:     22800 Mbps
> 2M:     22800 Mbps
> 4M:     22400 Mbps
> 8M:     22500 Mbps
> 16M:    22800 Mbps
> 32M:    22900 Mbps
> 64M:    22900 Mbps
> 128M:   22800 Mbps
> 
> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.

Ok, it's bouncing around with noise after 1 MB. So I'd suggest that
libvirt attempt to raise the pipe limit to 1 MB by default, but
not try to go higher.
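
A minimal sketch of what that could look like (illustrative only, not the actual libvirt code; the helper name is made up), assuming fd is the wrapper's pipe:

#define _GNU_SOURCE
#include <fcntl.h>

/* Illustrative only: try to grow the pipe buffer to 1 MiB and silently
 * keep the kernel default if that is not allowed (e.g. EPERM when
 * fs.pipe-max-size is lower and the process lacks CAP_SYS_RESOURCE). */
static void try_big_pipe(int fd)
{
    const int one_mib = 1024 * 1024;

    if (fcntl(fd, F_GETPIPE_SZ) >= one_mib)
        return;                          /* already large enough */

    /* F_SETPIPE_SZ rounds up to a power of two; failure is non-fatal */
    (void) fcntl(fd, F_SETPIPE_SZ, one_mib);
}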

> As for the theoretical limit for the libvirt architecture,
> I ran a qemu migration directly issuing the appropriate QMP
> commands, setting the same migration parameters as per libvirt,
> and then migrating to a socket netcatted to /dev/null via
> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
> 
> QMP:    37000 Mbps

> So although the Pipe size improves things (in particular the
> large jump is for the 256K size, although 1M seems a very good value),
> there is still a second bottleneck in there somewhere that
> accounts for a loss of ~14200 Mbps in throughput.

In the above tests with libvirt, were you using the
--bypass-cache flag or not ?

Hopefully use of O_DIRECT doesn't make a difference for
/dev/null, since the I/O is being immediately thrown
away and so ought to never go into I/O cache. 

In terms of the comparison, we still have libvirt iohelper
giving QEMU a pipe, while your test above gives QEMU a
UNIX socket.

So I still wonder if the delta is caused by the pipe vs socket
difference, as opposed to netcat vs libvirt iohelper code.

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>> the first user is the qemu driver,
>>>>>>>
>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>
>>>>>>> This improves the situation by 400%.
>>>>>>>
>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>
>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>> ---
>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>
>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>
>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>
>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
> 
> 
>> Current results show these experimental averages maximum throughput
>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>> "query-migrate", tests repeated 5 times for each).
>> VM Size is 60G, most of the memory effectively touched before migration,
>> through user application allocating and touching all memory with
>> pseudorandom data.
>>
>> 64K:     5200 Mbps (current situation)
>> 128K:    5800 Mbps
>> 256K:   20900 Mbps
>> 512K:   21600 Mbps
>> 1M:     22800 Mbps
>> 2M:     22800 Mbps
>> 4M:     22400 Mbps
>> 8M:     22500 Mbps
>> 16M:    22800 Mbps
>> 32M:    22900 Mbps
>> 64M:    22900 Mbps
>> 128M:   22800 Mbps
>>
>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
> 
> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
> libvirt attempt to raise the pipe limit to 1 MB by default, but
> not try to go higher.
> 
>> As for the theoretical limit for the libvirt architecture,
>> I ran a qemu migration directly issuing the appropriate QMP
>> commands, setting the same migration parameters as per libvirt,
>> and then migrating to a socket netcatted to /dev/null via
>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>
>> QMP:    37000 Mbps
> 
>> So although the Pipe size improves things (in particular the
>> large jump is for the 256K size, although 1M seems a very good value),
>> there is still a second bottleneck in there somewhere that
>> accounts for a loss of ~14200 Mbps in throughput.
> 
> In the above tests with libvirt, were you using the
> --bypass-cache flag or not ?

No, I did not. Tests with a ramdisk did not show a notable difference for me,
but tests with /dev/null were not possible, since the command line is not accepted:

# virsh save centos7 /dev/null
Domain 'centos7' saved to /dev/null
[OK]

# virsh save centos7 /dev/null --bypass-cache
error: Failed to save domain 'centos7' to /dev/null
error: Failed to create file '/dev/null': Invalid argument
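
(The EINVAL is most likely the null character device rejecting O_DIRECT at open time; a standalone reproducer, shown only as an illustration:)

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Illustration: on Linux, /dev/null has no direct-I/O support, so opening
 * it with O_DIRECT fails with EINVAL, which is why --bypass-cache cannot
 * target /dev/null. */
int main(void)
{
    int fd = open("/dev/null", O_WRONLY | O_DIRECT);

    if (fd < 0)
        perror("open(/dev/null, O_DIRECT)");  /* expected: Invalid argument */
    else
        close(fd);
    return 0;
}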


> 
> Hopefully use of O_DIRECT doesn't make a difference for
> /dev/null, since the I/O is being immediately thrown
> away and so ought to never go into I/O cache. 
> 
> In terms of the comparison, we still have libvirt iohelper
> giving QEMU a pipe, while your test above gives QEMU a
> UNIX socket.
> 
> So I still wonder if the delta is caused by the pipe vs socket
> difference, as opposed to netcat vs libvirt iohelper code.

I'll look into this aspect, thanks!
> 
> With regards,
> Daniel
> 

Ciao,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/17/22 2:41 PM, Claudio Fontana wrote:
> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>> the first user is the qemu driver,
>>>>>>>>
>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>
>>>>>>>> This improves the situation by 400%.
>>>>>>>>
>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>
>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>> ---
>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>
>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>
>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>
>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>
>>
>>> Current results show these experimental averages maximum throughput
>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>> "query-migrate", tests repeated 5 times for each).
>>> VM Size is 60G, most of the memory effectively touched before migration,
>>> through user application allocating and touching all memory with
>>> pseudorandom data.
>>>
>>> 64K:     5200 Mbps (current situation)
>>> 128K:    5800 Mbps
>>> 256K:   20900 Mbps
>>> 512K:   21600 Mbps
>>> 1M:     22800 Mbps
>>> 2M:     22800 Mbps
>>> 4M:     22400 Mbps
>>> 8M:     22500 Mbps
>>> 16M:    22800 Mbps
>>> 32M:    22900 Mbps
>>> 64M:    22900 Mbps
>>> 128M:   22800 Mbps
>>>
>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>
>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>> not try to go higher.
>>
>>> As for the theoretical limit for the libvirt architecture,
>>> I ran a qemu migration directly issuing the appropriate QMP
>>> commands, setting the same migration parameters as per libvirt,
>>> and then migrating to a socket netcatted to /dev/null via
>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>
>>> QMP:    37000 Mbps
>>
>>> So although the Pipe size improves things (in particular the
>>> large jump is for the 256K size, although 1M seems a very good value),
>>> there is still a second bottleneck in there somewhere that
>>> accounts for a loss of ~14200 Mbps in throughput.


Interesting addition: I tested quickly on a system with faster CPUs and larger VM sizes, up to 200GB,
and the difference in throughput between libvirt and qemu is basically the same, ~14500 Mbps:

~50000 Mbps qemu to netcat socket to /dev/null
~35500 Mbps virsh save to /dev/null

So the delta does not seem to be proportional to CPU speed (not a totally fair comparison, though, since the VM sizes are different).

Ciao,

C

>>
>> In the above tests with libvirt, were you using the
>> --bypass-cache flag or not ?
> 
> No, I do not. Tests with ramdisk did not show a notable difference for me,
> 
> but tests with /dev/null were not possible, since the command line is not accepted:
> 
> # virsh save centos7 /dev/null
> Domain 'centos7' saved to /dev/null
> [OK]
> 
> # virsh save centos7 /dev/null --bypass-cache
> error: Failed to save domain 'centos7' to /dev/null
> error: Failed to create file '/dev/null': Invalid argument
> 
> 
>>
>> Hopefully use of O_DIRECT doesn't make a difference for
>> /dev/null, since the I/O is being immediately thrown
>> away and so ought to never go into I/O cache. 
>>
>> In terms of the comparison, we still have libvirt iohelper
>> giving QEMU a pipe, while your test above gives QEMU a
>> UNIX socket.
>>
>> So I still wonder if the delta is caused by the pipe vs socket
>> difference, as opposed to netcat vs libvirt iohelper code.
> 
> I'll look into this aspect, thanks!
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Dr. David Alan Gilbert 2 years, 1 month ago
* Claudio Fontana (cfontana@suse.de) wrote:
> On 3/17/22 2:41 PM, Claudio Fontana wrote:
> > On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
> >> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
> >>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
> >>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
> >>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
> >>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
> >>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
> >>>>>>>> the first user is the qemu driver,
> >>>>>>>>
> >>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
> >>>>>>>>
> >>>>>>>> This improves the situation by 400%.
> >>>>>>>>
> >>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
> >>>>>>>> compared with direct qemu migration to a nc socket to a file.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
> >>>>>>>> ---
> >>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
> >>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
> >>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
> >>>>>>>>  src/util/virfile.h        |  1 +
> >>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
> >>>>>>>>
> >>>>>>>> Hello, I initially thought this to be a qemu performance issue,
> >>>>>>>> so you can find the discussion about this in qemu-devel:
> >>>>>>>>
> >>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
> >>>>>>>>
> >>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
> >>
> >>
> >>> Current results show these experimental averages maximum throughput
> >>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
> >>> "query-migrate", tests repeated 5 times for each).
> >>> VM Size is 60G, most of the memory effectively touched before migration,
> >>> through user application allocating and touching all memory with
> >>> pseudorandom data.
> >>>
> >>> 64K:     5200 Mbps (current situation)
> >>> 128K:    5800 Mbps
> >>> 256K:   20900 Mbps
> >>> 512K:   21600 Mbps
> >>> 1M:     22800 Mbps
> >>> 2M:     22800 Mbps
> >>> 4M:     22400 Mbps
> >>> 8M:     22500 Mbps
> >>> 16M:    22800 Mbps
> >>> 32M:    22900 Mbps
> >>> 64M:    22900 Mbps
> >>> 128M:   22800 Mbps
> >>>
> >>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
> >>
> >> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
> >> libvirt attempt to raise the pipe limit to 1 MB by default, but
> >> not try to go higher.
> >>
> >>> As for the theoretical limit for the libvirt architecture,
> >>> I ran a qemu migration directly issuing the appropriate QMP
> >>> commands, setting the same migration parameters as per libvirt,
> >>> and then migrating to a socket netcatted to /dev/null via
> >>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
> >>>
> >>> QMP:    37000 Mbps
> >>
> >>> So although the Pipe size improves things (in particular the
> >>> large jump is for the 256K size, although 1M seems a very good value),
> >>> there is still a second bottleneck in there somewhere that
> >>> accounts for a loss of ~14200 Mbps in throughput.
> 
> 
> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
> 
> ~50000 mbps qemu to netcat socket to /dev/null
> ~35500 mbps virsh save to /dev/null
> 
> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).

It might be closer to being RAM- or cache-bandwidth limited though, given the extra copy.

Dave

> Ciao,
> 
> C
> 
> >>
> >> In the above tests with libvirt, were you using the
> >> --bypass-cache flag or not ?
> > 
> > No, I do not. Tests with ramdisk did not show a notable difference for me,
> > 
> > but tests with /dev/null were not possible, since the command line is not accepted:
> > 
> > # virsh save centos7 /dev/null
> > Domain 'centos7' saved to /dev/null
> > [OK]
> > 
> > # virsh save centos7 /dev/null --bypass-cache
> > error: Failed to save domain 'centos7' to /dev/null
> > error: Failed to create file '/dev/null': Invalid argument
> > 
> > 
> >>
> >> Hopefully use of O_DIRECT doesn't make a difference for
> >> /dev/null, since the I/O is being immediately thrown
> >> away and so ought to never go into I/O cache. 
> >>
> >> In terms of the comparison, we still have libvirt iohelper
> >> giving QEMU a pipe, while your test above gives QEMU a
> >> UNIX socket.
> >>
> >> So I still wonder if the delta is caused by the pipe vs socket
> >> difference, as opposed to netcat vs libvirt iohelper code.
> > 
> > I'll look into this aspect, thanks!
> 
-- 
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
> * Claudio Fontana (cfontana@suse.de) wrote:
>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>
>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>
>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>
>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>> ---
>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>
>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>
>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>
>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>
>>>>
>>>>> Current results show these experimental averages maximum throughput
>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>> "query-migrate", tests repeated 5 times for each).
>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>> through user application allocating and touching all memory with
>>>>> pseudorandom data.
>>>>>
>>>>> 64K:     5200 Mbps (current situation)
>>>>> 128K:    5800 Mbps
>>>>> 256K:   20900 Mbps
>>>>> 512K:   21600 Mbps
>>>>> 1M:     22800 Mbps
>>>>> 2M:     22800 Mbps
>>>>> 4M:     22400 Mbps
>>>>> 8M:     22500 Mbps
>>>>> 16M:    22800 Mbps
>>>>> 32M:    22900 Mbps
>>>>> 64M:    22900 Mbps
>>>>> 128M:   22800 Mbps
>>>>>
>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>
>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>> not try to go higher.
>>>>
>>>>> As for the theoretical limit for the libvirt architecture,
>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>> commands, setting the same migration parameters as per libvirt,
>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>
>>>>> QMP:    37000 Mbps
>>>>
>>>>> So although the Pipe size improves things (in particular the
>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>> there is still a second bottleneck in there somewhere that
>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>
>>
>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>
>> ~50000 mbps qemu to netcat socket to /dev/null
>> ~35500 mbps virsh save to /dev/null
>>
>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
> 
> It might be closer to RAM or cache bandwidth limited though; for an extra copy.

I was thinking about sendfile(2) in iohelper, but that probably can't work, as the input fd is a socket; I am getting EINVAL.

One thing that I noticed is:

commit afe6e58aedcd5e27ea16184fed90b338569bd042
Author: Jiri Denemark <jdenemar@redhat.com>
Date:   Mon Feb 6 14:40:48 2012 +0100

    util: Generalize virFileDirectFd
    
    virFileDirectFd was used for accessing files opened with O_DIRECT using
    libvirt_iohelper. We will want to use the helper for accessing files
    regardless on O_DIRECT and thus virFileDirectFd was generalized and
    renamed to virFileWrapperFd.


And in particular the comment in src/util/virFile.c:

    /* XXX support posix_fadvise rather than O_DIRECT, if the kernel support
     * for that is decent enough. In that case, we will also need to
     * explicitly support VIR_FILE_WRAPPER_NON_BLOCKING since
     * VIR_FILE_WRAPPER_BYPASS_CACHE alone will no longer require spawning
     * iohelper.
     */

by Jiri Denemark.

I have lots of questions here, and I tried to involve Jiri and Andrea Righi, who a long time ago proposed a POSIX_FADV_NOREUSE implementation.

1) What is the reason iohelper was introduced?

2) Was Jiri's comment about the missing Linux implementation of POSIX_FADV_NOREUSE?

3) If using O_DIRECT is the only reason for iohelper to exist (...?), would replacing it with posix_fadvise remove the need for iohelper? (see the sketch below)

4) What has stopped Andrea's (or another) POSIX_FADV_NOREUSE implementation from landing in the kernel?

Lots of questions..
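
To make question 3 concrete, a rough sketch of the posix_fadvise idea hinted at in that comment (illustrative only, not a proposed replacement; the function name is made up):

#include <fcntl.h>
#include <unistd.h>

/* Rough sketch: write through the page cache (no O_DIRECT), then advise
 * the kernel that the written range will not be reused. POSIX_FADV_NOREUSE
 * has historically been a no-op on Linux (presumably what questions 2 and 4
 * are about), so POSIX_FADV_DONTNEED is the usual workaround. */
static void drop_written_range(int fd, off_t start, off_t len)
{
    if (fdatasync(fd) == 0)   /* flush first so DONTNEED can evict the pages */
        (void) posix_fadvise(fd, start, len, POSIX_FADV_DONTNEED);
}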

Thanks for all your insight,

Claudio

> 
> Dave
> 
>> Ciao,
>>
>> C
>>
>>>>
>>>> In the above tests with libvirt, were you using the
>>>> --bypass-cache flag or not ?
>>>
>>> No, I do not. Tests with ramdisk did not show a notable difference for me,
>>>
>>> but tests with /dev/null were not possible, since the command line is not accepted:
>>>
>>> # virsh save centos7 /dev/null
>>> Domain 'centos7' saved to /dev/null
>>> [OK]
>>>
>>> # virsh save centos7 /dev/null --bypass-cache
>>> error: Failed to save domain 'centos7' to /dev/null
>>> error: Failed to create file '/dev/null': Invalid argument
>>>
>>>
>>>>
>>>> Hopefully use of O_DIRECT doesn't make a difference for
>>>> /dev/null, since the I/O is being immediately thrown
>>>> away and so ought to never go into I/O cache. 
>>>>
>>>> In terms of the comparison, we still have libvirt iohelper
>>>> giving QEMU a pipe, while your test above gives QEMU a
>>>> UNIX socket.
>>>>
>>>> So I still wonder if the delta is caused by the pipe vs socket
>>>> difference, as opposed to netcat vs libvirt iohelper code.
>>>
>>> I'll look into this aspect, thanks!
>>
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Daniel P. Berrangé 2 years, 1 month ago
On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
> > * Claudio Fontana (cfontana@suse.de) wrote:
> >> On 3/17/22 2:41 PM, Claudio Fontana wrote:
> >>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
> >>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
> >>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
> >>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
> >>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
> >>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
> >>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
> >>>>>>>>>> the first user is the qemu driver,
> >>>>>>>>>>
> >>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
> >>>>>>>>>>
> >>>>>>>>>> This improves the situation by 400%.
> >>>>>>>>>>
> >>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
> >>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
> >>>>>>>>>> ---
> >>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
> >>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
> >>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
> >>>>>>>>>>  src/util/virfile.h        |  1 +
> >>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
> >>>>>>>>>> so you can find the discussion about this in qemu-devel:
> >>>>>>>>>>
> >>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
> >>>>>>>>>>
> >>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
> >>>>
> >>>>
> >>>>> Current results show these experimental averages maximum throughput
> >>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
> >>>>> "query-migrate", tests repeated 5 times for each).
> >>>>> VM Size is 60G, most of the memory effectively touched before migration,
> >>>>> through user application allocating and touching all memory with
> >>>>> pseudorandom data.
> >>>>>
> >>>>> 64K:     5200 Mbps (current situation)
> >>>>> 128K:    5800 Mbps
> >>>>> 256K:   20900 Mbps
> >>>>> 512K:   21600 Mbps
> >>>>> 1M:     22800 Mbps
> >>>>> 2M:     22800 Mbps
> >>>>> 4M:     22400 Mbps
> >>>>> 8M:     22500 Mbps
> >>>>> 16M:    22800 Mbps
> >>>>> 32M:    22900 Mbps
> >>>>> 64M:    22900 Mbps
> >>>>> 128M:   22800 Mbps
> >>>>>
> >>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
> >>>>
> >>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
> >>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
> >>>> not try to go higher.
> >>>>
> >>>>> As for the theoretical limit for the libvirt architecture,
> >>>>> I ran a qemu migration directly issuing the appropriate QMP
> >>>>> commands, setting the same migration parameters as per libvirt,
> >>>>> and then migrating to a socket netcatted to /dev/null via
> >>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
> >>>>>
> >>>>> QMP:    37000 Mbps
> >>>>
> >>>>> So although the Pipe size improves things (in particular the
> >>>>> large jump is for the 256K size, although 1M seems a very good value),
> >>>>> there is still a second bottleneck in there somewhere that
> >>>>> accounts for a loss of ~14200 Mbps in throughput.
> >>
> >>
> >> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
> >> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
> >>
> >> ~50000 mbps qemu to netcat socket to /dev/null
> >> ~35500 mbps virsh save to /dev/null
> >>
> >> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
> > 
> > It might be closer to RAM or cache bandwidth limited though; for an extra copy.
> 
> I was thinking about sendfile(2) in iohelper, but that probably
> can't work as the input fd is a socket, I am getting EINVAL.

Yep, sendfile() requires the input to be an mmapable FD,
and the output to be a socket.

Try splice() instead, which merely requires one end to be a
pipe; the other end can be any FD afaik.

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>
>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>
>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>
>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>> ---
>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>
>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>
>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>
>>>>>>
>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>> through user application allocating and touching all memory with
>>>>>>> pseudorandom data.
>>>>>>>
>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>> 128K:    5800 Mbps
>>>>>>> 256K:   20900 Mbps
>>>>>>> 512K:   21600 Mbps
>>>>>>> 1M:     22800 Mbps
>>>>>>> 2M:     22800 Mbps
>>>>>>> 4M:     22400 Mbps
>>>>>>> 8M:     22500 Mbps
>>>>>>> 16M:    22800 Mbps
>>>>>>> 32M:    22900 Mbps
>>>>>>> 64M:    22900 Mbps
>>>>>>> 128M:   22800 Mbps
>>>>>>>
>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>
>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>> not try to go higher.
>>>>>>
>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>
>>>>>>> QMP:    37000 Mbps
>>>>>>
>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>
>>>>
>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>
>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>> ~35500 mbps virsh save to /dev/null
>>>>
>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>
>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>
>> I was thinking about sendfile(2) in iohelper, but that probably
>> can't work as the input fd is a socket, I am getting EINVAL.
> 
> Yep, sendfile() requires the input to be a mmapable FD,
> and the output to be a socket.
> 
> Try splice() instead  which merely requires 1 end to be a
> pipe, and the other end can be any FD afaik.
> 
> With regards,
> Daniel
> 

I did try splice(), but performance is worse by around 500%.

It also fails with EINVAL when trying to use it in combination with O_DIRECT.

Tried larger and smaller buffers, and flags like SPLICE_F_MORE and SPLICE_F_MOVE in any combination; no change, just awful performance.

Here is the code:

#ifdef __linux__
+static ssize_t safesplice(int fdin, int fdout, size_t todo)
+{
+    unsigned int flags = SPLICE_F_MOVE | SPLICE_F_MORE;
+    ssize_t ncopied = 0;
+
+    while (todo > 0) {
+        ssize_t r = splice(fdin, NULL, fdout, NULL, todo, flags);
+        if (r < 0 && errno == EINTR)
+            continue;
+        if (r < 0)
+            return r;
+        if (r == 0)
+            return ncopied;
+        todo -= r;
+        ncopied += r;
+    }
+    return ncopied;
+}
+
+static ssize_t runIOCopy(const struct runIOParams p)
+{
+    size_t len = 1024 * 1024;
+    ssize_t total = 0;
+
+    while (1) {
+        ssize_t got = safesplice(p.fdin, p.fdout, len);
+        if (got < 0)
+            return -1;
+        if (got == 0)
+            break;
+
+        total += got;
+
+        /* handle last write truncate in direct case */
+        if (got < len && p.isDirect && p.isWrite && !p.isBlockDev) {
+            if (ftruncate(p.fdout, total) < 0) {
+                return -4;
+            }
+            break;
+        }
+    }
+    return total;
+}
+
+#endif


Any ideas welcome,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/26/22 4:49 PM, Claudio Fontana wrote:
> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
>> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>>
>>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>>
>>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>>
>>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>>
>>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>>
>>>>>>>
>>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>>> through user application allocating and touching all memory with
>>>>>>>> pseudorandom data.
>>>>>>>>
>>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>>> 128K:    5800 Mbps
>>>>>>>> 256K:   20900 Mbps
>>>>>>>> 512K:   21600 Mbps
>>>>>>>> 1M:     22800 Mbps
>>>>>>>> 2M:     22800 Mbps
>>>>>>>> 4M:     22400 Mbps
>>>>>>>> 8M:     22500 Mbps
>>>>>>>> 16M:    22800 Mbps
>>>>>>>> 32M:    22900 Mbps
>>>>>>>> 64M:    22900 Mbps
>>>>>>>> 128M:   22800 Mbps
>>>>>>>>
>>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>>
>>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>>> not try to go higher.
>>>>>>>
>>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>>
>>>>>>>> QMP:    37000 Mbps
>>>>>>>
>>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>>
>>>>>
>>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>>
>>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>>> ~35500 mbps virsh save to /dev/null
>>>>>
>>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>>
>>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>>
>>> I was thinking about sendfile(2) in iohelper, but that probably
>>> can't work as the input fd is a socket, I am getting EINVAL.
>>
>> Yep, sendfile() requires the input to be a mmapable FD,
>> and the output to be a socket.
>>
>> Try splice() instead  which merely requires 1 end to be a
>> pipe, and the other end can be any FD afaik.
>>
>> With regards,
>> Daniel
>>
> 
> I did try splice(), but performance is worse by around 500%.
> 
> It also fails with EINVAL when trying to use it in combination with O_DIRECT.
> 
> Tried larger and smaller buffers, flags like SPLICE_F_MORE an SPLICE_F_MOVE in any combination; no change, just awful performance.


OK, I found a case where splice actually helps: in the read case, without O_DIRECT, splice seems to outperform read/write
by _a lot_.

I will code up the patch and start running more experiments with larger VM sizes etc.

Thanks!

Claudio


> 
> Here is the code:
> 
> #ifdef __linux__
> +static ssize_t safesplice(int fdin, int fdout, size_t todo)
> +{
> +    unsigned int flags = SPLICE_F_MOVE | SPLICE_F_MORE;
> +    ssize_t ncopied = 0;
> +
> +    while (todo > 0) {
> +        ssize_t r = splice(fdin, NULL, fdout, NULL, todo, flags);
> +        if (r < 0 && errno == EINTR)
> +            continue;
> +        if (r < 0)
> +            return r;
> +        if (r == 0)
> +            return ncopied;
> +        todo -= r;
> +        ncopied += r;
> +    }
> +    return ncopied;
> +}
> +
> +static ssize_t runIOCopy(const struct runIOParams p)
> +{
> +    size_t len = 1024 * 1024;
> +    ssize_t total = 0;
> +
> +    while (1) {
> +        ssize_t got = safesplice(p.fdin, p.fdout, len);
> +        if (got < 0)
> +            return -1;
> +        if (got == 0)
> +            break;
> +
> +        total += got;
> +
> +        /* handle last write truncate in direct case */
> +        if (got < len && p.isDirect && p.isWrite && !p.isBlockDev) {
> +            if (ftruncate(p.fdout, total) < 0) {
> +                return -4;
> +            }
> +            break;
> +        }
> +    }
> +    return total;
> +}
> +
> +#endif
> 
> 
> Any ideas welcome,
> 
> Claudio
> 
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/28/22 12:47 PM, Claudio Fontana wrote:
> On 3/26/22 4:49 PM, Claudio Fontana wrote:
>> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
>>> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>>>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>>>
>>>>>>>>
>>>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>>>> through user application allocating and touching all memory with
>>>>>>>>> pseudorandom data.
>>>>>>>>>
>>>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>>>> 128K:    5800 Mbps
>>>>>>>>> 256K:   20900 Mbps
>>>>>>>>> 512K:   21600 Mbps
>>>>>>>>> 1M:     22800 Mbps
>>>>>>>>> 2M:     22800 Mbps
>>>>>>>>> 4M:     22400 Mbps
>>>>>>>>> 8M:     22500 Mbps
>>>>>>>>> 16M:    22800 Mbps
>>>>>>>>> 32M:    22900 Mbps
>>>>>>>>> 64M:    22900 Mbps
>>>>>>>>> 128M:   22800 Mbps
>>>>>>>>>
>>>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>>>
>>>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>>>> not try to go higher.
>>>>>>>>
>>>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>>>
>>>>>>>>> QMP:    37000 Mbps
>>>>>>>>
>>>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>>>
>>>>>>
>>>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>>>
>>>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>>>> ~35500 mbps virsh save to /dev/null
>>>>>>
>>>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>>>
>>>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>>>
>>>> I was thinking about sendfile(2) in iohelper, but that probably
>>>> can't work as the input fd is a socket, I am getting EINVAL.
>>>
>>> Yep, sendfile() requires the input to be a mmapable FD,
>>> and the output to be a socket.
>>>
>>> Try splice() instead  which merely requires 1 end to be a
>>> pipe, and the other end can be any FD afaik.
>>>
>>> With regards,
>>> Daniel
>>>
>>
>> I did try splice(), but performance is worse by around 500%.
>>
>> It also fails with EINVAL when trying to use it in combination with O_DIRECT.
>>
>> Tried larger and smaller buffers, flags like SPLICE_F_MORE an SPLICE_F_MOVE in any combination; no change, just awful performance.
> 
> 
> Ok I found a case where splice actually helps: in the read case, without O_DIRECT, splice seems to actually outperform read/write
> by _a lot_.


I was just hit by a cache effect: there is no real improvement I could measure.

> 
> I will code up the patch and start making more experiments with larger VM sizes etc.
> 
> Thanks!
> 
> Claudio
> 
> 
>>
>> Here is the code:
>>
>> #ifdef __linux__
>> +static ssize_t safesplice(int fdin, int fdout, size_t todo)
>> +{
>> +    unsigned int flags = SPLICE_F_MOVE | SPLICE_F_MORE;
>> +    ssize_t ncopied = 0;
>> +
>> +    while (todo > 0) {
>> +        ssize_t r = splice(fdin, NULL, fdout, NULL, todo, flags);
>> +        if (r < 0 && errno == EINTR)
>> +            continue;
>> +        if (r < 0)
>> +            return r;
>> +        if (r == 0)
>> +            return ncopied;
>> +        todo -= r;
>> +        ncopied += r;
>> +    }
>> +    return ncopied;
>> +}
>> +
>> +static ssize_t runIOCopy(const struct runIOParams p)
>> +{
>> +    size_t len = 1024 * 1024;
>> +    ssize_t total = 0;
>> +
>> +    while (1) {
>> +        ssize_t got = safesplice(p.fdin, p.fdout, len);
>> +        if (got < 0)
>> +            return -1;
>> +        if (got == 0)
>> +            break;
>> +
>> +        total += got;
>> +
>> +        /* handle last write truncate in direct case */
>> +        if (got < len && p.isDirect && p.isWrite && !p.isBlockDev) {
>> +            if (ftruncate(p.fdout, total) < 0) {
>> +                return -4;
>> +            }
>> +            break;
>> +        }
>> +    }
>> +    return total;
>> +}
>> +
>> +#endif
>>
>>
>> Any ideas welcome,
>>
>> Claudio
>>
> 
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Daniel P. Berrangé 2 years, 1 month ago
On Sat, Mar 26, 2022 at 04:49:46PM +0100, Claudio Fontana wrote:
> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
> > On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
> >> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
> >>> * Claudio Fontana (cfontana@suse.de) wrote:
> >>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
> >>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
> >>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
> >>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
> >>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
> >>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
> >>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
> >>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
> >>>>>>>>>>>> the first user is the qemu driver,
> >>>>>>>>>>>>
> >>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
> >>>>>>>>>>>>
> >>>>>>>>>>>> This improves the situation by 400%.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
> >>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
> >>>>>>>>>>>> ---
> >>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
> >>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
> >>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
> >>>>>>>>>>>>  src/util/virfile.h        |  1 +
> >>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
> >>>>>>>>>>>>
> >>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
> >>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
> >>>>>>>>>>>>
> >>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
> >>>>>>>>>>>>
> >>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
> >>>>>>
> >>>>>>
> >>>>>>> Current results show these experimental averages maximum throughput
> >>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
> >>>>>>> "query-migrate", tests repeated 5 times for each).
> >>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
> >>>>>>> through user application allocating and touching all memory with
> >>>>>>> pseudorandom data.
> >>>>>>>
> >>>>>>> 64K:     5200 Mbps (current situation)
> >>>>>>> 128K:    5800 Mbps
> >>>>>>> 256K:   20900 Mbps
> >>>>>>> 512K:   21600 Mbps
> >>>>>>> 1M:     22800 Mbps
> >>>>>>> 2M:     22800 Mbps
> >>>>>>> 4M:     22400 Mbps
> >>>>>>> 8M:     22500 Mbps
> >>>>>>> 16M:    22800 Mbps
> >>>>>>> 32M:    22900 Mbps
> >>>>>>> 64M:    22900 Mbps
> >>>>>>> 128M:   22800 Mbps
> >>>>>>>
> >>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
> >>>>>>
> >>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
> >>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
> >>>>>> not try to go higher.
> >>>>>>
> >>>>>>> As for the theoretical limit for the libvirt architecture,
> >>>>>>> I ran a qemu migration directly issuing the appropriate QMP
> >>>>>>> commands, setting the same migration parameters as per libvirt,
> >>>>>>> and then migrating to a socket netcatted to /dev/null via
> >>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
> >>>>>>>
> >>>>>>> QMP:    37000 Mbps
> >>>>>>
> >>>>>>> So although the Pipe size improves things (in particular the
> >>>>>>> large jump is for the 256K size, although 1M seems a very good value),
> >>>>>>> there is still a second bottleneck in there somewhere that
> >>>>>>> accounts for a loss of ~14200 Mbps in throughput.
> >>>>
> >>>>
> >>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
> >>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
> >>>>
> >>>> ~50000 mbps qemu to netcat socket to /dev/null
> >>>> ~35500 mbps virsh save to /dev/null
> >>>>
> >>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
> >>>
> >>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
> >>
> >> I was thinking about sendfile(2) in iohelper, but that probably
> >> can't work as the input fd is a socket, I am getting EINVAL.
> > 
> > Yep, sendfile() requires the input to be a mmapable FD,
> > and the output to be a socket.
> > 
> > Try splice() instead  which merely requires 1 end to be a
> > pipe, and the other end can be any FD afaik.
> > 
> 
> I did try splice(), but performance is worse by around 500%.

Hmm, that's certainly unexpected!

> Any ideas welcome,

I learnt there is also a newer copy_file_range() call; not sure if that's
any better.

You passed len as 1 MB; I wonder if passing MAXINT is viable? We just
want to copy everything IIRC.
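
Roughly the kind of loop I have in mind, as an untested sketch (assumes
_GNU_SOURCE and a glibc new enough to provide the copy_file_range()
wrapper; the helper name is made up):

#define _GNU_SOURCE
#include <unistd.h>
#include <errno.h>
#include <stdint.h>

/* Untested sketch: copy until EOF with copy_file_range(), letting the
 * kernel pick and advance the file offsets (both passed as NULL).
 * A huge 'len' is fine, the kernel copies what it can per call. */
static ssize_t copyAllWithCFR(int fdin, int fdout)
{
    ssize_t total = 0;

    while (1) {
        ssize_t r = copy_file_range(fdin, NULL, fdout, NULL, SIZE_MAX / 2, 0);
        if (r < 0 && errno == EINTR)
            continue;
        if (r < 0)
            return -1;   /* e.g. EINVAL if one FD is not a regular file */
        if (r == 0)
            break;       /* EOF on the input */
        total += r;
    }
    return total;
}

Whether the kernel accepts that for the socket -> pipe -> file chain we
have here is another matter, of course.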

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/28/22 10:31 AM, Daniel P. Berrangé wrote:
> On Sat, Mar 26, 2022 at 04:49:46PM +0100, Claudio Fontana wrote:
>> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
>>> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>>>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>>>
>>>>>>>>
>>>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>>>> through user application allocating and touching all memory with
>>>>>>>>> pseudorandom data.
>>>>>>>>>
>>>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>>>> 128K:    5800 Mbps
>>>>>>>>> 256K:   20900 Mbps
>>>>>>>>> 512K:   21600 Mbps
>>>>>>>>> 1M:     22800 Mbps
>>>>>>>>> 2M:     22800 Mbps
>>>>>>>>> 4M:     22400 Mbps
>>>>>>>>> 8M:     22500 Mbps
>>>>>>>>> 16M:    22800 Mbps
>>>>>>>>> 32M:    22900 Mbps
>>>>>>>>> 64M:    22900 Mbps
>>>>>>>>> 128M:   22800 Mbps
>>>>>>>>>
>>>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>>>
>>>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>>>> not try to go higher.
>>>>>>>>
>>>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>>>
>>>>>>>>> QMP:    37000 Mbps
>>>>>>>>
>>>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>>>
>>>>>>
>>>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>>>
>>>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>>>> ~35500 mbps virsh save to /dev/null
>>>>>>
>>>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>>>
>>>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>>>
>>>> I was thinking about sendfile(2) in iohelper, but that probably
>>>> can't work as the input fd is a socket, I am getting EINVAL.
>>>
>>> Yep, sendfile() requires the input to be a mmapable FD,
>>> and the output to be a socket.
>>>
>>> Try splice() instead  which merely requires 1 end to be a
>>> pipe, and the other end can be any FD afaik.
>>>
>>
>> I did try splice(), but performance is worse by around 500%.
> 
> Hmm, that's certainly unexpected !
> 
>> Any ideas welcome,
> 
> I learnt there is also a newer  copy_file_range call, not sure if that's
> any better.
> 
> You passed len as 1 MB, I wonder if passing MAXINT is viable ? We just
> want to copy everything IIRC.
> 
> With regards,
> Daniel
> 

Crazy idea: would trying to use the parallel migration concept for migrating to/from a file make any sense?

Not sure whether the qemu multifd implementation would apply directly; maybe the concept could be given another "toFile" implementation, trying to use more than one cpu to do the transfer?
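
Just to illustrate the kind of thing I mean (a purely hypothetical
sketch, not based on the existing qemu multifd code or any libvirt API;
all names are made up): each channel could get its own worker landing
its data at a fixed offset in the save file via pwrite(), so the copies
can run on more than one cpu:

#include <unistd.h>
#include <errno.h>
#include <sys/types.h>

/* Hypothetical per-channel worker: drain one input fd into its own
 * region of the shared output file, so N such workers can run in
 * parallel threads without coordinating on a single file offset. */
struct chanWorker {
    int fdin;       /* one parallel channel (socket/pipe from qemu) */
    int fdout;      /* shared fd to the save file, opened for writing */
    off_t offset;   /* start of this worker's region in the file */
};

static void *chanWorkerRun(void *opaque)
{
    struct chanWorker *w = opaque;
    char buf[1024 * 1024];
    off_t pos = w->offset;

    while (1) {
        ssize_t got = read(w->fdin, buf, sizeof(buf));
        if (got < 0 && errno == EINTR)
            continue;
        if (got <= 0)
            break;                        /* EOF or read error */

        for (ssize_t done = 0; done < got; ) {
            ssize_t put = pwrite(w->fdout, buf + done, got - done, pos + done);
            if (put < 0 && errno == EINTR)
                continue;
            if (put < 0)
                return (void *) -1;       /* report failure to the joiner */
            done += put;
        }
        pos += got;
    }
    return NULL;
}

How the per-channel regions would be sized and recorded in the save
image format is exactly the part I have no good answer for yet.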

Thanks,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/28/22 10:31 AM, Daniel P. Berrangé wrote:
> On Sat, Mar 26, 2022 at 04:49:46PM +0100, Claudio Fontana wrote:
>> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
>>> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>>>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>>>
>>>>>>>>
>>>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>>>> through user application allocating and touching all memory with
>>>>>>>>> pseudorandom data.
>>>>>>>>>
>>>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>>>> 128K:    5800 Mbps
>>>>>>>>> 256K:   20900 Mbps
>>>>>>>>> 512K:   21600 Mbps
>>>>>>>>> 1M:     22800 Mbps
>>>>>>>>> 2M:     22800 Mbps
>>>>>>>>> 4M:     22400 Mbps
>>>>>>>>> 8M:     22500 Mbps
>>>>>>>>> 16M:    22800 Mbps
>>>>>>>>> 32M:    22900 Mbps
>>>>>>>>> 64M:    22900 Mbps
>>>>>>>>> 128M:   22800 Mbps
>>>>>>>>>
>>>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>>>
>>>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>>>> not try to go higher.
>>>>>>>>
>>>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>>>
>>>>>>>>> QMP:    37000 Mbps
>>>>>>>>
>>>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>>>
>>>>>>
>>>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>>>
>>>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>>>> ~35500 mbps virsh save to /dev/null
>>>>>>
>>>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>>>
>>>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>>>
>>>> I was thinking about sendfile(2) in iohelper, but that probably
>>>> can't work as the input fd is a socket, I am getting EINVAL.
>>>
>>> Yep, sendfile() requires the input to be a mmapable FD,
>>> and the output to be a socket.
>>>
>>> Try splice() instead  which merely requires 1 end to be a
>>> pipe, and the other end can be any FD afaik.
>>>
>>
>> I did try splice(), but performance is worse by around 500%.
> 
> Hmm, that's certainly unexpected !
> 
>> Any ideas welcome,
> 
> I learnt there is also a newer  copy_file_range call, not sure if that's
> any better.
> 
> You passed len as 1 MB, I wonder if passing MAXINT is viable ? We just
> want to copy everything IIRC.
> 
> With regards,
> Daniel
> 

Hi Daniel, I also tried lengths up to 64MB; no improvement with splice.

I'll take a look at copy_file_range,

Thanks!

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/28/22 11:19 AM, Claudio Fontana wrote:
> On 3/28/22 10:31 AM, Daniel P. Berrangé wrote:
>> On Sat, Mar 26, 2022 at 04:49:46PM +0100, Claudio Fontana wrote:
>>> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
>>>> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>>>>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>>>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>>>>> through user application allocating and touching all memory with
>>>>>>>>>> pseudorandom data.
>>>>>>>>>>
>>>>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>>>>> 128K:    5800 Mbps
>>>>>>>>>> 256K:   20900 Mbps
>>>>>>>>>> 512K:   21600 Mbps
>>>>>>>>>> 1M:     22800 Mbps
>>>>>>>>>> 2M:     22800 Mbps
>>>>>>>>>> 4M:     22400 Mbps
>>>>>>>>>> 8M:     22500 Mbps
>>>>>>>>>> 16M:    22800 Mbps
>>>>>>>>>> 32M:    22900 Mbps
>>>>>>>>>> 64M:    22900 Mbps
>>>>>>>>>> 128M:   22800 Mbps
>>>>>>>>>>
>>>>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>>>>
>>>>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>>>>> not try to go higher.
>>>>>>>>>
>>>>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>>>>
>>>>>>>>>> QMP:    37000 Mbps
>>>>>>>>>
>>>>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>>>>
>>>>>>>
>>>>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>>>>
>>>>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>>>>> ~35500 mbps virsh save to /dev/null
>>>>>>>
>>>>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>>>>
>>>>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>>>>
>>>>> I was thinking about sendfile(2) in iohelper, but that probably
>>>>> can't work as the input fd is a socket, I am getting EINVAL.
>>>>
>>>> Yep, sendfile() requires the input to be a mmapable FD,
>>>> and the output to be a socket.
>>>>
>>>> Try splice() instead  which merely requires 1 end to be a
>>>> pipe, and the other end can be any FD afaik.
>>>>
>>>
>>> I did try splice(), but performance is worse by around 500%.
>>
>> Hmm, that's certainly unexpected !
>>
>>> Any ideas welcome,
>>
>> I learnt there is also a newer  copy_file_range call, not sure if that's
>> any better.
>>
>> You passed len as 1 MB, I wonder if passing MAXINT is viable ? We just
>> want to copy everything IIRC.
>>
>> With regards,
>> Daniel
>>
> 
> Hi Daniel, tried also up to 64MB, no improvement with splice.
> 
> I'll take a look at copy_file_range,

It fails with EINVAL; according to the man page it needs both fds to refer to regular files.

All these alternatives to the plain read/write API seem very situational...

It would be cool if there were an API that does the best thing to minimize copies for whatever FDs it is passed, avoiding the need for a userspace buffer regardless of what the FDs refer to, but it seems there isn't one?
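
The closest I can picture is a helper that just probes the primitives in
order and falls back to plain read/write, something like this rough
sketch (names are made up, short-write handling omitted for brevity):

#define _GNU_SOURCE
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>

/* Rough sketch: try the copy-avoiding primitives first, then fall back
 * to a userspace buffer. It re-probes on every call, which a real
 * helper would cache, and ignores short writes in the fallback path. */
static ssize_t copySomeBytes(int fdin, int fdout, char *buf, size_t buflen)
{
    ssize_t r;

    /* only works when both FDs are regular files */
    r = copy_file_range(fdin, NULL, fdout, NULL, buflen, 0);
    if (r >= 0 || (errno != EINVAL && errno != EXDEV && errno != ENOSYS))
        return r;

    /* only works when at least one FD is a pipe */
    r = splice(fdin, NULL, fdout, NULL, buflen, SPLICE_F_MOVE);
    if (r >= 0 || errno != EINVAL)
        return r;

    /* last resort: bounce through the caller-supplied buffer */
    r = read(fdin, buf, buflen);
    if (r <= 0)
        return r;
    return write(fdout, buf, r);
}

Even then it would only ever drop the extra copy for some FD
combinations, which is probably why no such API exists.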

Ciao,

Claudio
Re: [libvirt RFC] virFile: new VIR_FILE_WRAPPER_BIG_PIPE to improve performance
Posted by Claudio Fontana 2 years, 1 month ago
On 3/26/22 4:49 PM, Claudio Fontana wrote:
> On 3/25/22 12:29 PM, Daniel P. Berrangé wrote:
>> On Fri, Mar 18, 2022 at 02:34:29PM +0100, Claudio Fontana wrote:
>>> On 3/17/22 4:03 PM, Dr. David Alan Gilbert wrote:
>>>> * Claudio Fontana (cfontana@suse.de) wrote:
>>>>> On 3/17/22 2:41 PM, Claudio Fontana wrote:
>>>>>> On 3/17/22 11:25 AM, Daniel P. Berrangé wrote:
>>>>>>> On Thu, Mar 17, 2022 at 11:12:11AM +0100, Claudio Fontana wrote:
>>>>>>>> On 3/16/22 1:17 PM, Claudio Fontana wrote:
>>>>>>>>> On 3/14/22 6:48 PM, Daniel P. Berrangé wrote:
>>>>>>>>>> On Mon, Mar 14, 2022 at 06:38:31PM +0100, Claudio Fontana wrote:
>>>>>>>>>>> On 3/14/22 6:17 PM, Daniel P. Berrangé wrote:
>>>>>>>>>>>> On Sat, Mar 12, 2022 at 05:30:01PM +0100, Claudio Fontana wrote:
>>>>>>>>>>>>> the first user is the qemu driver,
>>>>>>>>>>>>>
>>>>>>>>>>>>> virsh save/resume would slow to a crawl with a default pipe size (64k).
>>>>>>>>>>>>>
>>>>>>>>>>>>> This improves the situation by 400%.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Going through io_helper still seems to incur in some penalty (~15%-ish)
>>>>>>>>>>>>> compared with direct qemu migration to a nc socket to a file.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Claudio Fontana <cfontana@suse.de>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>  src/qemu/qemu_driver.c    |  6 +++---
>>>>>>>>>>>>>  src/qemu/qemu_saveimage.c | 11 ++++++-----
>>>>>>>>>>>>>  src/util/virfile.c        | 12 ++++++++++++
>>>>>>>>>>>>>  src/util/virfile.h        |  1 +
>>>>>>>>>>>>>  4 files changed, 22 insertions(+), 8 deletions(-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> Hello, I initially thought this to be a qemu performance issue,
>>>>>>>>>>>>> so you can find the discussion about this in qemu-devel:
>>>>>>>>>>>>>
>>>>>>>>>>>>> "Re: bad virsh save /dev/null performance (600 MiB/s max)"
>>>>>>>>>>>>>
>>>>>>>>>>>>> https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg03142.html
>>>>>>>
>>>>>>>
>>>>>>>> Current results show these experimental averages maximum throughput
>>>>>>>> migrating to /dev/null per each FdWrapper Pipe Size (as per QEMU QMP
>>>>>>>> "query-migrate", tests repeated 5 times for each).
>>>>>>>> VM Size is 60G, most of the memory effectively touched before migration,
>>>>>>>> through user application allocating and touching all memory with
>>>>>>>> pseudorandom data.
>>>>>>>>
>>>>>>>> 64K:     5200 Mbps (current situation)
>>>>>>>> 128K:    5800 Mbps
>>>>>>>> 256K:   20900 Mbps
>>>>>>>> 512K:   21600 Mbps
>>>>>>>> 1M:     22800 Mbps
>>>>>>>> 2M:     22800 Mbps
>>>>>>>> 4M:     22400 Mbps
>>>>>>>> 8M:     22500 Mbps
>>>>>>>> 16M:    22800 Mbps
>>>>>>>> 32M:    22900 Mbps
>>>>>>>> 64M:    22900 Mbps
>>>>>>>> 128M:   22800 Mbps
>>>>>>>>
>>>>>>>> This above is the throughput out of patched libvirt with multiple Pipe Sizes for the FDWrapper.
>>>>>>>
>>>>>>> Ok, its bouncing around with noise after 1 MB. So I'd suggest that
>>>>>>> libvirt attempt to raise the pipe limit to 1 MB by default, but
>>>>>>> not try to go higher.
>>>>>>>
>>>>>>>> As for the theoretical limit for the libvirt architecture,
>>>>>>>> I ran a qemu migration directly issuing the appropriate QMP
>>>>>>>> commands, setting the same migration parameters as per libvirt,
>>>>>>>> and then migrating to a socket netcatted to /dev/null via
>>>>>>>> {"execute": "migrate", "arguments": { "uri", "unix:///tmp/netcat.sock" } } :
>>>>>>>>
>>>>>>>> QMP:    37000 Mbps
>>>>>>>
>>>>>>>> So although the Pipe size improves things (in particular the
>>>>>>>> large jump is for the 256K size, although 1M seems a very good value),
>>>>>>>> there is still a second bottleneck in there somewhere that
>>>>>>>> accounts for a loss of ~14200 Mbps in throughput.
>>>>>
>>>>>
>>>>> Interesting addition: I tested quickly on a system with faster cpus and larger VM sizes, up to 200GB,
>>>>> and the difference in throughput libvirt vs qemu is basically the same ~14500 Mbps.
>>>>>
>>>>> ~50000 mbps qemu to netcat socket to /dev/null
>>>>> ~35500 mbps virsh save to /dev/null
>>>>>
>>>>> Seems it is not proportional to cpu speed by the looks of it (not a totally fair comparison because the VM sizes are different).
>>>>
>>>> It might be closer to RAM or cache bandwidth limited though; for an extra copy.
>>>
>>> I was thinking about sendfile(2) in iohelper, but that probably
>>> can't work as the input fd is a socket, I am getting EINVAL.
>>
>> Yep, sendfile() requires the input to be a mmapable FD,
>> and the output to be a socket.
>>
>> Try splice() instead  which merely requires 1 end to be a
>> pipe, and the other end can be any FD afaik.
>>
>> With regards,
>> Daniel
>>
> 
> I did try splice(), but performance is worse by around 500%.
> 
> It also fails with EINVAL when trying to use it in combination with O_DIRECT.
> 
> Tried larger and smaller buffers, flags like SPLICE_F_MORE and SPLICE_F_MOVE in any combination; no change, just awful performance.


When reading from the save file (i.e. when doing virsh restore), splice performance is actually OK-ish, though still slightly worse than plain read/write.

Claudio

> 
> Here is the code:
> 
> #ifdef __linux__
> +static ssize_t safesplice(int fdin, int fdout, size_t todo)
> +{
> +    unsigned int flags = SPLICE_F_MOVE | SPLICE_F_MORE;
> +    ssize_t ncopied = 0;
> +
> +    while (todo > 0) {
> +        ssize_t r = splice(fdin, NULL, fdout, NULL, todo, flags);
> +        if (r < 0 && errno == EINTR)
> +            continue;
> +        if (r < 0)
> +            return r;
> +        if (r == 0)
> +            return ncopied;
> +        todo -= r;
> +        ncopied += r;
> +    }
> +    return ncopied;
> +}
> +
> +static ssize_t runIOCopy(const struct runIOParams p)
> +{
> +    size_t len = 1024 * 1024;
> +    ssize_t total = 0;
> +
> +    while (1) {
> +        ssize_t got = safesplice(p.fdin, p.fdout, len);
> +        if (got < 0)
> +            return -1;
> +        if (got == 0)
> +            break;
> +
> +        total += got;
> +
> +        /* handle last write truncate in direct case */
> +        if (got < len && p.isDirect && p.isWrite && !p.isBlockDev) {
> +            if (ftruncate(p.fdout, total) < 0) {
> +                return -4;
> +            }
> +            break;
> +        }
> +    }
> +    return total;
> +}
> +
> +#endif
> 
> 
> Any ideas welcome,
> 
> Claudio
>