[RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure

Stefan Hajnoczi posted 11 patches 5 months, 2 weeks ago
Maintainers: Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>, Aarushi Mehta <mehta.aaru20@gmail.com>, Julia Suvorova <jusual@redhat.com>, Stefan Hajnoczi <stefanha@redhat.com>, Stefano Garzarella <sgarzare@redhat.com>, Fam Zheng <fam@euphon.net>, Paolo Bonzini <pbonzini@redhat.com>, "Marc-André Lureau" <marcandre.lureau@redhat.com>, "Daniel P. Berrangé" <berrange@redhat.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>, Stefan Weil <sw@weilnetz.de>
There is a newer version of this series
[RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Stefan Hajnoczi 5 months, 2 weeks ago
io_uring may not be available at runtime due to system policies (e.g.
the io_uring_disabled sysctl) or creation could fail due to file
descriptor resource limits.

Handle failure scenarios as follows:

If another AioContext already has io_uring, then fail AioContext
creation so that the aio_add_sqe() API is available uniformly from all
QEMU threads. Otherwise fall back to epoll(7) if io_uring is
unavailable.

Notes:
- Update the comment about selecting the fastest fdmon implementation.
  At this point it's not about speed anymore, it's about aio_add_sqe()
  API availability.
- Uppercase the error message when converting from error_report() to
  error_setg_errno() for consistency (but there are instances of
  lowercase in the codebase).
- It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/aio-posix.h      | 12 ++----------
 util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
 util/fdmon-io_uring.c |  8 ++++----
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/util/aio-posix.h b/util/aio-posix.h
index f9994ed79e..6f9d97d866 100644
--- a/util/aio-posix.h
+++ b/util/aio-posix.h
@@ -18,6 +18,7 @@
 #define AIO_POSIX_H
 
 #include "block/aio.h"
+#include "qapi/error.h"
 
 struct AioHandler {
     GPollFD pfd;
@@ -72,17 +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
 #endif /* !CONFIG_EPOLL_CREATE1 */
 
 #ifdef CONFIG_LINUX_IO_URING
-bool fdmon_io_uring_setup(AioContext *ctx);
+void fdmon_io_uring_setup(AioContext *ctx, Error **errp);
 void fdmon_io_uring_destroy(AioContext *ctx);
-#else
-static inline bool fdmon_io_uring_setup(AioContext *ctx)
-{
-    return false;
-}
-
-static inline void fdmon_io_uring_destroy(AioContext *ctx)
-{
-}
 #endif /* !CONFIG_LINUX_IO_URING */
 
 #endif /* AIO_POSIX_H */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index fa047fc7ad..44b3df61f9 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -16,6 +16,7 @@
 #include "qemu/osdep.h"
 #include "block/block.h"
 #include "block/thread-pool.h"
+#include "qapi/error.h"
 #include "qemu/main-loop.h"
 #include "qemu/lockcnt.h"
 #include "qemu/rcu.h"
@@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
     ctx->epollfd = -1;
     ctx->epollfd_tag = NULL;
 
-    /* Use the fastest fd monitoring implementation if available */
-    if (fdmon_io_uring_setup(ctx)) {
-        return;
+#ifdef CONFIG_LINUX_IO_URING
+    {
+        static bool need_io_uring;
+        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
+
+        /* io_uring takes precedence because it provides aio_add_sqe() support */
+        fdmon_io_uring_setup(ctx, &local_err);
+        if (!local_err) {
+            /*
+             * If one AioContext gets io_uring, then all AioContexts need io_uring
+             * so that aio_add_sqe() support is available across all threads.
+             */
+            need_io_uring = true;
+            return;
+        }
+        if (need_io_uring) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
+        warn_report_err_once(local_err); /* frees local_err */
+        local_err = NULL;
     }
+#endif /* CONFIG_LINUX_IO_URING */
 
     fdmon_epoll_setup(ctx);
 }
 
 void aio_context_destroy(AioContext *ctx)
 {
+#ifdef CONFIG_LINUX_IO_URING
     fdmon_io_uring_destroy(ctx);
+#endif
 
     qemu_lockcnt_lock(&ctx->list_lock);
     fdmon_epoll_disable(ctx);
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index 2092d08d24..ef1a866a03 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -45,6 +45,7 @@
 
 #include "qemu/osdep.h"
 #include <poll.h>
+#include "qapi/error.h"
 #include "qemu/rcu_queue.h"
 #include "aio-posix.h"
 
@@ -361,7 +362,7 @@ static const FDMonOps fdmon_io_uring_ops = {
     .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
 };
 
-bool fdmon_io_uring_setup(AioContext *ctx)
+void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
 {
     int ret;
 
@@ -369,15 +370,14 @@ bool fdmon_io_uring_setup(AioContext *ctx)
 
     ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
     if (ret != 0) {
-        return false;
+        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
+        return;
     }
 
     QSLIST_INIT(&ctx->submit_list);
     ctx->fdmon_ops = &fdmon_io_uring_ops;
     ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
             ctx->fdmon_io_uring.ring_fd, G_IO_IN);
-
-    return true;
 }
 
 void fdmon_io_uring_destroy(AioContext *ctx)
-- 
2.49.0
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Brian 5 months, 2 weeks ago
On 5/28/25 3:09 PM, Stefan Hajnoczi wrote:
> io_uring may not be available at runtime due to system policies (e.g.
> the io_uring_disabled sysctl) or creation could fail due to file
> descriptor resource limits.
>
> Handle failure scenarios as follows:
>
> If another AioContext already has io_uring, then fail AioContext
> creation so that the aio_add_sqe() API is available uniformly from all
> QEMU threads. Otherwise fall back to epoll(7) if io_uring is
> unavailable.
>
> Notes:
> - Update the comment about selecting the fastest fdmon implementation.
>    At this point it's not about speed anymore, it's about aio_add_sqe()
>    API availability.
> - Uppercase the error message when converting from error_report() to
>    error_setg_errno() for consistency (but there are instances of
>    lowercase in the codebase).
> - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
>
> Signed-off-by: Stefan Hajnoczi<stefanha@redhat.com>
> ---
>   util/aio-posix.h      | 12 ++----------
>   util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
>   util/fdmon-io_uring.c |  8 ++++----
>   3 files changed, 32 insertions(+), 17 deletions(-)
>
> diff --git a/util/aio-posix.h b/util/aio-posix.h
> index f9994ed79e..6f9d97d866 100644
> --- a/util/aio-posix.h
> +++ b/util/aio-posix.h
> @@ -18,6 +18,7 @@
>   #define AIO_POSIX_H
>   
>   #include "block/aio.h"
> +#include "qapi/error.h" struct AioHandler { GPollFD pfd; @@ -72,17 +73,8 @@ static inline 
> void fdmon_epoll_disable(AioContext *ctx) #endif /* 
> !CONFIG_EPOLL_CREATE1 */ #ifdef CONFIG_LINUX_IO_URING -bool 
> fdmon_io_uring_setup(AioContext *ctx); +void 
> fdmon_io_uring_setup(AioContext *ctx, Error **errp); void 
> fdmon_io_uring_destroy(AioContext *ctx); -#else -static inline bool 
> fdmon_io_uring_setup(AioContext *ctx) -{ - return false; -} - -static 
> inline void fdmon_io_uring_destroy(AioContext *ctx) -{ -} #endif /* 
> !CONFIG_LINUX_IO_URING */ #endif /* AIO_POSIX_H */ diff --git 
> a/util/aio-posix.c b/util/aio-posix.c index fa047fc7ad..44b3df61f9 
> 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -16,6 +16,7 @@ 
> #include "qemu/osdep.h"
>   #include "block/block.h"
>   #include "block/thread-pool.h"
> +#include "qapi/error.h"
>   #include "qemu/main-loop.h"
>   #include "qemu/lockcnt.h"
>   #include "qemu/rcu.h"
> @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
>       ctx->epollfd = -1;
>       ctx->epollfd_tag = NULL;
>   
> -    /* Use the fastest fd monitoring implementation if available */
> -    if (fdmon_io_uring_setup(ctx)) {
> -        return;
> +#ifdef CONFIG_LINUX_IO_URING
> +    {
> +        static bool need_io_uring;
> +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
> +
> +        /* io_uring takes precedence because it provides aio_add_sqe() support */
> +        fdmon_io_uring_setup(ctx, &local_err);
> +        if (!local_err) {
> +            /*
> +             * If one AioContext gets io_uring, then all AioContexts need io_uring
> +             * so that aio_add_sqe() support is available across all threads.
> +             */
> +            need_io_uring = true;
> +            return;
> +        }
> +        if (need_io_uring) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +
> +        warn_report_err_once(local_err); /* frees local_err */
> +        local_err = NULL;
>       }
> +#endif /* CONFIG_LINUX_IO_URING */
>   
Is there a problem with the logic of this code snippet?

If we fail at fdmon_io_uring_setup, specifically at io_uring_queue_init,
local_err (or errp) will be set to a non-NULL error value. In that case,
need_io_uring will be set to true, but the function will return 
immediately.
As a result, the later if (need_io_uring) block will never be executed

>       fdmon_epoll_setup(ctx);
>   }
>   
>   void aio_context_destroy(AioContext *ctx)
>   {
> +#ifdef CONFIG_LINUX_IO_URING
>       fdmon_io_uring_destroy(ctx);
> +#endif
>   
>       qemu_lockcnt_lock(&ctx->list_lock);
>       fdmon_epoll_disable(ctx);
> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> index 2092d08d24..ef1a866a03 100644
> --- a/util/fdmon-io_uring.c
> +++ b/util/fdmon-io_uring.c
> @@ -45,6 +45,7 @@
>   
>   #include "qemu/osdep.h"
>   #include <poll.h>
> +#include "qapi/error.h"
>   #include "qemu/rcu_queue.h"
>   #include "aio-posix.h"
>   
> @@ -361,7 +362,7 @@ static const FDMonOps fdmon_io_uring_ops = {
>       .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
>   };
>   
> -bool fdmon_io_uring_setup(AioContext *ctx)
> +void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>   {
>       int ret;
>   
> @@ -369,15 +370,14 @@ bool fdmon_io_uring_setup(AioContext *ctx)
>   
>       ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
>       if (ret != 0) {
> -        return false;
> +        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
> +        return;
>       }
>   
>       QSLIST_INIT(&ctx->submit_list);
>       ctx->fdmon_ops = &fdmon_io_uring_ops;
>       ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
>               ctx->fdmon_io_uring.ring_fd, G_IO_IN);
> -
> -    return true;
>   }
>   
>   void fdmon_io_uring_destroy(AioContext *ctx)
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Stefan Hajnoczi 5 months, 2 weeks ago
On Mon, Jun 02, 2025 at 08:26:39AM -0400, Brian wrote:
> On 5/28/25 3:09 PM, Stefan Hajnoczi wrote:
> > io_uring may not be available at runtime due to system policies (e.g.
> > the io_uring_disabled sysctl) or creation could fail due to file
> > descriptor resource limits.
> > 
> > Handle failure scenarios as follows:
> > 
> > If another AioContext already has io_uring, then fail AioContext
> > creation so that the aio_add_sqe() API is available uniformly from all
> > QEMU threads. Otherwise fall back to epoll(7) if io_uring is
> > unavailable.
> > 
> > Notes:
> > - Update the comment about selecting the fastest fdmon implementation.
> >    At this point it's not about speed anymore, it's about aio_add_sqe()
> >    API availability.
> > - Uppercase the error message when converting from error_report() to
> >    error_setg_errno() for consistency (but there are instances of
> >    lowercase in the codebase).
> > - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
> > 
> > Signed-off-by: Stefan Hajnoczi<stefanha@redhat.com>
> > ---
> >   util/aio-posix.h      | 12 ++----------
> >   util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
> >   util/fdmon-io_uring.c |  8 ++++----
> >   3 files changed, 32 insertions(+), 17 deletions(-)
> > 
> > diff --git a/util/aio-posix.h b/util/aio-posix.h
> > index f9994ed79e..6f9d97d866 100644
> > --- a/util/aio-posix.h
> > +++ b/util/aio-posix.h
> > @@ -18,6 +18,7 @@
> >   #define AIO_POSIX_H
> >   #include "block/aio.h"
> > +#include "qapi/error.h" struct AioHandler { GPollFD pfd; @@ -72,17
> > +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx) #endif
> > /* !CONFIG_EPOLL_CREATE1 */ #ifdef CONFIG_LINUX_IO_URING -bool
> > fdmon_io_uring_setup(AioContext *ctx); +void
> > fdmon_io_uring_setup(AioContext *ctx, Error **errp); void
> > fdmon_io_uring_destroy(AioContext *ctx); -#else -static inline bool
> > fdmon_io_uring_setup(AioContext *ctx) -{ - return false; -} - -static
> > inline void fdmon_io_uring_destroy(AioContext *ctx) -{ -} #endif /*
> > !CONFIG_LINUX_IO_URING */ #endif /* AIO_POSIX_H */ diff --git
> > a/util/aio-posix.c b/util/aio-posix.c index fa047fc7ad..44b3df61f9
> > 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -16,6 +16,7 @@
> > #include "qemu/osdep.h"
> >   #include "block/block.h"
> >   #include "block/thread-pool.h"
> > +#include "qapi/error.h"
> >   #include "qemu/main-loop.h"
> >   #include "qemu/lockcnt.h"
> >   #include "qemu/rcu.h"
> > @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
> >       ctx->epollfd = -1;
> >       ctx->epollfd_tag = NULL;
> > -    /* Use the fastest fd monitoring implementation if available */
> > -    if (fdmon_io_uring_setup(ctx)) {
> > -        return;
> > +#ifdef CONFIG_LINUX_IO_URING
> > +    {
> > +        static bool need_io_uring;
> > +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
> > +
> > +        /* io_uring takes precedence because it provides aio_add_sqe() support */
> > +        fdmon_io_uring_setup(ctx, &local_err);
> > +        if (!local_err) {
> > +            /*
> > +             * If one AioContext gets io_uring, then all AioContexts need io_uring
> > +             * so that aio_add_sqe() support is available across all threads.
> > +             */
> > +            need_io_uring = true;
> > +            return;
> > +        }
> > +        if (need_io_uring) {
> > +            error_propagate(errp, local_err);
> > +            return;
> > +        }
> > +
> > +        warn_report_err_once(local_err); /* frees local_err */
> > +        local_err = NULL;
> >       }
> > +#endif /* CONFIG_LINUX_IO_URING */
> Is there a problem with the logic of this code snippet?
> 
> If we fail at fdmon_io_uring_setup, specifically at io_uring_queue_init,
> local_err (or errp) will be set to a non-NULL error value. In that case,
> need_io_uring will be set to true, but the function will return immediately.

If local_err is non-NULL then this conditional is not taken:

  if (!local_err) {
      /*
       * If one AioContext gets io_uring, then all AioContexts need io_uring
       * so that aio_add_sqe() support is available across all threads.
       */
      need_io_uring = true;
      return;
  }

If the logic you described is correct, please rephrase it. I don't see
how what you've written can happen.

> As a result, the later if (need_io_uring) block will never be executed
> 
> >       fdmon_epoll_setup(ctx);
> >   }
> >   void aio_context_destroy(AioContext *ctx)
> >   {
> > +#ifdef CONFIG_LINUX_IO_URING
> >       fdmon_io_uring_destroy(ctx);
> > +#endif
> >       qemu_lockcnt_lock(&ctx->list_lock);
> >       fdmon_epoll_disable(ctx);
> > diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> > index 2092d08d24..ef1a866a03 100644
> > --- a/util/fdmon-io_uring.c
> > +++ b/util/fdmon-io_uring.c
> > @@ -45,6 +45,7 @@
> >   #include "qemu/osdep.h"
> >   #include <poll.h>
> > +#include "qapi/error.h"
> >   #include "qemu/rcu_queue.h"
> >   #include "aio-posix.h"
> > @@ -361,7 +362,7 @@ static const FDMonOps fdmon_io_uring_ops = {
> >       .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
> >   };
> > -bool fdmon_io_uring_setup(AioContext *ctx)
> > +void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
> >   {
> >       int ret;
> > @@ -369,15 +370,14 @@ bool fdmon_io_uring_setup(AioContext *ctx)
> >       ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
> >       if (ret != 0) {
> > -        return false;
> > +        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
> > +        return;
> >       }
> >       QSLIST_INIT(&ctx->submit_list);
> >       ctx->fdmon_ops = &fdmon_io_uring_ops;
> >       ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
> >               ctx->fdmon_io_uring.ring_fd, G_IO_IN);
> > -
> > -    return true;
> >   }
> >   void fdmon_io_uring_destroy(AioContext *ctx)
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Brian 5 months, 2 weeks ago
On 6/2/25 4:20 PM, Stefan Hajnoczi wrote:
> On Mon, Jun 02, 2025 at 08:26:39AM -0400, Brian wrote:
>> On 5/28/25 3:09 PM, Stefan Hajnoczi wrote:
>>> io_uring may not be available at runtime due to system policies (e.g.
>>> the io_uring_disabled sysctl) or creation could fail due to file
>>> descriptor resource limits.
>>>
>>> Handle failure scenarios as follows:
>>>
>>> If another AioContext already has io_uring, then fail AioContext
>>> creation so that the aio_add_sqe() API is available uniformly from all
>>> QEMU threads. Otherwise fall back to epoll(7) if io_uring is
>>> unavailable.
>>>
>>> Notes:
>>> - Update the comment about selecting the fastest fdmon implementation.
>>>     At this point it's not about speed anymore, it's about aio_add_sqe()
>>>     API availability.
>>> - Uppercase the error message when converting from error_report() to
>>>     error_setg_errno() for consistency (but there are instances of
>>>     lowercase in the codebase).
>>> - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
>>>
>>> Signed-off-by: Stefan Hajnoczi<stefanha@redhat.com>
>>> ---
>>>    util/aio-posix.h      | 12 ++----------
>>>    util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
>>>    util/fdmon-io_uring.c |  8 ++++----
>>>    3 files changed, 32 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/util/aio-posix.h b/util/aio-posix.h
>>> index f9994ed79e..6f9d97d866 100644
>>> --- a/util/aio-posix.h
>>> +++ b/util/aio-posix.h
>>> @@ -18,6 +18,7 @@
>>>    #define AIO_POSIX_H
>>>    #include "block/aio.h"
>>> +#include "qapi/error.h" struct AioHandler { GPollFD pfd; @@ -72,17
>>> +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx) #endif
>>> /* !CONFIG_EPOLL_CREATE1 */ #ifdef CONFIG_LINUX_IO_URING -bool
>>> fdmon_io_uring_setup(AioContext *ctx); +void
>>> fdmon_io_uring_setup(AioContext *ctx, Error **errp); void
>>> fdmon_io_uring_destroy(AioContext *ctx); -#else -static inline bool
>>> fdmon_io_uring_setup(AioContext *ctx) -{ - return false; -} - -static
>>> inline void fdmon_io_uring_destroy(AioContext *ctx) -{ -} #endif /*
>>> !CONFIG_LINUX_IO_URING */ #endif /* AIO_POSIX_H */ diff --git
>>> a/util/aio-posix.c b/util/aio-posix.c index fa047fc7ad..44b3df61f9
>>> 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -16,6 +16,7 @@
>>> #include "qemu/osdep.h"
>>>    #include "block/block.h"
>>>    #include "block/thread-pool.h"
>>> +#include "qapi/error.h"
>>>    #include "qemu/main-loop.h"
>>>    #include "qemu/lockcnt.h"
>>>    #include "qemu/rcu.h"
>>> @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
>>>        ctx->epollfd = -1;
>>>        ctx->epollfd_tag = NULL;
>>> -    /* Use the fastest fd monitoring implementation if available */
>>> -    if (fdmon_io_uring_setup(ctx)) {
>>> -        return;
>>> +#ifdef CONFIG_LINUX_IO_URING
>>> +    {
>>> +        static bool need_io_uring;
>>> +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
>>> +
>>> +        /* io_uring takes precedence because it provides aio_add_sqe() support */
>>> +        fdmon_io_uring_setup(ctx, &local_err);
>>> +        if (!local_err) {
>>> +            /*
>>> +             * If one AioContext gets io_uring, then all AioContexts need io_uring
>>> +             * so that aio_add_sqe() support is available across all threads.
>>> +             */
>>> +            need_io_uring = true;
>>> +            return;
>>> +        }
>>> +        if (need_io_uring) {
>>> +            error_propagate(errp, local_err);
>>> +            return;
>>> +        }
>>> +
>>> +        warn_report_err_once(local_err); /* frees local_err */
>>> +        local_err = NULL;
>>>        }
>>> +#endif /* CONFIG_LINUX_IO_URING */
>> Is there a problem with the logic of this code snippet?
>>
>> If we fail at fdmon_io_uring_setup, specifically at io_uring_queue_init,
>> local_err (or errp) will be set to a non-NULL error value. In that case,
>> need_io_uring will be set to true, but the function will return immediately.
> If local_err is non-NULL then this conditional is not taken:
>
>    if (!local_err) {
>        /*
>         * If one AioContext gets io_uring, then all AioContexts need io_uring
>         * so that aio_add_sqe() support is available across all threads.
>         */
>        need_io_uring = true;
>        return;
>    }
>
> If the logic you described is correct, please rephrase it. I don't see
> how what you've written can happen.

Sorry, I didn’t notice that need_io_uring is a static var. Was confused 
about when if (need_io_block) gets executed

>> As a result, the later if (need_io_uring) block will never be executed
>>
>>>        fdmon_epoll_setup(ctx);
>>>    }
>>>    void aio_context_destroy(AioContext *ctx)
>>>    {
>>> +#ifdef CONFIG_LINUX_IO_URING
>>>        fdmon_io_uring_destroy(ctx);
>>> +#endif
>>>        qemu_lockcnt_lock(&ctx->list_lock);
>>>        fdmon_epoll_disable(ctx);
>>> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
>>> index 2092d08d24..ef1a866a03 100644
>>> --- a/util/fdmon-io_uring.c
>>> +++ b/util/fdmon-io_uring.c
>>> @@ -45,6 +45,7 @@
>>>    #include "qemu/osdep.h"
>>>    #include <poll.h>
>>> +#include "qapi/error.h"
>>>    #include "qemu/rcu_queue.h"
>>>    #include "aio-posix.h"
>>> @@ -361,7 +362,7 @@ static const FDMonOps fdmon_io_uring_ops = {
>>>        .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
>>>    };
>>> -bool fdmon_io_uring_setup(AioContext *ctx)
>>> +void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>>>    {
>>>        int ret;
>>> @@ -369,15 +370,14 @@ bool fdmon_io_uring_setup(AioContext *ctx)
>>>        ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
>>>        if (ret != 0) {
>>> -        return false;
>>> +        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
>>> +        return;
>>>        }
>>>        QSLIST_INIT(&ctx->submit_list);
>>>        ctx->fdmon_ops = &fdmon_io_uring_ops;
>>>        ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
>>>                ctx->fdmon_io_uring.ring_fd, G_IO_IN);
>>> -
>>> -    return true;
>>>    }
>>>    void fdmon_io_uring_destroy(AioContext *ctx)
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Eric Blake 5 months, 2 weeks ago
On Wed, May 28, 2025 at 03:09:13PM -0400, Stefan Hajnoczi wrote:
> io_uring may not be available at runtime due to system policies (e.g.
> the io_uring_disabled sysctl) or creation could fail due to file
> descriptor resource limits.
> 
> Handle failure scenarios as follows:
> 
> If another AioContext already has io_uring, then fail AioContext
> creation so that the aio_add_sqe() API is available uniformly from all
> QEMU threads. Otherwise fall back to epoll(7) if io_uring is
> unavailable.
> 
> Notes:
> - Update the comment about selecting the fastest fdmon implementation.
>   At this point it's not about speed anymore, it's about aio_add_sqe()
>   API availability.
> - Uppercase the error message when converting from error_report() to
>   error_setg_errno() for consistency (but there are instances of
>   lowercase in the codebase).
> - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  util/aio-posix.h      | 12 ++----------
>  util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
>  util/fdmon-io_uring.c |  8 ++++----
>  3 files changed, 32 insertions(+), 17 deletions(-)
> 
> diff --git a/util/aio-posix.h b/util/aio-posix.h
> index f9994ed79e..6f9d97d866 100644
> --- a/util/aio-posix.h
> +++ b/util/aio-posix.h
> @@ -18,6 +18,7 @@
>  #define AIO_POSIX_H
>  
>  #include "block/aio.h"
> +#include "qapi/error.h"
>  
>  struct AioHandler {
>      GPollFD pfd;
> @@ -72,17 +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
>  #endif /* !CONFIG_EPOLL_CREATE1 */
>  
>  #ifdef CONFIG_LINUX_IO_URING
> -bool fdmon_io_uring_setup(AioContext *ctx);
> +void fdmon_io_uring_setup(AioContext *ctx, Error **errp);

Why change the return type to void?  That forces you to have to check
errp.  If you still return bool (true for errp unchanged, false when
errp set), callers might have a simpler interface.

>  void fdmon_io_uring_destroy(AioContext *ctx);
> -#else
> -static inline bool fdmon_io_uring_setup(AioContext *ctx)
> -{
> -    return false;
> -}
> -
> -static inline void fdmon_io_uring_destroy(AioContext *ctx)
> -{
> -}
>  #endif /* !CONFIG_LINUX_IO_URING */
>  
>  #endif /* AIO_POSIX_H */
> diff --git a/util/aio-posix.c b/util/aio-posix.c
> index fa047fc7ad..44b3df61f9 100644
> --- a/util/aio-posix.c
> +++ b/util/aio-posix.c
> @@ -16,6 +16,7 @@
>  #include "qemu/osdep.h"
>  #include "block/block.h"
>  #include "block/thread-pool.h"
> +#include "qapi/error.h"
>  #include "qemu/main-loop.h"
>  #include "qemu/lockcnt.h"
>  #include "qemu/rcu.h"
> @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
>      ctx->epollfd = -1;
>      ctx->epollfd_tag = NULL;
>  
> -    /* Use the fastest fd monitoring implementation if available */
> -    if (fdmon_io_uring_setup(ctx)) {
> -        return;
> +#ifdef CONFIG_LINUX_IO_URING
> +    {
> +        static bool need_io_uring;
> +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */

Really? I thought that was part of why we added ERRP_GUARD, so that
error_abort is pinned closer to the spot where the error is triggered
rather than where it was chained.  But your use of errp is a bit
different than usual here, so you may be correct that it doesn't
handle error_abort in the way that you want (allowing a graceful
downgrade to epoll if it is the first failure, but aborting if it is
the second AioContext that fails).

> +
> +        /* io_uring takes precedence because it provides aio_add_sqe() support */
> +        fdmon_io_uring_setup(ctx, &local_err);
> +        if (!local_err) {
> +            /*
> +             * If one AioContext gets io_uring, then all AioContexts need io_uring
> +             * so that aio_add_sqe() support is available across all threads.
> +             */
> +            need_io_uring = true;
> +            return;
> +        }
> +        if (need_io_uring) {
> +            error_propagate(errp, local_err);
> +            return;
> +        }
> +
> +        warn_report_err_once(local_err); /* frees local_err */
> +        local_err = NULL;
>      }
> +#endif /* CONFIG_LINUX_IO_URING */
>  
>      fdmon_epoll_setup(ctx);
>  }
>  
>  void aio_context_destroy(AioContext *ctx)
>  {
> +#ifdef CONFIG_LINUX_IO_URING
>      fdmon_io_uring_destroy(ctx);
> +#endif
>  
>      qemu_lockcnt_lock(&ctx->list_lock);
>      fdmon_epoll_disable(ctx);
> diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> index 2092d08d24..ef1a866a03 100644
> --- a/util/fdmon-io_uring.c
> +++ b/util/fdmon-io_uring.c
> @@ -45,6 +45,7 @@
>  
>  #include "qemu/osdep.h"
>  #include <poll.h>
> +#include "qapi/error.h"
>  #include "qemu/rcu_queue.h"
>  #include "aio-posix.h"
>  
> @@ -361,7 +362,7 @@ static const FDMonOps fdmon_io_uring_ops = {
>      .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
>  };
>  
> -bool fdmon_io_uring_setup(AioContext *ctx)
> +void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
>  {
>      int ret;
>  
> @@ -369,15 +370,14 @@ bool fdmon_io_uring_setup(AioContext *ctx)
>  
>      ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
>      if (ret != 0) {
> -        return false;
> +        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
> +        return;

This is where I question whether you should still return bool.

>      }
>  
>      QSLIST_INIT(&ctx->submit_list);
>      ctx->fdmon_ops = &fdmon_io_uring_ops;
>      ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
>              ctx->fdmon_io_uring.ring_fd, G_IO_IN);
> -
> -    return true;
>  }
>  
>  void fdmon_io_uring_destroy(AioContext *ctx)
> -- 
> 2.49.0
> 
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Stefan Hajnoczi 5 months, 2 weeks ago
On Wed, May 28, 2025 at 05:12:14PM -0500, Eric Blake wrote:
> On Wed, May 28, 2025 at 03:09:13PM -0400, Stefan Hajnoczi wrote:
> > io_uring may not be available at runtime due to system policies (e.g.
> > the io_uring_disabled sysctl) or creation could fail due to file
> > descriptor resource limits.
> > 
> > Handle failure scenarios as follows:
> > 
> > If another AioContext already has io_uring, then fail AioContext
> > creation so that the aio_add_sqe() API is available uniformly from all
> > QEMU threads. Otherwise fall back to epoll(7) if io_uring is
> > unavailable.
> > 
> > Notes:
> > - Update the comment about selecting the fastest fdmon implementation.
> >   At this point it's not about speed anymore, it's about aio_add_sqe()
> >   API availability.
> > - Uppercase the error message when converting from error_report() to
> >   error_setg_errno() for consistency (but there are instances of
> >   lowercase in the codebase).
> > - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >  util/aio-posix.h      | 12 ++----------
> >  util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
> >  util/fdmon-io_uring.c |  8 ++++----
> >  3 files changed, 32 insertions(+), 17 deletions(-)
> > 
> > diff --git a/util/aio-posix.h b/util/aio-posix.h
> > index f9994ed79e..6f9d97d866 100644
> > --- a/util/aio-posix.h
> > +++ b/util/aio-posix.h
> > @@ -18,6 +18,7 @@
> >  #define AIO_POSIX_H
> >  
> >  #include "block/aio.h"
> > +#include "qapi/error.h"
> >  
> >  struct AioHandler {
> >      GPollFD pfd;
> > @@ -72,17 +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
> >  #endif /* !CONFIG_EPOLL_CREATE1 */
> >  
> >  #ifdef CONFIG_LINUX_IO_URING
> > -bool fdmon_io_uring_setup(AioContext *ctx);
> > +void fdmon_io_uring_setup(AioContext *ctx, Error **errp);
> 
> Why change the return type to void?  That forces you to have to check
> errp.  If you still return bool (true for errp unchanged, false when
> errp set), callers might have a simpler interface.

QEMU has both forms. I prefer void foo(Error **errp) because it
eliminates these awkward states:
1. Return true with errp set. There is a risk of leaking errp here.
2. Return false with errp NULL. This results in no error message.

Sometimes it is handy to have a return value but I think that void is a
good default return type.

I have CCed Markus in case he has suggestions.

> 
> >  void fdmon_io_uring_destroy(AioContext *ctx);
> > -#else
> > -static inline bool fdmon_io_uring_setup(AioContext *ctx)
> > -{
> > -    return false;
> > -}
> > -
> > -static inline void fdmon_io_uring_destroy(AioContext *ctx)
> > -{
> > -}
> >  #endif /* !CONFIG_LINUX_IO_URING */
> >  
> >  #endif /* AIO_POSIX_H */
> > diff --git a/util/aio-posix.c b/util/aio-posix.c
> > index fa047fc7ad..44b3df61f9 100644
> > --- a/util/aio-posix.c
> > +++ b/util/aio-posix.c
> > @@ -16,6 +16,7 @@
> >  #include "qemu/osdep.h"
> >  #include "block/block.h"
> >  #include "block/thread-pool.h"
> > +#include "qapi/error.h"
> >  #include "qemu/main-loop.h"
> >  #include "qemu/lockcnt.h"
> >  #include "qemu/rcu.h"
> > @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
> >      ctx->epollfd = -1;
> >      ctx->epollfd_tag = NULL;
> >  
> > -    /* Use the fastest fd monitoring implementation if available */
> > -    if (fdmon_io_uring_setup(ctx)) {
> > -        return;
> > +#ifdef CONFIG_LINUX_IO_URING
> > +    {
> > +        static bool need_io_uring;
> > +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
> 
> Really? I thought that was part of why we added ERRP_GUARD, so that
> error_abort is pinned closer to the spot where the error is triggered
> rather than where it was chained.  But your use of errp is a bit
> different than usual here, so you may be correct that it doesn't
> handle error_abort in the way that you want (allowing a graceful
> downgrade to epoll if it is the first failure, but aborting if it is
> the second AioContext that fails).

The ERRP_GUARD() doc comment explains why it behaves this way:

 * Note: &error_abort is not rewritten, because that would move the
 * abort from the place where the error is created to the place where
 * it's propagated.

> 
> > +
> > +        /* io_uring takes precedence because it provides aio_add_sqe() support */
> > +        fdmon_io_uring_setup(ctx, &local_err);
> > +        if (!local_err) {
> > +            /*
> > +             * If one AioContext gets io_uring, then all AioContexts need io_uring
> > +             * so that aio_add_sqe() support is available across all threads.
> > +             */
> > +            need_io_uring = true;
> > +            return;
> > +        }
> > +        if (need_io_uring) {
> > +            error_propagate(errp, local_err);
> > +            return;
> > +        }
> > +
> > +        warn_report_err_once(local_err); /* frees local_err */
> > +        local_err = NULL;
> >      }
> > +#endif /* CONFIG_LINUX_IO_URING */
> >  
> >      fdmon_epoll_setup(ctx);
> >  }
> >  
> >  void aio_context_destroy(AioContext *ctx)
> >  {
> > +#ifdef CONFIG_LINUX_IO_URING
> >      fdmon_io_uring_destroy(ctx);
> > +#endif
> >  
> >      qemu_lockcnt_lock(&ctx->list_lock);
> >      fdmon_epoll_disable(ctx);
> > diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
> > index 2092d08d24..ef1a866a03 100644
> > --- a/util/fdmon-io_uring.c
> > +++ b/util/fdmon-io_uring.c
> > @@ -45,6 +45,7 @@
> >  
> >  #include "qemu/osdep.h"
> >  #include <poll.h>
> > +#include "qapi/error.h"
> >  #include "qemu/rcu_queue.h"
> >  #include "aio-posix.h"
> >  
> > @@ -361,7 +362,7 @@ static const FDMonOps fdmon_io_uring_ops = {
> >      .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
> >  };
> >  
> > -bool fdmon_io_uring_setup(AioContext *ctx)
> > +void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
> >  {
> >      int ret;
> >  
> > @@ -369,15 +370,14 @@ bool fdmon_io_uring_setup(AioContext *ctx)
> >  
> >      ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
> >      if (ret != 0) {
> > -        return false;
> > +        error_setg_errno(errp, -ret, "Failed to initialize io_uring");
> > +        return;
> 
> This is where I question whether you should still return bool.
> 
> >      }
> >  
> >      QSLIST_INIT(&ctx->submit_list);
> >      ctx->fdmon_ops = &fdmon_io_uring_ops;
> >      ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
> >              ctx->fdmon_io_uring.ring_fd, G_IO_IN);
> > -
> > -    return true;
> >  }
> >  
> >  void fdmon_io_uring_destroy(AioContext *ctx)
> > -- 
> > 2.49.0
> > 
> > 
> 
> -- 
> Eric Blake, Principal Software Engineer
> Red Hat, Inc.
> Virtualization:  qemu.org | libguestfs.org
> 
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Markus Armbruster 5 months, 2 weeks ago
Stefan Hajnoczi <stefanha@redhat.com> writes:

> On Wed, May 28, 2025 at 05:12:14PM -0500, Eric Blake wrote:
>> On Wed, May 28, 2025 at 03:09:13PM -0400, Stefan Hajnoczi wrote:
>> > io_uring may not be available at runtime due to system policies (e.g.
>> > the io_uring_disabled sysctl) or creation could fail due to file
>> > descriptor resource limits.
>> > 
>> > Handle failure scenarios as follows:
>> > 
>> > If another AioContext already has io_uring, then fail AioContext
>> > creation so that the aio_add_sqe() API is available uniformly from all
>> > QEMU threads. Otherwise fall back to epoll(7) if io_uring is
>> > unavailable.
>> > 
>> > Notes:
>> > - Update the comment about selecting the fastest fdmon implementation.
>> >   At this point it's not about speed anymore, it's about aio_add_sqe()
>> >   API availability.
>> > - Uppercase the error message when converting from error_report() to
>> >   error_setg_errno() for consistency (but there are instances of
>> >   lowercase in the codebase).
>> > - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
>> > 
>> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>> > ---
>> >  util/aio-posix.h      | 12 ++----------
>> >  util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
>> >  util/fdmon-io_uring.c |  8 ++++----
>> >  3 files changed, 32 insertions(+), 17 deletions(-)
>> > 
>> > diff --git a/util/aio-posix.h b/util/aio-posix.h
>> > index f9994ed79e..6f9d97d866 100644
>> > --- a/util/aio-posix.h
>> > +++ b/util/aio-posix.h
>> > @@ -18,6 +18,7 @@
>> >  #define AIO_POSIX_H
>> >  
>> >  #include "block/aio.h"
>> > +#include "qapi/error.h"
>> >  
>> >  struct AioHandler {
>> >      GPollFD pfd;
>> > @@ -72,17 +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
>> >  #endif /* !CONFIG_EPOLL_CREATE1 */
>> >  
>> >  #ifdef CONFIG_LINUX_IO_URING
>> > -bool fdmon_io_uring_setup(AioContext *ctx);
>> > +void fdmon_io_uring_setup(AioContext *ctx, Error **errp);
>> 
>> Why change the return type to void?  That forces you to have to check
>> errp.  If you still return bool (true for errp unchanged, false when
>> errp set), callers might have a simpler interface.
>
> QEMU has both forms. I prefer void foo(Error **errp) because it
> eliminates these awkward states:
> 1. Return true with errp set. There is a risk of leaking errp here.
> 2. Return false with errp NULL. This results in no error message.
>
> Sometimes it is handy to have a return value but I think that void is a
> good default return type.

I used to think this way, too.  Experience changed my mind.

The "awkward states" are bugs.

The price to avoid them is more verbose error handling.  Because such
bugs have been rare in practice, the vebosity has turned out to be too
much pain for too little gain.  qapi/error.h's big comment:

 * = Rules =
 [...]
 * - Whenever practical, also return a value that indicates success /
 *   failure.  This can make the error checking more concise, and can
 *   avoid useless error object creation and destruction.  Note that
 *   we still have many functions returning void.  We recommend
 *   • bool-valued functions return true on success / false on failure,
 *   • pointer-valued functions return non-null / null pointer, and
 *   • integer-valued functions return non-negative / negative.

For what it's worth, this is exactly how GError wants to be used.  We
deviated from it because we thought we were smarter, and came to regret
it.

Further down, the big comment shows example code:

 * Call a function, receive an error from it, and pass it to the caller
 * - when the function returns a value that indicates failure, say
 *   false:
 *     if (!foo(arg, errp)) {
 *         handle the error...
 *     }
 * - when it does not, say because it is a void function:
 *     ERRP_GUARD();
 *     foo(arg, errp);
 *     if (*errp) {
 *         handle the error...
 *     }
 * More on ERRP_GUARD() below.
 *
 * Code predating ERRP_GUARD() still exists, and looks like this:
 *     Error *err = NULL;
 *     foo(arg, &err);
 *     if (err) {
 *         handle the error...
 *         error_propagate(errp, err); // deprecated
 *     }
 * Avoid in new code.  Do *not* "optimize" it to
 *     foo(arg, errp);
 *     if (*errp) { // WRONG!
 *         handle the error...
 *     }
 * because errp may be NULL without the ERRP_GUARD() guard.

In case you think the difference in readability isn't all that big:
error handling is *everywhere*, and any clutter adds up quickly,
obscuring the logic.  Every line counts.

> I have CCed Markus in case he has suggestions.

Thanks for that!

>> >  void fdmon_io_uring_destroy(AioContext *ctx);
>> > -#else
>> > -static inline bool fdmon_io_uring_setup(AioContext *ctx)
>> > -{
>> > -    return false;
>> > -}
>> > -
>> > -static inline void fdmon_io_uring_destroy(AioContext *ctx)
>> > -{
>> > -}
>> >  #endif /* !CONFIG_LINUX_IO_URING */
>> >  
>> >  #endif /* AIO_POSIX_H */
>> > diff --git a/util/aio-posix.c b/util/aio-posix.c
>> > index fa047fc7ad..44b3df61f9 100644
>> > --- a/util/aio-posix.c
>> > +++ b/util/aio-posix.c
>> > @@ -16,6 +16,7 @@
>> >  #include "qemu/osdep.h"
>> >  #include "block/block.h"
>> >  #include "block/thread-pool.h"
>> > +#include "qapi/error.h"
>> >  #include "qemu/main-loop.h"
>> >  #include "qemu/lockcnt.h"
>> >  #include "qemu/rcu.h"
>> > @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
>> >      ctx->epollfd = -1;
>> >      ctx->epollfd_tag = NULL;
>> >  
>> > -    /* Use the fastest fd monitoring implementation if available */
>> > -    if (fdmon_io_uring_setup(ctx)) {
>> > -        return;
>> > +#ifdef CONFIG_LINUX_IO_URING
>> > +    {
>> > +        static bool need_io_uring;
>> > +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
>> 
>> Really? I thought that was part of why we added ERRP_GUARD, so that
>> error_abort is pinned closer to the spot where the error is triggered
>> rather than where it was chained.  But your use of errp is a bit
>> different than usual here, so you may be correct that it doesn't
>> handle error_abort in the way that you want (allowing a graceful
>> downgrade to epoll if it is the first failure, but aborting if it is
>> the second AioContext that fails).
>
> The ERRP_GUARD() doc comment explains why it behaves this way:
>
>  * Note: &error_abort is not rewritten, because that would move the
>  * abort from the place where the error is created to the place where
>  * it's propagated.

Error propagation should be avoided when possible.  It's not possible
here; more on that below.

Why avoid?  Two reasons.  One, it degrades stack backtraces, as Eric
pointed out.  Two, passing @errp directly is more obvious and less
verbose, as we've seen above.

>> > +
>> > +        /* io_uring takes precedence because it provides aio_add_sqe() support */
>> > +        fdmon_io_uring_setup(ctx, &local_err);
>> > +        if (!local_err) {
>> > +            /*
>> > +             * If one AioContext gets io_uring, then all AioContexts need io_uring
>> > +             * so that aio_add_sqe() support is available across all threads.
>> > +             */
>> > +            need_io_uring = true;
>> > +            return;
>> > +        }
>> > +        if (need_io_uring) {
>> > +            error_propagate(errp, local_err);
>> > +            return;
>> > +        }
>> > +
>> > +        warn_report_err_once(local_err); /* frees local_err */
>> > +        local_err = NULL;

This is why we can't avoid error_propagate() here: when
fdmon_io_uring_setup() fails, we either consume the error and succeed,
or pass it to the caller and fail.

Because of the former, passing @errp to fdmon_io_uring_setup() would be
wrong; we need to pass a &local_err.  Which we then need to propagate to
do the latter.

Similar code exists elsewhere.  It's fairly rare, though.

ERRP_GUARD() is not designed for this pattern.  We have to propragate
manually.

I'd drop the /* ERRP_GUARD() doesn't handle error_abort */ comment.  To
make sense of it, I believe you need to understand a lot more.  And if
you do, you don't really need the comment.

>> >      }
>> > +#endif /* CONFIG_LINUX_IO_URING */
>> >  
>> >      fdmon_epoll_setup(ctx);
>> >  }
>> >  
>> >  void aio_context_destroy(AioContext *ctx)
>> >  {
>> > +#ifdef CONFIG_LINUX_IO_URING
>> >      fdmon_io_uring_destroy(ctx);
>> > +#endif
>> >  
>> >      qemu_lockcnt_lock(&ctx->list_lock);
>> >      fdmon_epoll_disable(ctx);

[...]
Re: [RFC 08/11] aio-posix: gracefully handle io_uring_queue_init() failure
Posted by Stefan Hajnoczi 5 months, 2 weeks ago
On Tue, Jun 03, 2025 at 08:05:58AM +0200, Markus Armbruster wrote:
> Stefan Hajnoczi <stefanha@redhat.com> writes:
> 
> > On Wed, May 28, 2025 at 05:12:14PM -0500, Eric Blake wrote:
> >> On Wed, May 28, 2025 at 03:09:13PM -0400, Stefan Hajnoczi wrote:
> >> > io_uring may not be available at runtime due to system policies (e.g.
> >> > the io_uring_disabled sysctl) or creation could fail due to file
> >> > descriptor resource limits.
> >> > 
> >> > Handle failure scenarios as follows:
> >> > 
> >> > If another AioContext already has io_uring, then fail AioContext
> >> > creation so that the aio_add_sqe() API is available uniformly from all
> >> > QEMU threads. Otherwise fall back to epoll(7) if io_uring is
> >> > unavailable.
> >> > 
> >> > Notes:
> >> > - Update the comment about selecting the fastest fdmon implementation.
> >> >   At this point it's not about speed anymore, it's about aio_add_sqe()
> >> >   API availability.
> >> > - Uppercase the error message when converting from error_report() to
> >> >   error_setg_errno() for consistency (but there are instances of
> >> >   lowercase in the codebase).
> >> > - It's easier to move the #ifdefs from aio-posix.h to aio-posix.c.
> >> > 
> >> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> >> > ---
> >> >  util/aio-posix.h      | 12 ++----------
> >> >  util/aio-posix.c      | 29 ++++++++++++++++++++++++++---
> >> >  util/fdmon-io_uring.c |  8 ++++----
> >> >  3 files changed, 32 insertions(+), 17 deletions(-)
> >> > 
> >> > diff --git a/util/aio-posix.h b/util/aio-posix.h
> >> > index f9994ed79e..6f9d97d866 100644
> >> > --- a/util/aio-posix.h
> >> > +++ b/util/aio-posix.h
> >> > @@ -18,6 +18,7 @@
> >> >  #define AIO_POSIX_H
> >> >  
> >> >  #include "block/aio.h"
> >> > +#include "qapi/error.h"
> >> >  
> >> >  struct AioHandler {
> >> >      GPollFD pfd;
> >> > @@ -72,17 +73,8 @@ static inline void fdmon_epoll_disable(AioContext *ctx)
> >> >  #endif /* !CONFIG_EPOLL_CREATE1 */
> >> >  
> >> >  #ifdef CONFIG_LINUX_IO_URING
> >> > -bool fdmon_io_uring_setup(AioContext *ctx);
> >> > +void fdmon_io_uring_setup(AioContext *ctx, Error **errp);
> >> 
> >> Why change the return type to void?  That forces you to have to check
> >> errp.  If you still return bool (true for errp unchanged, false when
> >> errp set), callers might have a simpler interface.
> >
> > QEMU has both forms. I prefer void foo(Error **errp) because it
> > eliminates these awkward states:
> > 1. Return true with errp set. There is a risk of leaking errp here.
> > 2. Return false with errp NULL. This results in no error message.
> >
> > Sometimes it is handy to have a return value but I think that void is a
> > good default return type.
> 
> I used to think this way, too.  Experience changed my mind.
> 
> The "awkward states" are bugs.
> 
> The price to avoid them is more verbose error handling.  Because such
> bugs have been rare in practice, the vebosity has turned out to be too
> much pain for too little gain.  qapi/error.h's big comment:
> 
>  * = Rules =
>  [...]
>  * - Whenever practical, also return a value that indicates success /
>  *   failure.  This can make the error checking more concise, and can
>  *   avoid useless error object creation and destruction.  Note that
>  *   we still have many functions returning void.  We recommend
>  *   • bool-valued functions return true on success / false on failure,
>  *   • pointer-valued functions return non-null / null pointer, and
>  *   • integer-valued functions return non-negative / negative.
> 
> For what it's worth, this is exactly how GError wants to be used.  We
> deviated from it because we thought we were smarter, and came to regret
> it.
> 
> Further down, the big comment shows example code:
> 
>  * Call a function, receive an error from it, and pass it to the caller
>  * - when the function returns a value that indicates failure, say
>  *   false:
>  *     if (!foo(arg, errp)) {
>  *         handle the error...
>  *     }
>  * - when it does not, say because it is a void function:
>  *     ERRP_GUARD();
>  *     foo(arg, errp);
>  *     if (*errp) {
>  *         handle the error...
>  *     }
>  * More on ERRP_GUARD() below.
>  *
>  * Code predating ERRP_GUARD() still exists, and looks like this:
>  *     Error *err = NULL;
>  *     foo(arg, &err);
>  *     if (err) {
>  *         handle the error...
>  *         error_propagate(errp, err); // deprecated
>  *     }
>  * Avoid in new code.  Do *not* "optimize" it to
>  *     foo(arg, errp);
>  *     if (*errp) { // WRONG!
>  *         handle the error...
>  *     }
>  * because errp may be NULL without the ERRP_GUARD() guard.
> 
> In case you think the difference in readability isn't all that big:
> error handling is *everywhere*, and any clutter adds up quickly,
> obscuring the logic.  Every line counts.

Thank you for the pointers! I missed that this was already documented.

I will switch to a bool return type.

> 
> > I have CCed Markus in case he has suggestions.
> 
> Thanks for that!
> 
> >> >  void fdmon_io_uring_destroy(AioContext *ctx);
> >> > -#else
> >> > -static inline bool fdmon_io_uring_setup(AioContext *ctx)
> >> > -{
> >> > -    return false;
> >> > -}
> >> > -
> >> > -static inline void fdmon_io_uring_destroy(AioContext *ctx)
> >> > -{
> >> > -}
> >> >  #endif /* !CONFIG_LINUX_IO_URING */
> >> >  
> >> >  #endif /* AIO_POSIX_H */
> >> > diff --git a/util/aio-posix.c b/util/aio-posix.c
> >> > index fa047fc7ad..44b3df61f9 100644
> >> > --- a/util/aio-posix.c
> >> > +++ b/util/aio-posix.c
> >> > @@ -16,6 +16,7 @@
> >> >  #include "qemu/osdep.h"
> >> >  #include "block/block.h"
> >> >  #include "block/thread-pool.h"
> >> > +#include "qapi/error.h"
> >> >  #include "qemu/main-loop.h"
> >> >  #include "qemu/lockcnt.h"
> >> >  #include "qemu/rcu.h"
> >> > @@ -717,17 +718,39 @@ void aio_context_setup(AioContext *ctx, Error **errp)
> >> >      ctx->epollfd = -1;
> >> >      ctx->epollfd_tag = NULL;
> >> >  
> >> > -    /* Use the fastest fd monitoring implementation if available */
> >> > -    if (fdmon_io_uring_setup(ctx)) {
> >> > -        return;
> >> > +#ifdef CONFIG_LINUX_IO_URING
> >> > +    {
> >> > +        static bool need_io_uring;
> >> > +        Error *local_err = NULL; /* ERRP_GUARD() doesn't handle error_abort */
> >> 
> >> Really? I thought that was part of why we added ERRP_GUARD, so that
> >> error_abort is pinned closer to the spot where the error is triggered
> >> rather than where it was chained.  But your use of errp is a bit
> >> different than usual here, so you may be correct that it doesn't
> >> handle error_abort in the way that you want (allowing a graceful
> >> downgrade to epoll if it is the first failure, but aborting if it is
> >> the second AioContext that fails).
> >
> > The ERRP_GUARD() doc comment explains why it behaves this way:
> >
> >  * Note: &error_abort is not rewritten, because that would move the
> >  * abort from the place where the error is created to the place where
> >  * it's propagated.
> 
> Error propagation should be avoided when possible.  It's not possible
> here; more on that below.
> 
> Why avoid?  Two reasons.  One, it degrades stack backtraces, as Eric
> pointed out.  Two, passing @errp directly is more obvious and less
> verbose, as we've seen above.
> 
> >> > +
> >> > +        /* io_uring takes precedence because it provides aio_add_sqe() support */
> >> > +        fdmon_io_uring_setup(ctx, &local_err);
> >> > +        if (!local_err) {
> >> > +            /*
> >> > +             * If one AioContext gets io_uring, then all AioContexts need io_uring
> >> > +             * so that aio_add_sqe() support is available across all threads.
> >> > +             */
> >> > +            need_io_uring = true;
> >> > +            return;
> >> > +        }
> >> > +        if (need_io_uring) {
> >> > +            error_propagate(errp, local_err);
> >> > +            return;
> >> > +        }
> >> > +
> >> > +        warn_report_err_once(local_err); /* frees local_err */
> >> > +        local_err = NULL;
> 
> This is why we can't avoid error_propagate() here: when
> fdmon_io_uring_setup() fails, we either consume the error and succeed,
> or pass it to the caller and fail.
> 
> Because of the former, passing @errp to fdmon_io_uring_setup() would be
> wrong; we need to pass a &local_err.  Which we then need to propagate to
> do the latter.
> 
> Similar code exists elsewhere.  It's fairly rare, though.
> 
> ERRP_GUARD() is not designed for this pattern.  We have to propragate
> manually.
> 
> I'd drop the /* ERRP_GUARD() doesn't handle error_abort */ comment.  To
> make sense of it, I believe you need to understand a lot more.  And if
> you do, you don't really need the comment.

Will fix.

> 
> >> >      }
> >> > +#endif /* CONFIG_LINUX_IO_URING */
> >> >  
> >> >      fdmon_epoll_setup(ctx);
> >> >  }
> >> >  
> >> >  void aio_context_destroy(AioContext *ctx)
> >> >  {
> >> > +#ifdef CONFIG_LINUX_IO_URING
> >> >      fdmon_io_uring_destroy(ctx);
> >> > +#endif
> >> >  
> >> >      qemu_lockcnt_lock(&ctx->list_lock);
> >> >      fdmon_epoll_disable(ctx);
> 
> [...]
>