[PATCH v2 09/11] rust/block: Add read support for block drivers

Kevin Wolf posted 11 patches 1 month, 2 weeks ago
[PATCH v2 09/11] rust/block: Add read support for block drivers
Posted by Kevin Wolf 1 month, 2 weeks ago
This adds a map() function to the BlockDriver trait and makes use of it
to implement reading from an image.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 rust/block/src/driver.rs | 95 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/rust/block/src/driver.rs b/rust/block/src/driver.rs
index fe19f4b88f..022d50ffbc 100644
--- a/rust/block/src/driver.rs
+++ b/rust/block/src/driver.rs
@@ -9,10 +9,45 @@
 
 use crate::{IoBuffer, SizedIoBuffer};
 use qemu_api::bindings;
+use qemu_api::errno::Errno;
+use qemu_api::futures::qemu_co_run_future;
+use std::cmp::min;
 use std::ffi::c_void;
 use std::io::{self, Error, ErrorKind};
 use std::mem::MaybeUninit;
 use std::ptr;
+use std::sync::Arc;
+
+/// A request to a block driver
+pub enum Request {
+    Read { offset: u64, len: u64 },
+}
+
+/// The target for a number of guest blocks, e.g. a location in a child node or the information
+/// that the described blocks are unmapped.
+pub enum MappingTarget {
+    /// The described blocks are unallocated. Reading from them yields zeros.
+    Unmapped,
+
+    /// The described blocks are stored in a child node.
+    Data {
+        /// Child node in which the data is stored
+        node: Arc<BdrvChild>,
+
+        /// Offset in the child node at which the data is stored
+        offset: u64,
+    },
+}
+
+/// A mapping for a number of contiguous guest blocks
+pub struct Mapping {
+    /// Offset of the mapped blocks from the perspective of the guest
+    pub offset: u64,
+    /// Length of the mapping in bytes
+    pub len: u64,
+    /// Where the data for the described blocks is stored
+    pub target: MappingTarget,
+}
 
 /// A trait for writing block drivers.
 ///
@@ -37,6 +72,11 @@ unsafe fn open(
 
     /// Returns the size of the image in bytes
     fn size(&self) -> u64;
+
+    /// Returns the mapping for the first part of `req`. If the returned mapping is shorter than
+    /// the request, the function can be called again with a shortened request to get the mapping
+    /// for the remaining part.
+    async fn map(&self, req: &Request) -> io::Result<Mapping>;
 }
 
 /// Represents the connection between a parent and its child node.
@@ -166,6 +206,60 @@ pub async fn read_uninit<T: SizedIoBuffer>(
     }
 }
 
+#[doc(hidden)]
+pub unsafe extern "C" fn bdrv_co_preadv_part<D: BlockDriver>(
+    bs: *mut bindings::BlockDriverState,
+    offset: i64,
+    bytes: i64,
+    qiov: *mut bindings::QEMUIOVector,
+    mut qiov_offset: usize,
+    flags: bindings::BdrvRequestFlags,
+) -> std::os::raw::c_int {
+    let s = unsafe { &mut *((*bs).opaque as *mut D) };
+
+    let mut offset = offset as u64;
+    let mut bytes = bytes as u64;
+
+    while bytes > 0 {
+        let req = Request::Read { offset, len: bytes };
+        let mapping = match qemu_co_run_future(s.map(&req)) {
+            Ok(mapping) => mapping,
+            Err(e) => return -i32::from(Errno::from(e).0),
+        };
+
+        let mapping_offset = offset - mapping.offset;
+        let cur_bytes = min(bytes, mapping.len - mapping_offset);
+
+        match mapping.target {
+            MappingTarget::Unmapped => unsafe {
+                bindings::qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes.try_into().unwrap());
+            },
+            MappingTarget::Data {
+                node,
+                offset: target_offset,
+            } => unsafe {
+                let ret = bindings::bdrv_co_preadv_part(
+                    node.child,
+                    (target_offset + mapping_offset) as i64,
+                    cur_bytes as i64,
+                    qiov,
+                    qiov_offset,
+                    flags,
+                );
+                if ret < 0 {
+                    return ret;
+                }
+            },
+        }
+
+        offset += cur_bytes;
+        qiov_offset += cur_bytes as usize;
+        bytes -= cur_bytes;
+    }
+
+    0
+}
+
 /// Declare a format block driver. This macro is meant to be used at the top level.
 ///
 /// `typ` is a type implementing the [`BlockDriver`] trait to handle the image format with the
@@ -179,6 +273,7 @@ macro_rules! block_driver {
                     instance_size: ::std::mem::size_of::<$typ>() as i32,
                     bdrv_open: Some($crate::driver::bdrv_open::<$typ>),
                     bdrv_close: Some($crate::driver::bdrv_close::<$typ>),
+                    bdrv_co_preadv_part: Some($crate::driver::bdrv_co_preadv_part::<$typ>),
                     bdrv_child_perm: Some(::qemu_api::bindings::bdrv_default_perms),
                     is_format: true,
                     ..::qemu_api::zeroable::Zeroable::ZERO
-- 
2.48.1
Re: [PATCH v2 09/11] rust/block: Add read support for block drivers
Posted by Stefan Hajnoczi 4 weeks, 1 day ago
On Tue, Feb 18, 2025 at 07:20:17PM +0100, Kevin Wolf wrote:
> +/// A request to a block driver
> +pub enum Request {
> +    Read { offset: u64, len: u64 },
> +}
> +
> +/// The target for a number of guest blocks, e.g. a location in a child node or the information
> +/// that the described blocks are unmapped.
> +pub enum MappingTarget {
> +    /// The described blocks are unallocated. Reading from them yields zeros.
> +    Unmapped,
> +
> +    /// The described blocks are stored in a child node.
> +    Data {
> +        /// Child node in which the data is stored
> +        node: Arc<BdrvChild>,
> +
> +        /// Offset in the child node at which the data is stored
> +        offset: u64,
> +    },
> +}
> +
> +/// A mapping for a number of contiguous guest blocks
> +pub struct Mapping {
> +    /// Offset of the mapped blocks from the perspective of the guest
> +    pub offset: u64,
> +    /// Length of the mapping in bytes
> +    pub len: u64,
> +    /// Where the data for the described blocks is stored
> +    pub target: MappingTarget,
> +}
>  
>  /// A trait for writing block drivers.
>  ///
> @@ -37,6 +72,11 @@ unsafe fn open(
>  
>      /// Returns the size of the image in bytes
>      fn size(&self) -> u64;
> +
> +    /// Returns the mapping for the first part of `req`. If the returned mapping is shorter than
> +    /// the request, the function can be called again with a shortened request to get the mapping
> +    /// for the remaining part.
> +    async fn map(&self, req: &Request) -> io::Result<Mapping>;

Here are my thoughts about how the interface could evolve as more
functionality is added (beyond the scope of this patch). I'm sure you
have your own ideas that aren't in this patch series, so take it for
what it's worth.

The main point:
The current interface has Request, which tells the BlockDriver that this
is a Read. I think this interface is still based on I/O requests rather
than being about mappings. The caller should handle
read/write/discard/etc request logic themselves based on the mapping
information from the BlockDriver. Then the BlockDriver just worries
about mapping blocks rather than how to treat different types of
requests.

A sketch of the interface:
The interface consists of fetch(), alloc(), and update().

1. Fetch mappings located around a given range:
   fetch(offset, len) -> [Mapping]

   The returned array of mappings must cover <offset, len>, but it can
   also be larger. For example, it could return the entire extent that
   includes <offset, len>. Or it could even contain the <offset, len>
   mapping along with a bunch of other mappings.

   If the BlockDriver pays the cost of a metadata read operation to load
   mappings, then let's report all mappings even if they precede or
   follow the <offset, len> range. The exact amount of extra mappings
   reported depends on the BlockDriver implementation and the metadata
   layout, but in qcow2 it might be an L2 table, for example. The idea
   is that if the BlockDriver volunteers extra mapping information, then
   the caller can perform future operations without calling into the
   BlockDriver again.

   If it turns out that certain BlockDriver implementations don't
   benefit, that's okay, but let's include the flexibility to volunteer
   more mapping information than just <offset, len> in the interface.

2. Allocate new unmapped blocks:
   alloc(num_blocks) -> Result<(BdrvChild, Offset)>

   (A choice needs to be made whether the blocks are contiguous or not,
   I'm showing the contiguous function prototype here because it's
   simpler, but it could also support non-contiguous allocations.)

   These blocks are not mapped yet and the caller is expected to write
   to them eventually, populating them with data.

   I'm assuming the qcow2 crash consistency model where leaking clusters
   is acceptable here. Not sure whether this API makes sense for all
   other image formats, but please bear with me for the final part of
   the interface where everything fits together.

3. Update mapping:
   update(offset, len, MappingTarget) -> Result<()>

   This adds or changes mappings. It's the most complicated function and
   there are a lot of details about which inputs should be
   allowed/forbidden (e.g. creating completely new mappings, splitting
   existing mappings, toggling MappingTarget on an existing mapping).
   Perhaps in practice only a few valid combinations of inputs will be
   allowed rather than full ability to manipulate mappings.

   Anyway, here is how it can be used:
   - Allocating write. After blocks allocated with alloc() have been
     written with data, call update() to establish a mapping.
   - Discard. Change the MappingTarget of an existing mapping to unmap
     blocks. This frees blocks unless they should be anchored (aka
     pre-allocated).
   - Write to anchored blocks. If the mapping had blocks allocated but
     they were not mapped, then update() should be called after data has
     been written to the blocks to restore the mapping from anchored to
     normal mapped blocks.

The overall idea is that I/O requests are not part of the interface,
only mappings. The caller decides how to perform I/O requests and calls
the appropriate combination of fetch(), alloc(), and update() APIs to
ensure the necessary mapping state is present.

It will probably be necessary to add additional mapping information
indicating permissions, compression/encryption info, etc so that the
caller can process I/O requests rather than sending them into the
BlockDriver.

Stefan
Re: [PATCH v2 09/11] rust/block: Add read support for block drivers
Posted by Stefan Hajnoczi 1 month ago
On Tue, Feb 18, 2025 at 07:20:17PM +0100, Kevin Wolf wrote:
> This adds a map() function to the BlockDriver trait and makes use of it
> to implement reading from an image.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  rust/block/src/driver.rs | 95 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 95 insertions(+)
> 
> diff --git a/rust/block/src/driver.rs b/rust/block/src/driver.rs
> index fe19f4b88f..022d50ffbc 100644
> --- a/rust/block/src/driver.rs
> +++ b/rust/block/src/driver.rs
> @@ -9,10 +9,45 @@
>  
>  use crate::{IoBuffer, SizedIoBuffer};
>  use qemu_api::bindings;
> +use qemu_api::errno::Errno;
> +use qemu_api::futures::qemu_co_run_future;
> +use std::cmp::min;
>  use std::ffi::c_void;
>  use std::io::{self, Error, ErrorKind};
>  use std::mem::MaybeUninit;
>  use std::ptr;
> +use std::sync::Arc;
> +
> +/// A request to a block driver
> +pub enum Request {
> +    Read { offset: u64, len: u64 },
> +}
> +
> +/// The target for a number of guest blocks, e.g. a location in a child node or the information
> +/// that the described blocks are unmapped.
> +pub enum MappingTarget {
> +    /// The described blocks are unallocated. Reading from them yields zeros.
> +    Unmapped,
> +
> +    /// The described blocks are stored in a child node.
> +    Data {
> +        /// Child node in which the data is stored
> +        node: Arc<BdrvChild>,
> +
> +        /// Offset in the child node at which the data is stored
> +        offset: u64,
> +    },
> +}
> +
> +/// A mapping for a number of contiguous guest blocks
> +pub struct Mapping {
> +    /// Offset of the mapped blocks from the perspective of the guest
> +    pub offset: u64,
> +    /// Length of the mapping in bytes
> +    pub len: u64,
> +    /// Where the data for the described blocks is stored
> +    pub target: MappingTarget,
> +}
>  
>  /// A trait for writing block drivers.
>  ///
> @@ -37,6 +72,11 @@ unsafe fn open(
>  
>      /// Returns the size of the image in bytes
>      fn size(&self) -> u64;
> +
> +    /// Returns the mapping for the first part of `req`. If the returned mapping is shorter than
> +    /// the request, the function can be called again with a shortened request to get the mapping
> +    /// for the remaining part.
> +    async fn map(&self, req: &Request) -> io::Result<Mapping>;

What are the constraints on the lifetime of returned mappings? I don't
mean a Rust lifetimes, but how long a returned mapping remains valid. I
guess at the moment returned mappings stay valid forever (i.e. the
BlockDriver cannot move data once it has been mapped)?

This becomes more interesting once write or discard requests are
supported or truncate() is implemented, but it would be worth spelling
out the lifetime of a mappings from the start and extending that model
later because it's an important assumption.

>  }
>  
>  /// Represents the connection between a parent and its child node.
> @@ -166,6 +206,60 @@ pub async fn read_uninit<T: SizedIoBuffer>(
>      }
>  }
>  
> +#[doc(hidden)]
> +pub unsafe extern "C" fn bdrv_co_preadv_part<D: BlockDriver>(
> +    bs: *mut bindings::BlockDriverState,
> +    offset: i64,
> +    bytes: i64,
> +    qiov: *mut bindings::QEMUIOVector,
> +    mut qiov_offset: usize,
> +    flags: bindings::BdrvRequestFlags,
> +) -> std::os::raw::c_int {
> +    let s = unsafe { &mut *((*bs).opaque as *mut D) };
> +
> +    let mut offset = offset as u64;
> +    let mut bytes = bytes as u64;
> +
> +    while bytes > 0 {
> +        let req = Request::Read { offset, len: bytes };
> +        let mapping = match qemu_co_run_future(s.map(&req)) {
> +            Ok(mapping) => mapping,
> +            Err(e) => return -i32::from(Errno::from(e).0),
> +        };
> +
> +        let mapping_offset = offset - mapping.offset;
> +        let cur_bytes = min(bytes, mapping.len - mapping_offset);
> +
> +        match mapping.target {
> +            MappingTarget::Unmapped => unsafe {
> +                bindings::qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes.try_into().unwrap());
> +            },
> +            MappingTarget::Data {
> +                node,
> +                offset: target_offset,
> +            } => unsafe {
> +                let ret = bindings::bdrv_co_preadv_part(
> +                    node.child,
> +                    (target_offset + mapping_offset) as i64,
> +                    cur_bytes as i64,
> +                    qiov,
> +                    qiov_offset,
> +                    flags,
> +                );
> +                if ret < 0 {
> +                    return ret;
> +                }
> +            },
> +        }
> +
> +        offset += cur_bytes;
> +        qiov_offset += cur_bytes as usize;
> +        bytes -= cur_bytes;
> +    }
> +
> +    0
> +}
> +
>  /// Declare a format block driver. This macro is meant to be used at the top level.
>  ///
>  /// `typ` is a type implementing the [`BlockDriver`] trait to handle the image format with the
> @@ -179,6 +273,7 @@ macro_rules! block_driver {
>                      instance_size: ::std::mem::size_of::<$typ>() as i32,
>                      bdrv_open: Some($crate::driver::bdrv_open::<$typ>),
>                      bdrv_close: Some($crate::driver::bdrv_close::<$typ>),
> +                    bdrv_co_preadv_part: Some($crate::driver::bdrv_co_preadv_part::<$typ>),
>                      bdrv_child_perm: Some(::qemu_api::bindings::bdrv_default_perms),
>                      is_format: true,
>                      ..::qemu_api::zeroable::Zeroable::ZERO
> -- 
> 2.48.1
> 
> 
Re: [PATCH v2 09/11] rust/block: Add read support for block drivers
Posted by Paolo Bonzini 1 month, 1 week ago
On 2/18/25 19:20, Kevin Wolf wrote:
> +    /// The described blocks are stored in a child node.
> +    Data {
> +        /// Child node in which the data is stored
> +        node: Arc<BdrvChild>,

Having Arc<> here shouldn't be necessary, since the BdrvChild is already 
reference counted.  Since the code is called under the bdrv_graph_rdlock 
there's no risk of the BdrvChild going away, and you can just make it a 
&BdrvChild.

Likewise, even BochsImage should not need a standard Rust 
Arc<BdrvChild>.  However you need to add your own block::Arc<BdrvChild> 
and map Clone/Drop to bdrv_ref/bdrv_unref.  Then BochsImage can use 
block::Arc<BdrvChild>; this makes it even clearer that Mapping should 
not use the Arc<> wrapper, because bdrv_ref is GLOBAL_STATE_CODE() and 
would abort if run from a non-main thread.

That said, I'm not sure how to include "block graph lock must be taken" 
into the types, yet.  That has to be taken into account too, sooner or 
later.  You probably have a lot of items like this one so it'd be nice 
to have TODO comments as much as you can.

(This boundary is where you get an unholy mix of C and Rust concepts. 
It takes a while to get used to, and it teaches you a lot of the parts 
of Rust that you usually take for granted.  So while it's not hard, it's 
unusual and it does feel like water and oil in the beginning).

> +) -> std::os::raw::c_int {
> +    let s = unsafe { &mut *((*bs).opaque as *mut D) };

&mut is not safe here (don't worry, we went through the same thing for 
devices :)).  You can only get an & unless you go through an UnsafeCell 
(or something that contains one).  You'll need to split the mutable and 
immutable parts of BochsImage in separate structs, and embed the former 
into the latter.  Long term you there should be a 
qemu_api::coroutine::CoMutex<>, but for the short term you can just use 
a BqlRefCell<> or a standard Rust RefCell<>.  You can see how 
PL011Registers is included into PL011State in 
rust/hw/char/pl011/src/device.rs, and a small intro is also present in 
docs/devel/rust.rst.

Anyway, the BdrvChild needs to remain in BochsImage, so that it is 
accessible outside the CoMutex critical section and can be placed into 
the Mapping.

> +    let mut offset = offset as u64;
> +    let mut bytes = bytes as u64;
> +
> +    while bytes > 0 {
> +        let req = Request::Read { offset, len: bytes };
> +        let mapping = match qemu_co_run_future(s.map(&req)) {
> +            Ok(mapping) => mapping,
> +            Err(e) => return -i32::from(Errno::from(e).0),

This is indeed not great, but it's partly so because you're doing a lot 
(for some definition of "a lot") in the function.  While it would be 
possible to use a trait, I wrote the API thinking of minimal glue code 
that only does the C<->Rust conversion.

In this case, because you have a lot more code than just a call into the 
BlockDriver trait, you'd have something like

fn bdrv_co_preadv_part(
     bs: &dyn BlockDriver,
     offset: i64,
     bytes: i64,
     qiov: &bindings::QEMUIOVector,
     mut qiov_offset: usize,
     flags: bindings::BdrvRequestFlags) -> io::Result<()>

and then a wrapper (e.g. rust_co_preadv_part?) that only does

    let s = unsafe { &mut *((*bs).opaque as *mut D) };
    let qiov = unsafe { &*qiov };
    let result = bdrv_co_preadv_part(s, offset, bytes,
          qiov, qiov_offset, flags);
    errno::into_negative_errno(result)

This by the way has also code size benefits because &dyn, unlike 
generics, does not need to result in duplicated code.

For now, I'd rather keep into_negative_errno() this way, to keep an eye 
on other cases where you have an io::Error<()>.  Since Rust rarely has 
Error objects that aren't part of a Result, it stands to reason that the 
same is true of QEMU code, but if I'm wrong then it can be changed.

Paolo
Re: [PATCH v2 09/11] rust/block: Add read support for block drivers
Posted by Kevin Wolf 1 month, 1 week ago
Am 19.02.2025 um 07:11 hat Paolo Bonzini geschrieben:
> On 2/18/25 19:20, Kevin Wolf wrote:
> > +    /// The described blocks are stored in a child node.
> > +    Data {
> > +        /// Child node in which the data is stored
> > +        node: Arc<BdrvChild>,
> 
> Having Arc<> here shouldn't be necessary, since the BdrvChild is already
> reference counted.  Since the code is called under the bdrv_graph_rdlock
> there's no risk of the BdrvChild going away, and you can just make it a
> &BdrvChild.

That would mean that you need keep the BlockDriver borrowed as long as
you're using the mapping. It would work today, but as soon as I want to
cache mappings, it won't any more.

> Likewise, even BochsImage should not need a standard Rust Arc<BdrvChild>.
> However you need to add your own block::Arc<BdrvChild> and map Clone/Drop to
> bdrv_ref/bdrv_unref.  Then BochsImage can use block::Arc<BdrvChild>; this
> makes it even clearer that Mapping should not use the Arc<> wrapper, because
> bdrv_ref is GLOBAL_STATE_CODE() and would abort if run from a non-main
> thread.

It's not BdrvChild that is refcounted on the C side, but
BlockDriverState. We definitely don't bdrv_ref()/unref() for each
request on the C side and we shouldn't on the Rust side either. The
refcount only changes when you modify the graph.

I'm not entirely sure how your block::Arc<T> is supposed to work. It
would be tied to one specific type (BlockDriverState), not generic.
Which probably means that it can't be a separate pointer type, but
BlockDriverState itself should just implement Clone with bdrv_ref().

Though that doesn't help here, obviously, because we have a BdrvChild.

> That said, I'm not sure how to include "block graph lock must be taken" into
> the types, yet.  That has to be taken into account too, sooner or later.
> You probably have a lot of items like this one so it'd be nice to have TODO
> comments as much as you can.

Actually, I'm not aware of that many items. But yes, there is a TODO
item for the graph lock.

I think I'll have something like:

    pub struct BdrvChild {
        child: GraphLock<*mut bindings::BdrvChild>,
    }

where you can access the inner object either by calling a lock function,
or passing another graph lock guard that you already own. And for the
FFI boundary unsafe functions like "I promise I already own the lock".

> (This boundary is where you get an unholy mix of C and Rust concepts. It
> takes a while to get used to, and it teaches you a lot of the parts of Rust
> that you usually take for granted.  So while it's not hard, it's unusual and
> it does feel like water and oil in the beginning).
> 
> > +) -> std::os::raw::c_int {
> > +    let s = unsafe { &mut *((*bs).opaque as *mut D) };
> 
> &mut is not safe here (don't worry, we went through the same thing for
> devices :)).  You can only get an & unless you go through an UnsafeCell (or
> something that contains one).

Right, we can have multiple requests in flight.

The fix is easy here: Even though bindgen gives us a *mut, we only want
a immutable reference.

> You'll need to split the mutable and immutable parts of BochsImage in
> separate structs, and embed the former into the latter.  Long term you
> there should be a qemu_api::coroutine::CoMutex<>, but for the short
> term you can just use a BqlRefCell<> or a standard Rust RefCell<>.
> You can see how PL011Registers is included into PL011State in
> rust/hw/char/pl011/src/device.rs, and a small intro is also present in
> docs/devel/rust.rst.

There is no mutable part in BochsImage, which makes this easy. The only
thing is the *mut bindings::BdrvChild, but we never dereference that in
Rust. It is also essentially interior mutability protected by the graph
lock, even though this isn't explicit yet.

But if we were to introduce a mutable part (I think we will add write
support to it sooner or later), then BqlRefCell or RefCell are
definitely not right. They would only turn the UB into a safe panic when
you have more than one request in flight. (Or actually, BqlRefCell
should already panic with just one request from an iothread, because we
don't actually hold the BQL.)

> Anyway, the BdrvChild needs to remain in BochsImage, so that it is
> accessible outside the CoMutex critical section and can be placed into
> the Mapping.
> 
> > +    let mut offset = offset as u64;
> > +    let mut bytes = bytes as u64;
> > +
> > +    while bytes > 0 {
> > +        let req = Request::Read { offset, len: bytes };
> > +        let mapping = match qemu_co_run_future(s.map(&req)) {
> > +            Ok(mapping) => mapping,
> > +            Err(e) => return -i32::from(Errno::from(e).0),
> 
> This is indeed not great, but it's partly so because you're doing a
> lot (for some definition of "a lot") in the function.  While it would
> be possible to use a trait, I wrote the API thinking of minimal glue
> code that only does the C<->Rust conversion.
> 
> In this case, because you have a lot more code than just a call into
> the BlockDriver trait, you'd have something like
> 
> fn bdrv_co_preadv_part(
>     bs: &dyn BlockDriver,
>     offset: i64,
>     bytes: i64,
>     qiov: &bindings::QEMUIOVector,
>     mut qiov_offset: usize,
>     flags: bindings::BdrvRequestFlags) -> io::Result<()>
> 
> and then a wrapper (e.g. rust_co_preadv_part?) that only does
> 
>    let s = unsafe { &mut *((*bs).opaque as *mut D) };
>    let qiov = unsafe { &*qiov };
>    let result = bdrv_co_preadv_part(s, offset, bytes,
>          qiov, qiov_offset, flags);
>    errno::into_negative_errno(result)
> 
> This by the way has also code size benefits because &dyn, unlike
> generics, does not need to result in duplicated code.

I don't really like the aesthetics of having two functions on the
Rust side for each C function, but I guess ugliness is expected in
bindings...

For the errno conversion functions, I'm still not sure that they should
only support trivial wrappers with no early returns. I reluctantly buy
your &dyn argument (though in C block drivers this is generally
duplicated code, too), but that's unrelated to error handling.

> For now, I'd rather keep into_negative_errno() this way, to keep an
> eye on other cases where you have an io::Error<()>.  Since Rust rarely
> has Error objects that aren't part of a Result, it stands to reason
> that the same is true of QEMU code, but if I'm wrong then it can be
> changed.

This one is part of a Result, too. But not a result that is directly
returned in both success and error cases, but where the error leads to
an early return. That is, an equivalent for the ubiquitous pattern:

    ret = foo();
    if (ret < 0) {
        return ret;
    }

    /* Do something with the successful result of foo() */

Kevin
Re: [PATCH v2 09/11] rust/block: Add read support for block drivers
Posted by Paolo Bonzini 1 month, 1 week ago
Il mer 19 feb 2025, 14:02 Kevin Wolf <kwolf@redhat.com> ha scritto:

> > Likewise, even BochsImage should not need a standard Rust Arc<BdrvChild>.
> > However you need to add your own block::Arc<BdrvChild> and map
> Clone/Drop to
> > bdrv_ref/bdrv_unref.  Then BochsImage can use block::Arc<BdrvChild>; this
> > makes it even clearer that Mapping should not use the Arc<> wrapper,
> because
> > bdrv_ref is GLOBAL_STATE_CODE() and would abort if run from a non-main
> > thread.
>
> It's not BdrvChild that is refcounted on the C side, but
> BlockDriverState. We definitely don't bdrv_ref()/unref() for each
> request on the C side and we shouldn't on the Rust side either. The
> refcount only changes when you modify the graph.
>

I keep confusing BdrvChild and BlockDriverState, sorry.

Talking in general about Rust and not QEMU, if you wanted to be able to
optionally store the mapping, and not always modify the refcount, you'd
probably want an &Arc<T>. Then you only clone it if needed for the cache
(for example if you cached the *content* rather than the mapping, you
wouldn't need to clone).

However, Arc doesn't work here. In order to globally invalidate all cached
mappings, you would need the cache to store something similar to a Weak<T>.
But even that doesn't quite work, because if you passed an &Arc the caller
could clone it as it wished. So using weak clones to do cache invalidation
shows as the hack that it is.

Going back to QEMU, the Mapping needs some smart pointer (something that
implements Deref<Target = bindings;:BdrvChild>), which does implement some
reference counting unlike the C version but, unlike &Arc<BdrvChild>, cannot
be cloned willy nilly. Instead it can be 1) weakly-cloned under the graph
rdlock 2) used from a weak reference but only for the duration of an rdlock
critical section (no way to make it strong for an arbitrary lifetime!) 3)
invalidated under the graph wrlock.

The nice thing here is that, even if you have a good way to invalidate the
cache, that's orthogonal to how you make the Rust API memory safe. Cache
invalidation becomes just a way to quickly free the BdrvChild—the invalid
entry would be detected anyway later when trying to use the weak reference.

More practical/immediate suggestion below...

I'm not entirely sure how your block::Arc<T> is supposed to work. It
> would be tied to one specific type (BlockDriverState), not generic.
> Which probably means that it can't be a separate pointer type, but
> BlockDriverState itself should just implement Clone with bdrv_ref().
>

You're right, I always forget the new BdrvChild world.

> That said, I'm not sure how to include "block graph lock must be taken"
> into
> > the types, yet.  That has to be taken into account too, sooner or later.
> > You probably have a lot of items like this one so it'd be nice to have
> TODO
> > comments as much as you can.
>
> Actually, I'm not aware of that many items.

But yes, there is a TODO
> item for the graph lock.
>
> I think I'll have something like:
>
>     pub struct BdrvChild {
>         child: GraphLock<*mut bindings::BdrvChild>,
>     }
>

Arc<BdrvChild> poses another problem then, in that graph changes would
invalidate the raw pointer even if the Arc is still alive. Something like
the aforementioned smart pointer would prevent the cache from accessing a
dead pointer (at the cost of adding a refcount field to BdrvChild that C
doesn't use).

But for now maybe you can just rename the *mut-wrapping BdrvChild to
BdrvChildRef, get rid of Arc, and store an &BdrvChildRef into Mapping? Then
if you ever decide to go with the refcounting plan you can implement Deref.

> > +) -> std::os::raw::c_int {
> > > +    let s = unsafe { &mut *((*bs).opaque as *mut D) };
> >
> > &mut is not safe here (don't worry, we went through the same thing for
> > devices :)).  You can only get an & unless you go through an UnsafeCell
> (or
> > something that contains one).
>
> Right, we can have multiple requests in flight.
> The fix is easy here: Even though bindgen gives us a *mut, we only want
> a immutable reference.
>

Right.

There is no mutable part in BochsImage, which makes this easy. [...] But if
> we were to introduce a mutable part (I think we will add write

support to it sooner or later), then BqlRefCell or RefCell are
> definitely not right. They would only turn the UB into a safe panic when
> you have more than one request in flight. (Or actually, BqlRefCell
> should already panic with just one request from an iothread, because we
> don't actually hold the BQL.)
>

Yes, I mentioned RefCell because of the iothread case but I agree it also
isn't right. It wouldn't panic when you have more than one request in
flight however, as long as only map() needs to borrow_mut(). If instead you
need slightly more complex locking, for example similar to the vbox driver,
you need CoMutex/CoRwLock bindings.

> > +    let mut offset = offset as u64;
> > > +    let mut bytes = bytes as u64;
> > > +
> > > +    while bytes > 0 {
> > > +        let req = Request::Read { offset, len: bytes };
> > > +        let mapping = match qemu_co_run_future(s.map(&req)) {
> > > +            Ok(mapping) => mapping,
> > > +            Err(e) => return -i32::from(Errno::from(e).0),
> >
> > This is indeed not great, but it's partly so because you're doing a
> > lot (for some definition of "a lot") in the function.  While it would
> > be possible to use a trait, I wrote the API thinking of minimal glue
> > code that only does the C<->Rust conversion.
> >
> > In this case, because you have a lot more code than just a call into
> > the BlockDriver trait, you'd have something like
> >
> > fn bdrv_co_preadv_part(
> >     bs: &dyn BlockDriver,
> >     offset: i64,
> >     bytes: i64,
> >     qiov: &bindings::QEMUIOVector,
> >     mut qiov_offset: usize,
> >     flags: bindings::BdrvRequestFlags) -> io::Result<()>
> >
> > and then a wrapper (e.g. rust_co_preadv_part?) that only does
> >
> >    let s = unsafe { &mut *((*bs).opaque as *mut D) };
> >    let qiov = unsafe { &*qiov };
> >    let result = bdrv_co_preadv_part(s, offset, bytes,
> >          qiov, qiov_offset, flags);
> >    errno::into_negative_errno(result)
> >
> > This by the way has also code size benefits because &dyn, unlike
> > generics, does not need to result in duplicated code.
>
> I don't really like the aesthetics of having two functions on the
> Rust side for each C function, but I guess ugliness is expected in
> bindings...
>

Well, you don't *have* to have two. In this case I suggested two just
because the C and Rust APIs are completely different. But also...

> For now, I'd rather keep into_negative_errno() this way, to keep an
> > eye on other cases where you have an io::Error<()>.  Since Rust rarely
> > has Error objects that aren't part of a Result, it stands to reason
> > that the same is true of QEMU code, but if I'm wrong then it can be
> > changed.
>
> This one is part of a Result, too. But not a result that is directly
> returned in both success and error cases, but where the error leads to
> an early return. That is, an equivalent for the ubiquitous pattern:
>
>     ret = foo();
>     if (ret < 0) {
>         return ret;
>     }
>

... this is just "?" in Rust and it's the C/Rust boundary that complicates
things, because "?" assumes that you return another Result. So the
two-function idea helps because the inner function can just use "?", and
the outer one qemu_api::errno. It let's you use the language more
effectively. &dyn is just an addition on top.

If needed your inner function could return Result<T, Errno> instead of
Result<T, io::Error>, too. That works nicely because "?" would also convert
io::Error into Errno as needed.

>
But if you don't want the two functions, why wouldn't you just do "e =>
..." in the match statement instead of Err(e)?

Paolo