hw/pci/pci.c | 8 ++++ include/exec/memory.h | 14 +++---- include/hw/pci/pci_device.h | 3 ++ system/memory.c | 5 ++- system/physmem.c | 82 ++++++++++++++++++++++++++----------- 5 files changed, 76 insertions(+), 36 deletions(-)
When DMA memory can't be directly accessed, as is the case when
running the device model in a separate process without shareable DMA
file descriptors, bounce buffering is used.
It is not uncommon for device models to request mapping of several DMA
regions at the same time. Examples include:
* net devices, e.g. when transmitting a packet that is split across
several TX descriptors (observed with igb)
* USB host controllers, when handling a packet with multiple data TRBs
(observed with xhci)
Previously, qemu only provided a single bounce buffer per AddressSpace
and would fail DMA map requests while the buffer was already in use. In
turn, this would cause DMA failures that ultimately manifest as hardware
errors from the guest perspective.
This change allocates DMA bounce buffers dynamically instead of
supporting only a single buffer. Thus, multiple DMA mappings work
correctly also when RAM can't be mmap()-ed.
The total bounce buffer allocation size is limited individually for each
AddressSpace. The default limit is 4096 bytes, matching the previous
maximum buffer size. A new x-max-bounce-buffer-size parameter is
provided to configure the limit for PCI devices.
Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Acked-by: Peter Xu <peterx@redhat.com>
---
This patch is split out from my "Support message-based DMA in vfio-user server"
series. With the series having been partially applied, I'm splitting this one
out as the only remaining patch to system emulation code in the hope to
simplify getting it landed. The code has previously been reviewed by Stefan
Hajnoczi and Peter Xu. This latest version includes changes to switch the
bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in
v9.
---
hw/pci/pci.c | 8 ++++
include/exec/memory.h | 14 +++----
include/hw/pci/pci_device.h | 3 ++
system/memory.c | 5 ++-
system/physmem.c | 82 ++++++++++++++++++++++++++-----------
5 files changed, 76 insertions(+), 36 deletions(-)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index fab86d0567..d2caf3ee8b 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,8 @@ static Property pci_props[] = {
QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+ DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice,
+ max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE),
DEFINE_PROP_END_OF_LIST()
};
@@ -1204,6 +1206,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
"bus master container", UINT64_MAX);
address_space_init(&pci_dev->bus_master_as,
&pci_dev->bus_master_container_region, pci_dev->name);
+ pci_dev->bus_master_as.max_bounce_buffer_size =
+ pci_dev->max_bounce_buffer_size;
if (phase_check(PHASE_MACHINE_READY)) {
pci_init_bus_master(pci_dev);
@@ -2633,6 +2637,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
k->unrealize = pci_qdev_unrealize;
k->bus_type = TYPE_PCI_BUS;
device_class_set_props(k, pci_props);
+ object_class_property_set_description(
+ klass, "x-max-bounce-buffer-size",
+ "Maximum buffer size allocated for bounce buffers used for mapped "
+ "access to indirect DMA memory");
}
static void pci_device_class_base_init(ObjectClass *klass, void *data)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 296fd068c0..e5e865d1a9 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1084,13 +1084,7 @@ typedef struct AddressSpaceMapClient {
QLIST_ENTRY(AddressSpaceMapClient) link;
} AddressSpaceMapClient;
-typedef struct {
- MemoryRegion *mr;
- void *buffer;
- hwaddr addr;
- hwaddr len;
- bool in_use;
-} BounceBuffer;
+#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)
/**
* struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
@@ -1110,8 +1104,10 @@ struct AddressSpace {
QTAILQ_HEAD(, MemoryListener) listeners;
QTAILQ_ENTRY(AddressSpace) address_spaces_link;
- /* Bounce buffer to use for this address space. */
- BounceBuffer bounce;
+ /* Maximum DMA bounce buffer size used for indirect memory map requests */
+ size_t max_bounce_buffer_size;
+ /* Total size of bounce buffers currently allocated, atomically accessed */
+ size_t bounce_buffer_size;
/* List of callbacks to invoke when buffers free up */
QemuMutex map_client_list_lock;
QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index 15694f2489..91df40f989 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -167,6 +167,9 @@ struct PCIDevice {
/* ID of standby device in net_failover pair */
char *failover_pair_id;
uint32_t acpi_index;
+
+ /* Maximum DMA bounce buffer size used for indirect memory map requests */
+ uint32_t max_bounce_buffer_size;
};
static inline int pci_intx(PCIDevice *pci_dev)
diff --git a/system/memory.c b/system/memory.c
index 5e6eb459d5..f6f6fee6d8 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -3148,7 +3148,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
as->ioeventfds = NULL;
QTAILQ_INIT(&as->listeners);
QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
- as->bounce.in_use = false;
+ as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE;
+ as->bounce_buffer_size = 0;
qemu_mutex_init(&as->map_client_list_lock);
QLIST_INIT(&as->map_client_list);
as->name = g_strdup(name ? name : "anonymous");
@@ -3158,7 +3159,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
static void do_address_space_destroy(AddressSpace *as)
{
- assert(!qatomic_read(&as->bounce.in_use));
+ assert(qatomic_read(&as->bounce_buffer_size) == 0);
assert(QLIST_EMPTY(&as->map_client_list));
qemu_mutex_destroy(&as->map_client_list_lock);
diff --git a/system/physmem.c b/system/physmem.c
index 94600a33ec..971bfa0855 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3095,6 +3095,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
NULL, len, FLUSH_CACHE);
}
+/*
+ * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
+ * to detect illegal pointers passed to address_space_unmap.
+ */
+#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
+
+typedef struct {
+ uint64_t magic;
+ MemoryRegion *mr;
+ hwaddr addr;
+ size_t len;
+ uint8_t buffer[];
+} BounceBuffer;
+
static void
address_space_unregister_map_client_do(AddressSpaceMapClient *client)
{
@@ -3120,9 +3134,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
QEMU_LOCK_GUARD(&as->map_client_list_lock);
client->bh = bh;
QLIST_INSERT_HEAD(&as->map_client_list, client, link);
- /* Write map_client_list before reading in_use. */
+ /* Write map_client_list before reading bounce_buffer_size. */
smp_mb();
- if (!qatomic_read(&as->bounce.in_use)) {
+ if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
address_space_notify_map_clients_locked(as);
}
}
@@ -3251,28 +3265,40 @@ void *address_space_map(AddressSpace *as,
mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
if (!memory_access_is_direct(mr, is_write)) {
- if (qatomic_xchg(&as->bounce.in_use, true)) {
+ size_t used = qatomic_read(&as->bounce_buffer_size);
+ for (;;) {
+ hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l);
+ size_t new_size = used + alloc;
+ size_t actual =
+ qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size);
+ if (actual == used) {
+ l = alloc;
+ break;
+ }
+ used = actual;
+ }
+
+ if (l == 0) {
*plen = 0;
return NULL;
}
- /* Avoid unbounded allocations */
- l = MIN(l, TARGET_PAGE_SIZE);
- as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
- as->bounce.addr = addr;
- as->bounce.len = l;
+ BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
+ bounce->magic = BOUNCE_BUFFER_MAGIC;
memory_region_ref(mr);
- as->bounce.mr = mr;
+ bounce->mr = mr;
+ bounce->addr = addr;
+ bounce->len = l;
+
if (!is_write) {
flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
- as->bounce.buffer, l);
+ bounce->buffer, l);
}
*plen = l;
- return as->bounce.buffer;
+ return bounce->buffer;
}
-
memory_region_ref(mr);
*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
l, is_write, attrs);
@@ -3287,12 +3313,11 @@ void *address_space_map(AddressSpace *as,
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
bool is_write, hwaddr access_len)
{
- if (buffer != as->bounce.buffer) {
- MemoryRegion *mr;
- ram_addr_t addr1;
+ MemoryRegion *mr;
+ ram_addr_t addr1;
- mr = memory_region_from_host(buffer, &addr1);
- assert(mr != NULL);
+ mr = memory_region_from_host(buffer, &addr1);
+ if (mr != NULL) {
if (is_write) {
invalidate_and_set_dirty(mr, addr1, access_len);
}
@@ -3302,15 +3327,22 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
memory_region_unref(mr);
return;
}
+
+
+ BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
+ assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
+
if (is_write) {
- address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
- as->bounce.buffer, access_len);
- }
- qemu_vfree(as->bounce.buffer);
- as->bounce.buffer = NULL;
- memory_region_unref(as->bounce.mr);
- /* Clear in_use before reading map_client_list. */
- qatomic_set_mb(&as->bounce.in_use, false);
+ address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
+ bounce->buffer, access_len);
+ }
+
+ qatomic_sub(&as->bounce_buffer_size, bounce->len);
+ bounce->magic = ~BOUNCE_BUFFER_MAGIC;
+ memory_region_unref(bounce->mr);
+ g_free(bounce);
+ /* Write bounce_buffer_size before reading map_client_list. */
+ smp_mb();
address_space_notify_map_clients(as);
}
--
2.34.1
19.08.2024 16:54, Mattias Nissler wrote: > When DMA memory can't be directly accessed, as is the case when > running the device model in a separate process without shareable DMA > file descriptors, bounce buffering is used. > > It is not uncommon for device models to request mapping of several DMA > regions at the same time. Examples include: > * net devices, e.g. when transmitting a packet that is split across > several TX descriptors (observed with igb) > * USB host controllers, when handling a packet with multiple data TRBs > (observed with xhci) > > Previously, qemu only provided a single bounce buffer per AddressSpace > and would fail DMA map requests while the buffer was already in use. In > turn, this would cause DMA failures that ultimately manifest as hardware > errors from the guest perspective. > > This change allocates DMA bounce buffers dynamically instead of > supporting only a single buffer. Thus, multiple DMA mappings work > correctly also when RAM can't be mmap()-ed. > > The total bounce buffer allocation size is limited individually for each > AddressSpace. The default limit is 4096 bytes, matching the previous > maximum buffer size. A new x-max-bounce-buffer-size parameter is > provided to configure the limit for PCI devices. So, the issue has now become CVE-2024-8612 (information leak), with this commit (v9.1.0-134-g637b0aa139) being the fix. Should we back-port it to previous stable releases of qemu? (it applies to 9.1 but not to 9.0, and I haven't tested it even in 9.1. If anything it needs some work for 9.0 and before) Thanks, /mjt
On Wed, Sep 25, 2024 at 12:03 PM Michael Tokarev <mjt@tls.msk.ru> wrote: > > 19.08.2024 16:54, Mattias Nissler wrote: > > When DMA memory can't be directly accessed, as is the case when > > running the device model in a separate process without shareable DMA > > file descriptors, bounce buffering is used. > > > > It is not uncommon for device models to request mapping of several DMA > > regions at the same time. Examples include: > > * net devices, e.g. when transmitting a packet that is split across > > several TX descriptors (observed with igb) > > * USB host controllers, when handling a packet with multiple data TRBs > > (observed with xhci) > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > and would fail DMA map requests while the buffer was already in use. In > > turn, this would cause DMA failures that ultimately manifest as hardware > > errors from the guest perspective. > > > > This change allocates DMA bounce buffers dynamically instead of > > supporting only a single buffer. Thus, multiple DMA mappings work > > correctly also when RAM can't be mmap()-ed. > > > > The total bounce buffer allocation size is limited individually for each > > AddressSpace. The default limit is 4096 bytes, matching the previous > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > provided to configure the limit for PCI devices. > > So, the issue has now become CVE-2024-8612 (information leak), with this > commit (v9.1.0-134-g637b0aa139) being the fix. Interesting. IIUC, this is triggered by device implementations calling dma_memory_unmap with an incorrect size parameter as provided by a hostile guest. Shouldn't the device implementations be fixed to validate the parameter as well? Maybe this has already happened? It would seem the more targeted fix to me. > > Should we back-port it to previous stable releases of qemu? > (it applies to 9.1 but not to 9.0, and I haven't tested it even in 9.1. 
> If anything it needs some work for 9.0 and before) FWIW, I've been running with earlier variants of this since at least 8.0.50, so a backport shouldn't be hard. Note that if we decide to backport, we should also include "mac_dbdma: Remove leftover `dma_memory_unmap` calls", which fixes a bug in mac_dbdma uncovered by the concurrent bounce buffers change. > > Thanks, > > /mjt
25.09.2024 13:23, Mattias Nissler wrote: > On Wed, Sep 25, 2024 at 12:03 PM Michael Tokarev <mjt@tls.msk.ru> wrote: .. >> So, the issue has now become CVE-2024-8612 (information leak), with this >> commit (v9.1.0-134-g637b0aa139) being the fix. > > Interesting. IIUC, this is triggered by device implementations calling > dma_memory_unmap with an incorrect size parameter as provided by a > hostile guest. Shouldn't the device implementations be fixed to > validate the parameter as well? Maybe this has already happened? It > would seem the more targeted fix to me. Yes, a similar question occurred to me too, - this change does not look like a proper fix for CVE-2024-8612. And nope, no other changes have been made to fix it properly, in the device implementations. Maybe now with CVE-2024-8612 in place, we can fix the actual problem in the right place, instead of relying on this change.. >> Should we back-port it to previous stable releases of qemu? >> (it applies to 9.1 but not to 9.0, and I haven't tested it even in 9.1. >> If anything it needs some work for 9.0 and before) > > FWIW, I've been running with earlier variants of this since at least > 8.0.50, so a backport shouldn't be hard. Note that if we decide to > backport, we should also include "mac_dbdma: Remove leftover > `dma_memory_unmap` calls", which fixes a bug uncovered in mac_dbdma > uncovered by the concurrent bounce buffers change. So far I picked this and mac_dbdma change for 9.1, and will try to back-port things up to 8.2. But it is better - IMHO - to have a real, more targeted fix for CVE-2024-8612. Thanks, /mjt
On Thu, Sep 26, 2024 at 10:58:57AM +0300, Michael Tokarev wrote: > 25.09.2024 13:23, Mattias Nissler wrote: > > On Wed, Sep 25, 2024 at 12:03 PM Michael Tokarev <mjt@tls.msk.ru> wrote: > .. > > > So, the issue has now become CVE-2024-8612 (information leak), with this > > > commit (v9.1.0-134-g637b0aa139) being the fix. > > > > Interesting. IIUC, this is triggered by device implementations calling > > dma_memory_unmap with an incorrect size parameter as provided by a > > hostile guest. Shouldn't the device implementations be fixed to > > validate the parameter as well? Maybe this has already happened? It > > would seem the more targeted fix to me. > > Yes, a similar question occurred to me too, - this change does not look > like a proper fix for CVE-2024-8612. And nope, no other changes has been > made to fix it properly, in the device implementations. > > Maybe now with CVE-2024-8612 in place, we can fix the actual problem in > the right place, instead of relying on this change.. > > > > Should we back-port it to previous stable releases of qemu? > > > (it applies to 9.1 but not to 9.0, and I haven't tested it even in 9.1. > > > If anything it needs some work for 9.0 and before) > > > > FWIW, I've been running with earlier variants of this since at least > > 8.0.50, so a backport shouldn't be hard. Note that if we decide to > > backport, we should also include "mac_dbdma: Remove leftover > > `dma_memory_unmap` calls", which fixes a bug uncovered in mac_dbdma > > uncovered by the concurrent bounce buffers change. > > So far I picked this and mac_dbdma change for 9.1, and will try to > back-port things up to 8.2. But it is better - IMHO - to have a real, > more targetting, fix for CVE-2024-8612. Agree 100% here. Cc a bunch more people involved. > Thanks, > > /mjt
26.09.2024 11:12, Michael S. Tsirkin wrote: > On Thu, Sep 26, 2024 at 10:58:57AM +0300, Michael Tokarev wrote: >> 25.09.2024 13:23, Mattias Nissler wrote: >>> On Wed, Sep 25, 2024 at 12:03 PM Michael Tokarev <mjt@tls.msk.ru> wrote: >> .. >>>> So, the issue has now become CVE-2024-8612 (information leak), with this >>>> commit (v9.1.0-134-g637b0aa139) being the fix. >>> >>> Interesting. IIUC, this is triggered by device implementations calling >>> dma_memory_unmap with an incorrect size parameter as provided by a >>> hostile guest. Shouldn't the device implementations be fixed to >>> validate the parameter as well? Maybe this has already happened? It >>> would seem the more targeted fix to me. >> >> Yes, a similar question occurred to me too, - this change does not look >> like a proper fix for CVE-2024-8612. And nope, no other changes has been >> made to fix it properly, in the device implementations. >> >> Maybe now with CVE-2024-8612 in place, we can fix the actual problem in >> the right place, instead of relying on this change.. ...>> So far I picked this and mac_dbdma change for 9.1, and will try to >> back-port things up to 8.2. But it is better - IMHO - to have a real, >> more targetting, fix for CVE-2024-8612. > > Agree 100% here. > > Cc a bunch more people involved. a little ping? Thanks, /mjt
On Mon, 19 Aug 2024 at 14:56, Mattias Nissler <mnissler@rivosinc.com> wrote: > > When DMA memory can't be directly accessed, as is the case when > running the device model in a separate process without shareable DMA > file descriptors, bounce buffering is used. > > It is not uncommon for device models to request mapping of several DMA > regions at the same time. Examples include: > * net devices, e.g. when transmitting a packet that is split across > several TX descriptors (observed with igb) > * USB host controllers, when handling a packet with multiple data TRBs > (observed with xhci) > > Previously, qemu only provided a single bounce buffer per AddressSpace > and would fail DMA map requests while the buffer was already in use. In > turn, this would cause DMA failures that ultimately manifest as hardware > errors from the guest perspective. > > This change allocates DMA bounce buffers dynamically instead of > supporting only a single buffer. Thus, multiple DMA mappings work > correctly also when RAM can't be mmap()-ed. > > The total bounce buffer allocation size is limited individually for each > AddressSpace. The default limit is 4096 bytes, matching the previous > maximum buffer size. A new x-max-bounce-buffer-size parameter is > provided to configure the limit for PCI devices. 
> > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > Acked-by: Peter Xu <peterx@redhat.com> > --- > @@ -3251,28 +3265,40 @@ void *address_space_map(AddressSpace *as, > mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); > > if (!memory_access_is_direct(mr, is_write)) { > - if (qatomic_xchg(&as->bounce.in_use, true)) { > + size_t used = qatomic_read(&as->bounce_buffer_size); > + for (;;) { > + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); > + size_t new_size = used + alloc; > + size_t actual = > + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); > + if (actual == used) { > + l = alloc; > + break; > + } > + used = actual; > + } > + > + if (l == 0) { > *plen = 0; > return NULL; > } > - /* Avoid unbounded allocations */ > - l = MIN(l, TARGET_PAGE_SIZE); > - as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); > - as->bounce.addr = addr; > - as->bounce.len = l; > > + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); > + bounce->magic = BOUNCE_BUFFER_MAGIC; > memory_region_ref(mr); > - as->bounce.mr = mr; > + bounce->mr = mr; > + bounce->addr = addr; > + bounce->len = l; > + > if (!is_write) { > flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, > - as->bounce.buffer, l); > + bounce->buffer, l); > } > > *plen = l; > - return as->bounce.buffer; > + return bounce->buffer; Coverity is pretty unhappy about this trick, because it isn't able to recognise that we can figure out the address of 'bounce' from the address of 'bounce->buffer' and free it in the address_space_unmap() code, so it thinks that every use of address_space_map(), pci_dma_map(), etc, is a memory leak. We can mark all those as false positives, of course, but it got me wondering whether maybe we should have this function return a struct that has all the information address_space_unmap() needs rather than relying on it being able to figure it out from the host memory pointer... -- PMM
On Thu, Sep 12, 2024 at 03:27:55PM +0100, Peter Maydell wrote: > On Mon, 19 Aug 2024 at 14:56, Mattias Nissler <mnissler@rivosinc.com> wrote: > > > > When DMA memory can't be directly accessed, as is the case when > > running the device model in a separate process without shareable DMA > > file descriptors, bounce buffering is used. > > > > It is not uncommon for device models to request mapping of several DMA > > regions at the same time. Examples include: > > * net devices, e.g. when transmitting a packet that is split across > > several TX descriptors (observed with igb) > > * USB host controllers, when handling a packet with multiple data TRBs > > (observed with xhci) > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > and would fail DMA map requests while the buffer was already in use. In > > turn, this would cause DMA failures that ultimately manifest as hardware > > errors from the guest perspective. > > > > This change allocates DMA bounce buffers dynamically instead of > > supporting only a single buffer. Thus, multiple DMA mappings work > > correctly also when RAM can't be mmap()-ed. > > > > The total bounce buffer allocation size is limited individually for each > > AddressSpace. The default limit is 4096 bytes, matching the previous > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > provided to configure the limit for PCI devices. 
> > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > Acked-by: Peter Xu <peterx@redhat.com> > > --- > > > @@ -3251,28 +3265,40 @@ void *address_space_map(AddressSpace *as, > > mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); > > > > if (!memory_access_is_direct(mr, is_write)) { > > - if (qatomic_xchg(&as->bounce.in_use, true)) { > > + size_t used = qatomic_read(&as->bounce_buffer_size); > > + for (;;) { > > + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); > > + size_t new_size = used + alloc; > > + size_t actual = > > + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); > > + if (actual == used) { > > + l = alloc; > > + break; > > + } > > + used = actual; > > + } > > + > > + if (l == 0) { > > *plen = 0; > > return NULL; > > } > > - /* Avoid unbounded allocations */ > > - l = MIN(l, TARGET_PAGE_SIZE); > > - as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); > > - as->bounce.addr = addr; > > - as->bounce.len = l; > > > > + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); > > + bounce->magic = BOUNCE_BUFFER_MAGIC; > > memory_region_ref(mr); > > - as->bounce.mr = mr; > > + bounce->mr = mr; > > + bounce->addr = addr; > > + bounce->len = l; > > + > > if (!is_write) { > > flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, > > - as->bounce.buffer, l); > > + bounce->buffer, l); > > } > > > > *plen = l; > > - return as->bounce.buffer; > > + return bounce->buffer; > > Coverity is pretty unhappy about this trick, because it isn't able > to recognise that we can figure out the address of 'bounce' > from the address of 'bounce->buffer' and free it in the > address_space_unmap() code, so it thinks that every use > of address_space_map(), pci_dma_map(), etc, is a memory leak. 
> We can mark all those as false positives, of course, but it got > me wondering whether maybe we should have this function return > a struct that has all the information address_space_unmap() > needs rather than relying on it being able to figure it out > from the host memory pointer... Indeed that sounds like a viable option. Looks like we don't have a lot of address_space_map() users. There might be some challenge on users who cache the results (rather than immediately unmap() after using the buffer in the same function). Still doable but the changeset can spread all over, and also testing is harder if just to get some coverage. Not sure whether Mattias have some more clue when he's working on that. Thanks, -- Peter Xu
On Fri, 13 Sept 2024 at 16:55, Peter Xu <peterx@redhat.com> wrote: > > On Thu, Sep 12, 2024 at 03:27:55PM +0100, Peter Maydell wrote: > > Coverity is pretty unhappy about this trick, because it isn't able > > to recognise that we can figure out the address of 'bounce' > > from the address of 'bounce->buffer' and free it in the > > address_space_unmap() code, so it thinks that every use > > of address_space_map(), pci_dma_map(), etc, is a memory leak. > > We can mark all those as false positives, of course, but it got > > me wondering whether maybe we should have this function return > > a struct that has all the information address_space_unmap() > > needs rather than relying on it being able to figure it out > > from the host memory pointer... > > Indeed that sounds like a viable option. Looks like we don't have a lot of > address_space_map() users. There's quite a few wrappers of it too, so it's a little hard to count. We might want to avoid the memory allocation in the common case by having the caller pass in an ASMapInfo struct to be filled in rather than having address_space_map() allocate-and-return one. -- PMM
On Fri, Sep 13, 2024 at 6:47 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > On Fri, 13 Sept 2024 at 16:55, Peter Xu <peterx@redhat.com> wrote: > > > > On Thu, Sep 12, 2024 at 03:27:55PM +0100, Peter Maydell wrote: > > > Coverity is pretty unhappy about this trick, because it isn't able > > > to recognise that we can figure out the address of 'bounce' > > > from the address of 'bounce->buffer' and free it in the > > > address_space_unmap() code, so it thinks that every use > > > of address_space_map(), pci_dma_map(), etc, is a memory leak. > > > We can mark all those as false positives, of course, but it got > > > me wondering whether maybe we should have this function return > > > a struct that has all the information address_space_unmap() > > > needs rather than relying on it being able to figure it out > > > from the host memory pointer... > > > > Indeed that sounds like a viable option. Looks like we don't have a lot of > > address_space_map() users. > > There's quite a few wrappers of it too, so it's a little hard to count. > We might want to avoid the memory allocation in the common case > by having the caller pass in an ASMapInfo struct to be filled > in rather than having address_space_map() allocate-and-return one. Hm, this would work, but not only does it complicate the code consuming address_space_map, but it also increases memory footprint (a pointer being replaced by a struct of sizeof(BounceBuffer) if done naively), plus there's an additional pointer indirection (I'm doubtful whether this can be optimized away by the compiler). I haven't done any measurements of these effects, so can't say anything definitive, but this seems pretty costly just to appease coverity... Is there no way to inform coverity that a resource pointer is being transmuted into a handle, so it can track that instead? 
Given that pointer tricks like this and container_of usage is quite frequent, I would expect coverity to have a better strategy to handle these rather than suppressing false positive leak reports? If coverity can be made to understand this, it seems we should at least be able to mark the allocation in question as intentional leak (and thus not to report) at the position in the code where we allocate the bounce buffer, rather than having to create false positive suppressions for every caller. Perhaps this would be simplified if we extract out a function to allocate and initialize the bounce buffer, and then tell coverity to assume that anything it returns is not a leak? (My coverity experience is rather limited, just thinking out loud based on what I've seen in other projects to help static analysis tools in general)
On Mon, 16 Sept 2024 at 08:35, Mattias Nissler <mnissler@rivosinc.com> wrote: > > On Fri, Sep 13, 2024 at 6:47 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > > > On Fri, 13 Sept 2024 at 16:55, Peter Xu <peterx@redhat.com> wrote: > > > > > > On Thu, Sep 12, 2024 at 03:27:55PM +0100, Peter Maydell wrote: > > > > Coverity is pretty unhappy about this trick, because it isn't able > > > > to recognise that we can figure out the address of 'bounce' > > > > from the address of 'bounce->buffer' and free it in the > > > > address_space_unmap() code, so it thinks that every use > > > > of address_space_map(), pci_dma_map(), etc, is a memory leak. > > > > We can mark all those as false positives, of course, but it got > > > > me wondering whether maybe we should have this function return > > > > a struct that has all the information address_space_unmap() > > > > needs rather than relying on it being able to figure it out > > > > from the host memory pointer... > > > > > > Indeed that sounds like a viable option. Looks like we don't have a lot of > > > address_space_map() users. > > > > There's quite a few wrappers of it too, so it's a little hard to count. > > We might want to avoid the memory allocation in the common case > > by having the caller pass in an ASMapInfo struct to be filled > > in rather than having address_space_map() allocate-and-return one. > > Hm, this would work, but not only does it complicate the code > consuming address_space_map, but it also increases memory footprint (a > pointer being replaced by a struct of sizeof(BounceBuffer) if done > naively), plus there's an additional pointer indirection (I'm doubtful > whether this can be optimized away by the compiler). I haven't done > any measurements of these effects, so can't say anything definitive, > but this seems pretty costly just to appease coverity... > > Is there no way to inform coverity that a resource pointer is being > transmuted into a handle, so it can track that instead? 
Given that > pointer tricks like this and container_of usage is quite frequent, I > would expect coverity to have a better strategy to handle these rather > than suppressing false positive leak reports? It's not purely that I want to appease Coverity. I also think for human readers that the current trick with passing back a pointer into host memory and relying on being able to get back to either the MR or to the bounce-buffer struct from that is pretty tricky. Would we have designed it that way if we weren't starting with the pre-existing address_space_map() function signature? thanks -- PMM
On Mon, Sep 16, 2024 at 11:05 AM Peter Maydell <peter.maydell@linaro.org> wrote: > > On Mon, 16 Sept 2024 at 08:35, Mattias Nissler <mnissler@rivosinc.com> wrote: > > > > On Fri, Sep 13, 2024 at 6:47 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > > > > > On Fri, 13 Sept 2024 at 16:55, Peter Xu <peterx@redhat.com> wrote: > > > > > > > > On Thu, Sep 12, 2024 at 03:27:55PM +0100, Peter Maydell wrote: > > > > > Coverity is pretty unhappy about this trick, because it isn't able > > > > > to recognise that we can figure out the address of 'bounce' > > > > > from the address of 'bounce->buffer' and free it in the > > > > > address_space_unmap() code, so it thinks that every use > > > > > of address_space_map(), pci_dma_map(), etc, is a memory leak. > > > > > We can mark all those as false positives, of course, but it got > > > > > me wondering whether maybe we should have this function return > > > > > a struct that has all the information address_space_unmap() > > > > > needs rather than relying on it being able to figure it out > > > > > from the host memory pointer... > > > > > > > > Indeed that sounds like a viable option. Looks like we don't have a lot of > > > > address_space_map() users. > > > > > > There's quite a few wrappers of it too, so it's a little hard to count. > > > We might want to avoid the memory allocation in the common case > > > by having the caller pass in an ASMapInfo struct to be filled > > > in rather than having address_space_map() allocate-and-return one. > > > > Hm, this would work, but not only does it complicate the code > > consuming address_space_map, but it also increases memory footprint (a > > pointer being replaced by a struct of sizeof(BounceBuffer) if done > > naively), plus there's an additional pointer indirection (I'm doubtful > > whether this can be optimized away by the compiler). 
I haven't done > > any measurements of these effects, so can't say anything definitive, > > but this seems pretty costly just to appease coverity... > > > > Is there no way to inform coverity that a resource pointer is being > > transmuted into a handle, so it can track that instead? Given that > > pointer tricks like this and container_of usage is quite frequent, I > > would expect coverity to have a better strategy to handle these rather > > than suppressing false positive leak reports? > > It's not purely that I want to appease Coverity. I also > think for human readers that the current trick with passing > back a pointer into host memory and relying on being able to > get back to either the MR or to the bounce-buffer struct > from that is pretty tricky. Would we have designed it that > way if we weren't starting with the pre-existing address_space_map() > function signature? Identifying a mapping by its address seems pretty natural to me for callers of the API, at least as long as the address is all that the caller ever needs. The solution I implemented didn't/doesn't seem overly complicated to me, and I am hoping it wouldn't overwhelm anyone else who has worked with container_of (which seems to be used liberally in qemu). But at the end of the day it's a matter of personal taste and the background of the person writing the code, so I'll readily admit that this is personal opinion, and if qemu does prefer a different style then by all means we should change it and I'll be happy to help with that.
On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > When DMA memory can't be directly accessed, as is the case when > running the device model in a separate process without shareable DMA > file descriptors, bounce buffering is used. > > It is not uncommon for device models to request mapping of several DMA > regions at the same time. Examples include: > * net devices, e.g. when transmitting a packet that is split across > several TX descriptors (observed with igb) > * USB host controllers, when handling a packet with multiple data TRBs > (observed with xhci) > > Previously, qemu only provided a single bounce buffer per AddressSpace > and would fail DMA map requests while the buffer was already in use. In > turn, this would cause DMA failures that ultimately manifest as hardware > errors from the guest perspective. > > This change allocates DMA bounce buffers dynamically instead of > supporting only a single buffer. Thus, multiple DMA mappings work > correctly also when RAM can't be mmap()-ed. > > The total bounce buffer allocation size is limited individually for each > AddressSpace. The default limit is 4096 bytes, matching the previous > maximum buffer size. A new x-max-bounce-buffer-size parameter is > provided to configure the limit for PCI devices. > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > Acked-by: Peter Xu <peterx@redhat.com> > --- > This patch is split out from my "Support message-based DMA in vfio-user server" > series. With the series having been partially applied, I'm splitting this one > out as the only remaining patch to system emulation code in the hope to > simplify getting it landed. The code has previously been reviewed by Stefan > Hajnoczi and Peter Xu. This latest version includes changes to switch the > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > v9. 
> --- > hw/pci/pci.c | 8 ++++ > include/exec/memory.h | 14 +++---- > include/hw/pci/pci_device.h | 3 ++ > system/memory.c | 5 ++- > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > 5 files changed, 76 insertions(+), 36 deletions(-) > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index fab86d0567..d2caf3ee8b 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -85,6 +85,8 @@ static Property pci_props[] = { > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > DEFINE_PROP_END_OF_LIST() > }; > I'm a bit puzzled by now there being two fields named max_bounce_buffer_size, one directly controllable by a property. Pls add code comments explaining how they are related. Also, what is the point of adding a property without making it part of an API? No one will be able to rely on it working. 
> @@ -1204,6 +1206,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, > "bus master container", UINT64_MAX); > address_space_init(&pci_dev->bus_master_as, > &pci_dev->bus_master_container_region, pci_dev->name); > + pci_dev->bus_master_as.max_bounce_buffer_size = > + pci_dev->max_bounce_buffer_size; > > if (phase_check(PHASE_MACHINE_READY)) { > pci_init_bus_master(pci_dev); > @@ -2633,6 +2637,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data) > k->unrealize = pci_qdev_unrealize; > k->bus_type = TYPE_PCI_BUS; > device_class_set_props(k, pci_props); > + object_class_property_set_description( > + klass, "x-max-bounce-buffer-size", > + "Maximum buffer size allocated for bounce buffers used for mapped " > + "access to indirect DMA memory"); > } > > static void pci_device_class_base_init(ObjectClass *klass, void *data) > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 296fd068c0..e5e865d1a9 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -1084,13 +1084,7 @@ typedef struct AddressSpaceMapClient { > QLIST_ENTRY(AddressSpaceMapClient) link; > } AddressSpaceMapClient; > > -typedef struct { > - MemoryRegion *mr; > - void *buffer; > - hwaddr addr; > - hwaddr len; > - bool in_use; > -} BounceBuffer; > +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096) > > /** > * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects > @@ -1110,8 +1104,10 @@ struct AddressSpace { > QTAILQ_HEAD(, MemoryListener) listeners; > QTAILQ_ENTRY(AddressSpace) address_spaces_link; > > - /* Bounce buffer to use for this address space. 
*/ > - BounceBuffer bounce; > + /* Maximum DMA bounce buffer size used for indirect memory map requests */ > + size_t max_bounce_buffer_size; > + /* Total size of bounce buffers currently allocated, atomically accessed */ > + size_t bounce_buffer_size; > /* List of callbacks to invoke when buffers free up */ > QemuMutex map_client_list_lock; > QLIST_HEAD(, AddressSpaceMapClient) map_client_list; > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h > index 15694f2489..91df40f989 100644 > --- a/include/hw/pci/pci_device.h > +++ b/include/hw/pci/pci_device.h > @@ -167,6 +167,9 @@ struct PCIDevice { > /* ID of standby device in net_failover pair */ > char *failover_pair_id; > uint32_t acpi_index; > + > + /* Maximum DMA bounce buffer size used for indirect memory map requests */ > + uint32_t max_bounce_buffer_size; > }; > > static inline int pci_intx(PCIDevice *pci_dev) > diff --git a/system/memory.c b/system/memory.c > index 5e6eb459d5..f6f6fee6d8 100644 > --- a/system/memory.c > +++ b/system/memory.c > @@ -3148,7 +3148,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) > as->ioeventfds = NULL; > QTAILQ_INIT(&as->listeners); > QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link); > - as->bounce.in_use = false; > + as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE; > + as->bounce_buffer_size = 0; > qemu_mutex_init(&as->map_client_list_lock); > QLIST_INIT(&as->map_client_list); > as->name = g_strdup(name ? 
name : "anonymous"); > @@ -3158,7 +3159,7 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) > > static void do_address_space_destroy(AddressSpace *as) > { > - assert(!qatomic_read(&as->bounce.in_use)); > + assert(qatomic_read(&as->bounce_buffer_size) == 0); > assert(QLIST_EMPTY(&as->map_client_list)); > qemu_mutex_destroy(&as->map_client_list_lock); > > diff --git a/system/physmem.c b/system/physmem.c > index 94600a33ec..971bfa0855 100644 > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -3095,6 +3095,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len) > NULL, len, FLUSH_CACHE); > } > > +/* > + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used > + * to detect illegal pointers passed to address_space_unmap. > + */ > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed > + > +typedef struct { > + uint64_t magic; > + MemoryRegion *mr; > + hwaddr addr; > + size_t len; > + uint8_t buffer[]; > +} BounceBuffer; > + > static void > address_space_unregister_map_client_do(AddressSpaceMapClient *client) > { > @@ -3120,9 +3134,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh) > QEMU_LOCK_GUARD(&as->map_client_list_lock); > client->bh = bh; > QLIST_INSERT_HEAD(&as->map_client_list, client, link); > - /* Write map_client_list before reading in_use. */ > + /* Write map_client_list before reading bounce_buffer_size. 
*/ > smp_mb(); > - if (!qatomic_read(&as->bounce.in_use)) { > + if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) { > address_space_notify_map_clients_locked(as); > } > } > @@ -3251,28 +3265,40 @@ void *address_space_map(AddressSpace *as, > mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); > > if (!memory_access_is_direct(mr, is_write)) { > - if (qatomic_xchg(&as->bounce.in_use, true)) { > + size_t used = qatomic_read(&as->bounce_buffer_size); > + for (;;) { > + hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l); > + size_t new_size = used + alloc; > + size_t actual = > + qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size); > + if (actual == used) { > + l = alloc; > + break; > + } > + used = actual; > + } > + > + if (l == 0) { > *plen = 0; > return NULL; > } > - /* Avoid unbounded allocations */ > - l = MIN(l, TARGET_PAGE_SIZE); > - as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); > - as->bounce.addr = addr; > - as->bounce.len = l; > > + BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer)); > + bounce->magic = BOUNCE_BUFFER_MAGIC; > memory_region_ref(mr); > - as->bounce.mr = mr; > + bounce->mr = mr; > + bounce->addr = addr; > + bounce->len = l; > + > if (!is_write) { > flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, > - as->bounce.buffer, l); > + bounce->buffer, l); > } > > *plen = l; > - return as->bounce.buffer; > + return bounce->buffer; > } > > - > memory_region_ref(mr); > *plen = flatview_extend_translation(fv, addr, len, mr, xlat, > l, is_write, attrs); > @@ -3287,12 +3313,11 @@ void *address_space_map(AddressSpace *as, > void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, > bool is_write, hwaddr access_len) > { > - if (buffer != as->bounce.buffer) { > - MemoryRegion *mr; > - ram_addr_t addr1; > + MemoryRegion *mr; > + ram_addr_t addr1; > > - mr = memory_region_from_host(buffer, &addr1); > - assert(mr != NULL); > + mr = memory_region_from_host(buffer, &addr1); > + if (mr != NULL) { > 
if (is_write) { > invalidate_and_set_dirty(mr, addr1, access_len); > } > @@ -3302,15 +3327,22 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, > memory_region_unref(mr); > return; > } > + > + > + BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer); > + assert(bounce->magic == BOUNCE_BUFFER_MAGIC); > + > if (is_write) { > - address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED, > - as->bounce.buffer, access_len); > - } > - qemu_vfree(as->bounce.buffer); > - as->bounce.buffer = NULL; > - memory_region_unref(as->bounce.mr); > - /* Clear in_use before reading map_client_list. */ > - qatomic_set_mb(&as->bounce.in_use, false); > + address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED, > + bounce->buffer, access_len); > + } > + > + qatomic_sub(&as->bounce_buffer_size, bounce->len); > + bounce->magic = ~BOUNCE_BUFFER_MAGIC; > + memory_region_unref(bounce->mr); > + g_free(bounce); > + /* Write bounce_buffer_size before reading map_client_list. */ > + smp_mb(); > address_space_notify_map_clients(as); > } > > -- > 2.34.1
On Tue, 10 Sept 2024 at 15:53, Michael S. Tsirkin <mst@redhat.com> wrote: > > On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > > When DMA memory can't be directly accessed, as is the case when > > running the device model in a separate process without shareable DMA > > file descriptors, bounce buffering is used. > > > > It is not uncommon for device models to request mapping of several DMA > > regions at the same time. Examples include: > > * net devices, e.g. when transmitting a packet that is split across > > several TX descriptors (observed with igb) > > * USB host controllers, when handling a packet with multiple data TRBs > > (observed with xhci) > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > and would fail DMA map requests while the buffer was already in use. In > > turn, this would cause DMA failures that ultimately manifest as hardware > > errors from the guest perspective. > > > > This change allocates DMA bounce buffers dynamically instead of > > supporting only a single buffer. Thus, multiple DMA mappings work > > correctly also when RAM can't be mmap()-ed. > > > > The total bounce buffer allocation size is limited individually for each > > AddressSpace. The default limit is 4096 bytes, matching the previous > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > provided to configure the limit for PCI devices. > > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > Acked-by: Peter Xu <peterx@redhat.com> > > --- > > This patch is split out from my "Support message-based DMA in vfio-user server" > > series. With the series having been partially applied, I'm splitting this one > > out as the only remaining patch to system emulation code in the hope to > > simplify getting it landed. The code has previously been reviewed by Stefan > > Hajnoczi and Peter Xu. 
This latest version includes changes to switch the > > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > > v9. > > --- > > hw/pci/pci.c | 8 ++++ > > include/exec/memory.h | 14 +++---- > > include/hw/pci/pci_device.h | 3 ++ > > system/memory.c | 5 ++- > > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > > 5 files changed, 76 insertions(+), 36 deletions(-) > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > index fab86d0567..d2caf3ee8b 100644 > > --- a/hw/pci/pci.c > > +++ b/hw/pci/pci.c > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > > DEFINE_PROP_END_OF_LIST() > > }; > > > > I'm a bit puzzled by now there being two fields named > max_bounce_buffer_size, one directly controllable by > a property. > > Pls add code comments explaining how they are related. > > > Also, what is the point of adding a property without > making it part of an API? No one will be able to rely on > it working. Note that this patch is already upstream as commit 637b0aa13. thanks -- PMM
On Tue, Sep 10, 2024 at 5:44 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > On Tue, 10 Sept 2024 at 15:53, Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > > > When DMA memory can't be directly accessed, as is the case when > > > running the device model in a separate process without shareable DMA > > > file descriptors, bounce buffering is used. > > > > > > It is not uncommon for device models to request mapping of several DMA > > > regions at the same time. Examples include: > > > * net devices, e.g. when transmitting a packet that is split across > > > several TX descriptors (observed with igb) > > > * USB host controllers, when handling a packet with multiple data TRBs > > > (observed with xhci) > > > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > > and would fail DMA map requests while the buffer was already in use. In > > > turn, this would cause DMA failures that ultimately manifest as hardware > > > errors from the guest perspective. > > > > > > This change allocates DMA bounce buffers dynamically instead of > > > supporting only a single buffer. Thus, multiple DMA mappings work > > > correctly also when RAM can't be mmap()-ed. > > > > > > The total bounce buffer allocation size is limited individually for each > > > AddressSpace. The default limit is 4096 bytes, matching the previous > > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > > provided to configure the limit for PCI devices. > > > > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > > Acked-by: Peter Xu <peterx@redhat.com> > > > --- > > > This patch is split out from my "Support message-based DMA in vfio-user server" > > > series. 
With the series having been partially applied, I'm splitting this one > > > out as the only remaining patch to system emulation code in the hope to > > > simplify getting it landed. The code has previously been reviewed by Stefan > > > Hajnoczi and Peter Xu. This latest version includes changes to switch the > > > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > > > v9. > > > --- > > > hw/pci/pci.c | 8 ++++ > > > include/exec/memory.h | 14 +++---- > > > include/hw/pci/pci_device.h | 3 ++ > > > system/memory.c | 5 ++- > > > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > > > 5 files changed, 76 insertions(+), 36 deletions(-) > > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > > index fab86d0567..d2caf3ee8b 100644 > > > --- a/hw/pci/pci.c > > > +++ b/hw/pci/pci.c > > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > > > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > > > DEFINE_PROP_END_OF_LIST() > > > }; > > > > > > > I'm a bit puzzled by now there being two fields named > > max_bounce_buffer_size, one directly controllable by > > a property. One is on the pci device, the other is on the address space. The former can be set via a command line parameter, and that value is used to initialize the field on the address space, which is then consulted when allocating bounce buffers. I'm not sure which aspect of this is unclear and/or deserves additional commenting - let me know and I'll be happy to send a patch. > > > > Pls add code comments explaining how they are related. > > > > > > Also, what is the point of adding a property without > > making it part of an API? No one will be able to rely on > > it working. > > Note that this patch is already upstream as commit 637b0aa13. 
> > thanks > -- PMM
On Tue, Sep 10, 2024 at 06:10:50PM +0200, Mattias Nissler wrote: > On Tue, Sep 10, 2024 at 5:44 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > > > On Tue, 10 Sept 2024 at 15:53, Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > > > > When DMA memory can't be directly accessed, as is the case when > > > > running the device model in a separate process without shareable DMA > > > > file descriptors, bounce buffering is used. > > > > > > > > It is not uncommon for device models to request mapping of several DMA > > > > regions at the same time. Examples include: > > > > * net devices, e.g. when transmitting a packet that is split across > > > > several TX descriptors (observed with igb) > > > > * USB host controllers, when handling a packet with multiple data TRBs > > > > (observed with xhci) > > > > > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > > > and would fail DMA map requests while the buffer was already in use. In > > > > turn, this would cause DMA failures that ultimately manifest as hardware > > > > errors from the guest perspective. > > > > > > > > This change allocates DMA bounce buffers dynamically instead of > > > > supporting only a single buffer. Thus, multiple DMA mappings work > > > > correctly also when RAM can't be mmap()-ed. > > > > > > > > The total bounce buffer allocation size is limited individually for each > > > > AddressSpace. The default limit is 4096 bytes, matching the previous > > > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > > > provided to configure the limit for PCI devices. > > > > > > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > > > Acked-by: Peter Xu <peterx@redhat.com> > > > > --- > > > > This patch is split out from my "Support message-based DMA in vfio-user server" > > > > series. 
With the series having been partially applied, I'm splitting this one > > > > out as the only remaining patch to system emulation code in the hope to > > > > simplify getting it landed. The code has previously been reviewed by Stefan > > > > Hajnoczi and Peter Xu. This latest version includes changes to switch the > > > > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > > > > v9. > > > > --- > > > > hw/pci/pci.c | 8 ++++ > > > > include/exec/memory.h | 14 +++---- > > > > include/hw/pci/pci_device.h | 3 ++ > > > > system/memory.c | 5 ++- > > > > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > > > > 5 files changed, 76 insertions(+), 36 deletions(-) > > > > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > > > index fab86d0567..d2caf3ee8b 100644 > > > > --- a/hw/pci/pci.c > > > > +++ b/hw/pci/pci.c > > > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > > > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > > > > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > > > > DEFINE_PROP_END_OF_LIST() > > > > }; > > > > > > > > > > I'm a bit puzzled by now there being two fields named > > > max_bounce_buffer_size, one directly controllable by > > > a property. > > One is one the pci device, the other is on the address space. The > former can be set via a command line parameter, and that value is used > to initialize the field on the address space, which is then consulted > when allocating bounce buffers. > > I'm not sure which aspect of this is unclear and/or deserves > additional commenting - let me know and I'll be happy to send a patch. I'd document what does each field do. > > > > > > Pls add code comments explaining how they are related. 
> > > > > > > > > Also, what is the point of adding a property without > > > making it part of an API? No one will be able to rely on > > > it working. > > > > Note that this patch is already upstream as commit 637b0aa13. > > > > thanks > > -- PMM Maybe you can answer this?
On Tue, Sep 10, 2024 at 6:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Tue, Sep 10, 2024 at 06:10:50PM +0200, Mattias Nissler wrote: > > On Tue, Sep 10, 2024 at 5:44 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > > > > > On Tue, 10 Sept 2024 at 15:53, Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > > > > > When DMA memory can't be directly accessed, as is the case when > > > > > running the device model in a separate process without shareable DMA > > > > > file descriptors, bounce buffering is used. > > > > > > > > > > It is not uncommon for device models to request mapping of several DMA > > > > > regions at the same time. Examples include: > > > > > * net devices, e.g. when transmitting a packet that is split across > > > > > several TX descriptors (observed with igb) > > > > > * USB host controllers, when handling a packet with multiple data TRBs > > > > > (observed with xhci) > > > > > > > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > > > > and would fail DMA map requests while the buffer was already in use. In > > > > > turn, this would cause DMA failures that ultimately manifest as hardware > > > > > errors from the guest perspective. > > > > > > > > > > This change allocates DMA bounce buffers dynamically instead of > > > > > supporting only a single buffer. Thus, multiple DMA mappings work > > > > > correctly also when RAM can't be mmap()-ed. > > > > > > > > > > The total bounce buffer allocation size is limited individually for each > > > > > AddressSpace. The default limit is 4096 bytes, matching the previous > > > > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > > > > provided to configure the limit for PCI devices. 
> > > > > > > > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > > > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > > > > Acked-by: Peter Xu <peterx@redhat.com> > > > > > --- > > > > > This patch is split out from my "Support message-based DMA in vfio-user server" > > > > > series. With the series having been partially applied, I'm splitting this one > > > > > out as the only remaining patch to system emulation code in the hope to > > > > > simplify getting it landed. The code has previously been reviewed by Stefan > > > > > Hajnoczi and Peter Xu. This latest version includes changes to switch the > > > > > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > > > > > v9. > > > > > --- > > > > > hw/pci/pci.c | 8 ++++ > > > > > include/exec/memory.h | 14 +++---- > > > > > include/hw/pci/pci_device.h | 3 ++ > > > > > system/memory.c | 5 ++- > > > > > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > > > > > 5 files changed, 76 insertions(+), 36 deletions(-) > > > > > > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > > > > index fab86d0567..d2caf3ee8b 100644 > > > > > --- a/hw/pci/pci.c > > > > > +++ b/hw/pci/pci.c > > > > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > > > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > > > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > > > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > > > > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > > > > > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > > > > > DEFINE_PROP_END_OF_LIST() > > > > > }; > > > > > > > > > > > > > I'm a bit puzzled by now there being two fields named > > > > max_bounce_buffer_size, one directly controllable by > > > > a property. > > > > One is one the pci device, the other is on the address space. 
The > > former can be set via a command line parameter, and that value is used > > to initialize the field on the address space, which is then consulted > > when allocating bounce buffers. > > > > I'm not sure which aspect of this is unclear and/or deserves > > additional commenting - let me know and I'll be happy to send a patch. > > I'd document what does each field do. I have just sent a patch to expand the comments, let's discuss details there. > > > > > > > > > Pls add code comments explaining how they are related. > > > > > > > > > > > > Also, what is the point of adding a property without > > > > making it part of an API? No one will be able to rely on > > > > it working. All I needed was a practical way to allow the bounce buffer size limit to be adjusted in the somewhat exotic situations where we're making DMA requests to indirect memory regions (in my case it is a qemu vfio-user server accessed by a client that can't or doesn't want to provide direct memory-mapped access to its RAM). There was some discussion about the nature of the parameter when I first proposed the patch, see https://lore.kernel.org/qemu-devel/20230823092905.2259418-2-mnissler@rivosinc.com/ - an x-prefixed experimental command-line parameter was suggested there as a practical way to allow this without qemu committing to supporting this forever. For the unlikely case that this parameter proves popular, it can still be added to a stable API (or alternatively we could discuss whether a large-enough limit is feasible after all, or even consider DMA API changes to obviate the need for bounce buffering). > > > > > > Note that this patch is already upstream as commit 637b0aa13. > > > > > > thanks > > > -- PMM > > Maybe you can answer this? >
On Tue, Sep 10, 2024 at 11:36:08PM +0200, Mattias Nissler wrote: > On Tue, Sep 10, 2024 at 6:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > On Tue, Sep 10, 2024 at 06:10:50PM +0200, Mattias Nissler wrote: > > > On Tue, Sep 10, 2024 at 5:44 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > > > > > > > On Tue, 10 Sept 2024 at 15:53, Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > > > > > > When DMA memory can't be directly accessed, as is the case when > > > > > > running the device model in a separate process without shareable DMA > > > > > > file descriptors, bounce buffering is used. > > > > > > > > > > > > It is not uncommon for device models to request mapping of several DMA > > > > > > regions at the same time. Examples include: > > > > > > * net devices, e.g. when transmitting a packet that is split across > > > > > > several TX descriptors (observed with igb) > > > > > > * USB host controllers, when handling a packet with multiple data TRBs > > > > > > (observed with xhci) > > > > > > > > > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > > > > > and would fail DMA map requests while the buffer was already in use. In > > > > > > turn, this would cause DMA failures that ultimately manifest as hardware > > > > > > errors from the guest perspective. > > > > > > > > > > > > This change allocates DMA bounce buffers dynamically instead of > > > > > > supporting only a single buffer. Thus, multiple DMA mappings work > > > > > > correctly also when RAM can't be mmap()-ed. > > > > > > > > > > > > The total bounce buffer allocation size is limited individually for each > > > > > > AddressSpace. The default limit is 4096 bytes, matching the previous > > > > > > maximum buffer size. A new x-max-bounce-buffer-size parameter is > > > > > > provided to configure the limit for PCI devices. 
> > > > > > > > > > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > > > > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > > > > > Acked-by: Peter Xu <peterx@redhat.com> > > > > > > --- > > > > > > This patch is split out from my "Support message-based DMA in vfio-user server" > > > > > > series. With the series having been partially applied, I'm splitting this one > > > > > > out as the only remaining patch to system emulation code in the hope to > > > > > > simplify getting it landed. The code has previously been reviewed by Stefan > > > > > > Hajnoczi and Peter Xu. This latest version includes changes to switch the > > > > > > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > > > > > > v9. > > > > > > --- > > > > > > hw/pci/pci.c | 8 ++++ > > > > > > include/exec/memory.h | 14 +++---- > > > > > > include/hw/pci/pci_device.h | 3 ++ > > > > > > system/memory.c | 5 ++- > > > > > > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > > > > > > 5 files changed, 76 insertions(+), 36 deletions(-) > > > > > > > > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > > > > > index fab86d0567..d2caf3ee8b 100644 > > > > > > --- a/hw/pci/pci.c > > > > > > +++ b/hw/pci/pci.c > > > > > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > > > > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > > > > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > > > > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > > > > > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > > > > > > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > > > > > > DEFINE_PROP_END_OF_LIST() > > > > > > }; > > > > > > > > > > > > > > > > I'm a bit puzzled by now there being two fields named > > > > > max_bounce_buffer_size, one directly controllable by > > > > > a property. > > > > > > One is one the pci device, the other is on the address space. 
The > > > former can be set via a command line parameter, and that value is used > > > to initialize the field on the address space, which is then consulted > > > when allocating bounce buffers. > > > > > > I'm not sure which aspect of this is unclear and/or deserves > > > additional commenting - let me know and I'll be happy to send a patch. > > > > I'd document what does each field do. > > I have just sent a patch to expand the comments, let's discuss details there. > > > > > > > > > > > > > Pls add code comments explaining how they are related. > > > > > > > > > > > > > > > Also, what is the point of adding a property without > > > > > making it part of an API? No one will be able to rely on > > > > > it working. > > All I needed was a practical way to allow the bounce buffer size limit > to be adjusted in the somewhat exotic situations where we're making > DMA requests to indirect memory regions (in my case it is a qemu > vfio-user server accessed by a client that can't or doesn't want to > provide direct memory-mapped access to its RAM). There was some > discussion about the nature of the parameter when I first proposed the > patch, see https://lore.kernel.org/qemu-devel/20230823092905.2259418-2-mnissler@rivosinc.com/ > - an x-prefixed experimental command-line parameter was suggested > there as a practical way to allow this without qemu committing to > supporting this forever. For the unlikely case that this parameter > proves popular, it can still be added to a stable API (or > alternatively we could discuss whether a large-enough limit is > feasible after all, or even consider DMA API changes to obviate the > need for bounce buffering). Yes but how happy will you be if we rename the parameter in the future? All your scripts will break. > > > > > > > > Note that this patch is already upstream as commit 637b0aa13. > > > > > > > > thanks > > > > -- PMM > > > > Maybe you can answer this? > >
On Wed, Sep 11, 2024 at 12:24 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > On Tue, Sep 10, 2024 at 11:36:08PM +0200, Mattias Nissler wrote: > > On Tue, Sep 10, 2024 at 6:40 PM Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > On Tue, Sep 10, 2024 at 06:10:50PM +0200, Mattias Nissler wrote: > > > > On Tue, Sep 10, 2024 at 5:44 PM Peter Maydell <peter.maydell@linaro.org> wrote: > > > > > > > > > > On Tue, 10 Sept 2024 at 15:53, Michael S. Tsirkin <mst@redhat.com> wrote: > > > > > > > > > > > > On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > > > > > > > When DMA memory can't be directly accessed, as is the case when > > > > > > > running the device model in a separate process without shareable DMA > > > > > > > file descriptors, bounce buffering is used. > > > > > > > > > > > > > > It is not uncommon for device models to request mapping of several DMA > > > > > > > regions at the same time. Examples include: > > > > > > > * net devices, e.g. when transmitting a packet that is split across > > > > > > > several TX descriptors (observed with igb) > > > > > > > * USB host controllers, when handling a packet with multiple data TRBs > > > > > > > (observed with xhci) > > > > > > > > > > > > > > Previously, qemu only provided a single bounce buffer per AddressSpace > > > > > > > and would fail DMA map requests while the buffer was already in use. In > > > > > > > turn, this would cause DMA failures that ultimately manifest as hardware > > > > > > > errors from the guest perspective. > > > > > > > > > > > > > > This change allocates DMA bounce buffers dynamically instead of > > > > > > > supporting only a single buffer. Thus, multiple DMA mappings work > > > > > > > correctly also when RAM can't be mmap()-ed. > > > > > > > > > > > > > > The total bounce buffer allocation size is limited individually for each > > > > > > > AddressSpace. The default limit is 4096 bytes, matching the previous > > > > > > > maximum buffer size. 
A new x-max-bounce-buffer-size parameter is > > > > > > > provided to configure the limit for PCI devices. > > > > > > > > > > > > > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > > > > > > > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > > > > > > > Acked-by: Peter Xu <peterx@redhat.com> > > > > > > > --- > > > > > > > This patch is split out from my "Support message-based DMA in vfio-user server" > > > > > > > series. With the series having been partially applied, I'm splitting this one > > > > > > > out as the only remaining patch to system emulation code in the hope to > > > > > > > simplify getting it landed. The code has previously been reviewed by Stefan > > > > > > > Hajnoczi and Peter Xu. This latest version includes changes to switch the > > > > > > > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > > > > > > > v9. > > > > > > > --- > > > > > > > hw/pci/pci.c | 8 ++++ > > > > > > > include/exec/memory.h | 14 +++---- > > > > > > > include/hw/pci/pci_device.h | 3 ++ > > > > > > > system/memory.c | 5 ++- > > > > > > > system/physmem.c | 82 ++++++++++++++++++++++++++----------- > > > > > > > 5 files changed, 76 insertions(+), 36 deletions(-) > > > > > > > > > > > > > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > > > > > > > index fab86d0567..d2caf3ee8b 100644 > > > > > > > --- a/hw/pci/pci.c > > > > > > > +++ b/hw/pci/pci.c > > > > > > > @@ -85,6 +85,8 @@ static Property pci_props[] = { > > > > > > > QEMU_PCIE_ERR_UNC_MASK_BITNR, true), > > > > > > > DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, > > > > > > > QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), > > > > > > > + DEFINE_PROP_SIZE32("x-max-bounce-buffer-size", PCIDevice, > > > > > > > + max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE), > > > > > > > DEFINE_PROP_END_OF_LIST() > > > > > > > }; > > > > > > > > > > > > > > > > > > > I'm a bit puzzled by now there being two fields named > > > > > > max_bounce_buffer_size, one directly 
controllable by > > > > > > a property. > > > > > > > > One is one the pci device, the other is on the address space. The > > > > former can be set via a command line parameter, and that value is used > > > > to initialize the field on the address space, which is then consulted > > > > when allocating bounce buffers. > > > > > > > > I'm not sure which aspect of this is unclear and/or deserves > > > > additional commenting - let me know and I'll be happy to send a patch. > > > > > > I'd document what does each field do. > > > > I have just sent a patch to expand the comments, let's discuss details there. > > > > > > > > > > > > > > > > > Pls add code comments explaining how they are related. > > > > > > > > > > > > > > > > > > Also, what is the point of adding a property without > > > > > > making it part of an API? No one will be able to rely on > > > > > > it working. > > > > All I needed was a practical way to allow the bounce buffer size limit > > to be adjusted in the somewhat exotic situations where we're making > > DMA requests to indirect memory regions (in my case it is a qemu > > vfio-user server accessed by a client that can't or doesn't want to > > provide direct memory-mapped access to its RAM). There was some > > discussion about the nature of the parameter when I first proposed the > > patch, see https://lore.kernel.org/qemu-devel/20230823092905.2259418-2-mnissler@rivosinc.com/ > > - an x-prefixed experimental command-line parameter was suggested > > there as a practical way to allow this without qemu committing to > > supporting this forever. For the unlikely case that this parameter > > proves popular, it can still be added to a stable API (or > > alternatively we could discuss whether a large-enough limit is > > feasible after all, or even consider DMA API changes to obviate the > > need for bounce buffering). > > > Yes but how happy will you be if we rename the parameter in the > future? All your scripts will break. 
It's not that I'm running random qemu versions in production; in fact, I'm using this for semi-automated testing of hardware designs. We'd find out when upgrading our qemu, and adjust. In fact, if you come up with a better way to handle this bounce buffering kludge, I'd be willing to not only adjust, but even help implement it. > > > > > > > > > > > Note that this patch is already upstream as commit 637b0aa13. > > > > > thanks > > > > > -- PMM > > > > > > Maybe you can answer this? > > > >
On Mon, Aug 19, 2024 at 06:54:54AM -0700, Mattias Nissler wrote: > When DMA memory can't be directly accessed, as is the case when > running the device model in a separate process without shareable DMA > file descriptors, bounce buffering is used. > > It is not uncommon for device models to request mapping of several DMA > regions at the same time. Examples include: > * net devices, e.g. when transmitting a packet that is split across > several TX descriptors (observed with igb) > * USB host controllers, when handling a packet with multiple data TRBs > (observed with xhci) > > Previously, qemu only provided a single bounce buffer per AddressSpace > and would fail DMA map requests while the buffer was already in use. In > turn, this would cause DMA failures that ultimately manifest as hardware > errors from the guest perspective. > > This change allocates DMA bounce buffers dynamically instead of > supporting only a single buffer. Thus, multiple DMA mappings work > correctly also when RAM can't be mmap()-ed. > > The total bounce buffer allocation size is limited individually for each > AddressSpace. The default limit is 4096 bytes, matching the previous > maximum buffer size. A new x-max-bounce-buffer-size parameter is > provided to configure the limit for PCI devices. > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com> > Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> > Acked-by: Peter Xu <peterx@redhat.com> > --- > This patch is split out from my "Support message-based DMA in vfio-user server" > series. With the series having been partially applied, I'm splitting this one > out as the only remaining patch to system emulation code in the hope to > simplify getting it landed. The code has previously been reviewed by Stefan > Hajnoczi and Peter Xu. This latest version includes changes to switch the > bounce buffer size bookkeeping to `size_t` as requested and LGTM'd by Phil in > v9. +Peter Maydell. 
I'll tentatively collect this in my next 9.2 pull, which should be at least 1-2 weeks out. Thanks, -- Peter Xu
© 2016 - 2024 Red Hat, Inc.