Introduce load/store helpers which take a MemOp argument.
Inspired-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
MAINTAINERS | 1 +
include/qemu/ldst_unaligned.h | 25 +++++++++++++
util/ldst.c | 69 +++++++++++++++++++++++++++++++++++
util/meson.build | 1 +
4 files changed, 96 insertions(+)
create mode 100644 util/ldst.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 2d8583cbb52..4eb4e34ce75 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3272,6 +3272,7 @@ F: system/physmem.c
F: system/memory-internal.h
F: system/ram-block-attributes.c
F: scripts/coccinelle/memory-region-housekeeping.cocci
+F: util/ldst.c
Memory devices
M: David Hildenbrand <david@kernel.org>
diff --git a/include/qemu/ldst_unaligned.h b/include/qemu/ldst_unaligned.h
index 63a091ad401..2c5c6723802 100644
--- a/include/qemu/ldst_unaligned.h
+++ b/include/qemu/ldst_unaligned.h
@@ -6,6 +6,8 @@
#ifndef QEMU_LDST_UNALIGNED_H
#define QEMU_LDST_UNALIGNED_H
+#include "exec/memop.h"
+
/*
* Any compiler worth its salt will turn these memcpy into native unaligned
* operations. Thus we don't need to play games with packed attributes, or
@@ -64,4 +66,27 @@ static inline void stq_unaligned_p(void *ptr, uint64_t v)
__builtin_memcpy(ptr, &v, sizeof(v));
}
+/**
+ * ldm_p: Load value from host memory (byteswapping if necessary)
+ *
+ * @ptr: the host pointer to be accessed
+ * @mop: #MemOp mask containing access size and optional byteswapping
+ *
+ * Load the value stored at @ptr in host memory, byteswapping it if necessary.
+ *
+ * Returns: the loaded (possibly byteswapped) value.
+ */
+uint64_t ldm_p(const void *ptr, MemOp mop);
+
+/**
+ * stm_p: Store value to host memory (byteswapping if necessary)
+ *
+ * @ptr: the host pointer to be accessed
+ * @mop: #MemOp mask containing access size and optional byteswapping
+ * @val: the value to store
+ *
+ * Byteswap the value if necessary and store it at @ptr in host memory.
+ */
+void stm_p(void *ptr, MemOp mop, uint64_t val);
+
#endif
diff --git a/util/ldst.c b/util/ldst.c
new file mode 100644
index 00000000000..fb11c073bcb
--- /dev/null
+++ b/util/ldst.c
@@ -0,0 +1,69 @@
+/*
+ * Load/Store helpers for QEMU
+ *
+ * Copyright (c) Linaro Ltd.
+ *
+ * Authors:
+ * Philippe Mathieu-Daudé
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/ldst_unaligned.h"
+
+uint64_t ldm_p(const void *ptr, MemOp mop)
+{
+ const unsigned size = memop_size(mop);
+ uint64_t val;
+ uint8_t *pval = (uint8_t *)&val;
+
+ if (HOST_BIG_ENDIAN) {
+ pval += sizeof(val) - size;
+ }
+
+ __builtin_memcpy(pval, ptr, size);
+ if (unlikely(mop & MO_BSWAP)) {
+ switch (size) {
+ case sizeof(uint16_t):
+ val = __builtin_bswap16(val);
+ break;
+ case sizeof(uint32_t):
+ val = __builtin_bswap32(val);
+ break;
+ case sizeof(uint64_t):
+ val = __builtin_bswap64(val);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ return val;
+}
+
+void stm_p(void *ptr, MemOp mop, uint64_t val)
+{
+ const unsigned size = memop_size(mop);
+ const uint8_t *pval = (const uint8_t *)&val;
+
+ if (HOST_BIG_ENDIAN) {
+ pval += sizeof(val) - size;
+ }
+
+ if (unlikely(mop & MO_BSWAP)) {
+ switch (size) {
+ case sizeof(uint16_t):
+ val = __builtin_bswap16(val);
+ break;
+ case sizeof(uint32_t):
+ val = __builtin_bswap32(val);
+ break;
+ case sizeof(uint64_t):
+ val = __builtin_bswap64(val);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ __builtin_memcpy(ptr, pval, size);
+}
diff --git a/util/meson.build b/util/meson.build
index 35029380a37..b695b6e4728 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -38,6 +38,7 @@ util_ss.add(files('envlist.c', 'path.c', 'module.c'))
util_ss.add(files('event.c'))
util_ss.add(files('host-utils.c'))
util_ss.add(files('bitmap.c', 'bitops.c'))
+util_ss.add(files('ldst.c'))
util_ss.add(files('fifo8.c'))
util_ss.add(files('cacheflush.c'))
util_ss.add(files('error.c', 'error-report.c'))
--
2.52.0
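As a quick usage sketch of the new helpers (the buffer contents are made-up test data, not part of the patch):

uint8_t buf[8] = { 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 };

/* Little-endian 64-bit load: 0x1122334455667788 on any host. */
uint64_t v = ldm_p(buf, MO_LEUQ);

/* Big-endian 16-bit store: buf[0] == 0xaa, buf[1] == 0xbb afterwards. */
stm_p(buf, MO_BEUW, 0xaabb);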
On 1/10/26 03:50, Philippe Mathieu-Daudé wrote:
> +uint64_t ldm_p(const void *ptr, MemOp mop)
> +{
> + const unsigned size = memop_size(mop);
> + uint64_t val;
> + uint8_t *pval = (uint8_t *)&val;
> +
> + if (HOST_BIG_ENDIAN) {
> + pval += sizeof(val) - size;
> + }
> +
> + __builtin_memcpy(pval, ptr, size);
> + if (unlikely(mop & MO_BSWAP)) {
> + switch (size) {
> + case sizeof(uint16_t):
> + val = __builtin_bswap16(val);
> + break;
> + case sizeof(uint32_t):
> + val = __builtin_bswap32(val);
> + break;
> + case sizeof(uint64_t):
> + val = __builtin_bswap64(val);
> + break;
> + default:
> + g_assert_not_reached();
> + }
> + }
> + return val;
> +}
I'm not fond of the pointer arithmetic or the code structure.
Perhaps better as
switch (mop & (MO_BSWAP | MO_SIZE)) {
case MO_LEUW:
    return lduw_le_p(ptr);
case MO_BEUW:
    return lduw_be_p(ptr);
...
default:
    g_assert_not_reached();
}
which would hopefully compile to host endian-swapping load insns like
.L1:
        mov     (ptr), %eax
        ret
.L2:
        movbe   (ptr), %eax
        ret
.L3:
        mov     (ptr), %rax
        ret
.L4:
        movbe   (ptr), %rax
        ret
etc.
And the default-case assert makes sure that we're not passing garbage or a too-large MO_SIZE.
r~
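For reference, a complete version of such a switch could look like the sketch below; it relies on the existing ld*_p helpers from "qemu/bswap.h" and, like the posted patch, leaves MO_SIGN unhandled:

uint64_t ldm_p(const void *ptr, MemOp mop)
{
    switch (mop & (MO_BSWAP | MO_SIZE)) {
    case MO_UB:
    case MO_UB | MO_BSWAP:      /* byteswap is a no-op for a single byte */
        return ldub_p(ptr);
    case MO_LEUW:
        return lduw_le_p(ptr);
    case MO_BEUW:
        return lduw_be_p(ptr);
    case MO_LEUL:
        return ldl_le_p(ptr);
    case MO_BEUL:
        return ldl_be_p(ptr);
    case MO_LEUQ:
        return ldq_le_p(ptr);
    case MO_BEUQ:
        return ldq_be_p(ptr);
    default:
        g_assert_not_reached();
    }
}

The stm_p counterpart would mirror this with the st*_p helpers.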
On 1/10/26 00:23, Richard Henderson wrote:
> I'm not fond of the pointer arithmetic or the code structure.
>
> Perhaps better as
>
> switch (mop & (MO_BSWAP | MO_SIZE)) {
> case MO_LEUW:
> return lduw_le_p(ptr);
> case MO_BEUW:
> return lduw_be_p(ptr);
> ...
> default:
> g_assert_not_reached();
> }
>
> which would hopefully compile to host endian-swapping load insns like
>
> .L1:
> mov (ptr), %eax
> ret
> .L2:
> movbe (ptr), %eax
> ret
It might only do so for 32 bits, because movbe to a 32-bit register also
bundles a free 32->64-bit zero extension, but not for the smaller sizes.
Thinking about this, I think ldm_p also needs to handle MO_SIGN? It can be
done all in one with:
static inline uint64_t ldm_p(const void *ptr, MemOp mop)
{
    const unsigned size = memop_size(mop);
    uint64_t val;
    uint8_t *pval = (uint8_t *)&val;

    if (HOST_BIG_ENDIAN) {
        pval += sizeof(val) - size;
    }

    assert(size < 8);
    __builtin_memcpy(pval, ptr, size);

    if (mop & MO_BSWAP) {
        val = __builtin_bswap64(val);
    } else if (mop & MO_SIGN) {
        val <<= (64 - 8 * size);
    } else {
        return val;
    }

    if (mop & MO_SIGN) {
        return ((int64_t) val) >> (64 - 8 * size);
    } else {
        return val >> (64 - 8 * size);
    }
}

static inline void stm_p(void *ptr, uint64_t val, MemOp mop)
{
    const unsigned size = memop_size(mop);
    uint8_t *pval = (uint8_t *)&val;

    assert(size < 8);
    if ((mop & MO_BSWAP)) {
        val = __builtin_bswap64(val) >> (64 - size * 8);
    }

    if (HOST_BIG_ENDIAN) {
        pval += sizeof(val) - size;
    }

    __builtin_memcpy(ptr, pval, size);
}
When inlining ldm_p, GCC is able to generate movzx/movsx instructions but
doesn't recognize bswap64 + right shift as a smaller-width bswap + zero
extension; clang does. Neither is able to generate movbe instructions,
though.
I attach a standalone file I played with.
Paolo
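As a usage sketch of the stm_p() variant proposed above (note the value-before-MemOp argument order, which differs from the posted patch; the buffer and values are made-up test data):

uint8_t buf[4];

/* Big-endian 32-bit store: buf[] becomes { 0x11, 0x22, 0x33, 0x44 }
 * on both little- and big-endian hosts. */
stm_p(buf, 0x11223344, MO_BEUL);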
On 12/1/26 09:56, Paolo Bonzini wrote:
> On 1/10/26 00:23, Richard Henderson wrote:
>> I'm not fond of the pointer arithmetic or the code structure.
>>
>> Perhaps better as
>>
>> switch (mop & (MO_BSWAP | MO_SIZE)) {
>> case MO_LEUW:
>> return lduw_le_p(ptr);
>> case MO_BEUW:
>> return lduw_be_p(ptr);
>> ...
>> default:
>> g_assert_not_reached();
>> }
>>
>> which would hopefully compile to host endian-swapping load insns like
>>
>> .L1:
>> mov (ptr), %eax
>> ret
>> .L2:
>> movbe (ptr), %eax
>> ret
>
> It only might do so for 32-bits, because movbe also bundles a free 32-
> >64-bit zero extension, but not for the smaller ones. Thinking about
> which, I think ldm_p also needs to handle MO_SIGN? It can be done all
> in one with
>
> static inline uint64_t ldm_p(const void *ptr, MemOp mop)
> {
> const unsigned size = memop_size(mop);
> uint64_t val;
> uint8_t *pval = (uint8_t *)&val;
>
> if (HOST_BIG_ENDIAN) {
> pval += sizeof(val) - size;
> }
>
> assert(size < 8);
> __builtin_memcpy(pval, ptr, size);
>
> if (mop & MO_BSWAP) {
> val = __builtin_bswap64(val);
> } else if (mop & MO_SIGN) {
Can't we have both MO_BSWAP && MO_SIGN bits set?
> val <<= (64 - 8 * size);
> } else {
> return val;
> }
>
> if (mop & MO_SIGN) {
> return ((int64_t) val) >> (64 - 8 * size);
> } else {
> return val >> (64 - 8 * size);
> }
> }
>
> static inline void stm_p(void *ptr, uint64_t val, MemOp mop)
> {
> const unsigned size = memop_size(mop);
> uint8_t *pval = (uint8_t *)&val;
>
> assert(size < 8);
> if ((mop & MO_BSWAP)) {
> val = __builtin_bswap64(val) >> (64 - size * 8);
> }
>
> if (HOST_BIG_ENDIAN) {
> pval += sizeof(val) - size;
> }
>
> __builtin_memcpy(ptr, pval, size);
> }
>
> When inlining ldm_p, GCC is able to generate movzx/movsx instruction but
> doesn't recognize bswap64 + right shift as a smaller-width bswap + zero
> extension; clang does. Neither is able to generate movbe instructions,
> though.
>
> I attach a standalone file I played with.
>
> Paolo
On 2/4/26 08:44, Philippe Mathieu-Daudé wrote:
> On 12/1/26 09:56, Paolo Bonzini wrote:
>> On 1/10/26 00:23, Richard Henderson wrote:
>>> I'm not fond of the pointer arithmetic or the code structure.
>>>
>>> Perhaps better as
>>>
>>> switch (mop & (MO_BSWAP | MO_SIZE)) {
>>> case MO_LEUW:
>>> return lduw_le_p(ptr);
>>> case MO_BEUW:
>>> return lduw_be_p(ptr);
>>> ...
>>> default:
>>> g_assert_not_reached();
>>> }
>>>
>>> which would hopefully compile to host endian-swapping load insns like
>>>
>>> .L1:
>>> mov (ptr), %eax
>>> ret
>>> .L2:
>>> movbe (ptr), %eax
>>> ret
>>
>> It only might do so for 32-bits, because movbe also bundles a free 32- >64-bit zero
>> extension, but not for the smaller ones. Thinking about which, I think ldm_p also needs
>> to handle MO_SIGN? It can be done all in one with
>>
>> static inline uint64_t ldm_p(const void *ptr, MemOp mop)
>> {
>> const unsigned size = memop_size(mop);
>> uint64_t val;
>> uint8_t *pval = (uint8_t *)&val;
>>
>> if (HOST_BIG_ENDIAN) {
>> pval += sizeof(val) - size;
>> }
>>
>> assert(size < 8);
>> __builtin_memcpy(pval, ptr, size);
>>
>> if (mop & MO_BSWAP) {
>> val = __builtin_bswap64(val);
>> } else if (mop & MO_SIGN) {
>
> Can't we have both MO_BSWAP && MO_SIGN bits set?
>
>> val <<= (64 - 8 * size);
>> } else {
>> return val;
>> }
>>
>> if (mop & MO_SIGN) {
>> return ((int64_t) val) >> (64 - 8 * size);
>> } else {
>> return val >> (64 - 8 * size);
>> }
Yes, but that's handled by always using bswap64, so that the least significant
byte ends up as the most significant byte, and the arithmetic shift then takes
care of the sign extension.
r~
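A short worked example of that point, with made-up values on a little-endian host (MO_BESW, i.e. a big-endian signed 16-bit load):

/* ptr bytes: 0x80 0x01 -> memcpy into the low bytes of val */
uint64_t val = 0x0180;                  /* raw little-endian read */
val = __builtin_bswap64(val);           /* 0x8001000000000000     */
int64_t r = (int64_t)val >> (64 - 16);  /* arithmetic right shift */
/* r == (int64_t)0xffffffffffff8001 == -32767 */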
On Fri, Jan 9, 2026 at 17:51, Philippe Mathieu-Daudé <philmd@linaro.org>
wrote:
> Introduce load/store helpers which take a MemOp argument.
>
> Inspired-by: Paolo Bonzini <pbonzini@redhat.com>
>
That's a new one. :)
I think they should be inline and so should be
address_space_{ld,st}m_internal (maybe even always_inline). The amount of
optimization enabled by constant MemOp is huge.
One other small change:
> +    const unsigned size = memop_size(mop);
> +    uint64_t val;
>
This must be initialized to zero.
Otherwise the whole series looks great, thanks. Sorry again for delaying my
review.
Paolo
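For a sub-64-bit access without byteswap, the memcpy only writes part of val, so the untouched upper bytes would otherwise be returned uninitialized; a sketch of the fix:

uint64_t val = 0;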
On 1/10/26 04:23, Paolo Bonzini wrote:
>
>
> Il ven 9 gen 2026, 17:51 Philippe Mathieu-Daudé <philmd@linaro.org
> <mailto:philmd@linaro.org>> ha scritto:
>
> Introduce load/store helpers which take a MemOp argument.
>
> Inspired-by: Paolo Bonzini <pbonzini@redhat.com <mailto:pbonzini@redhat.com>>
>
>
> That's a new one. :)
>
> I think they should be inline and so should be address_space_{ld,st}m_internal (maybe even
> always_inline). The amount of optimization enabled by constant MemOp is huge.
When are we going to have constant MemOp?
r~
On 1/10/26 00:26, Richard Henderson wrote:
> On 1/10/26 04:23, Paolo Bonzini wrote:
>>
>>
>> Il ven 9 gen 2026, 17:51 Philippe Mathieu-Daudé <philmd@linaro.org
>> <mailto:philmd@linaro.org>> ha scritto:
>>
>> Introduce load/store helpers which take a MemOp argument.
>>
>> Inspired-by: Paolo Bonzini <pbonzini@redhat.com
>> <mailto:pbonzini@redhat.com>>
>>
>>
>> That's a new one. :)
>>
>> I think they should be inline and so should be address_space_{ld,st}
>> m_internal (maybe even always_inline). The amount of optimization
>> enabled by constant MemOp is huge.
>
> When are we going to have constant MemOp?
Every time address_space_{ld,st}m_internal is called, it's done with
constant MemOp. For example:
uint16_t ADDRESS_SPACE_LD(uw)(ARG1_DECL, hwaddr addr,
                              MemTxAttrs attrs, MemTxResult *result)
{
    return ADDRESS_SPACE_LD_INTERNAL(m)(ARG1, MO_ENDIAN | MO_16,
                                        addr, attrs, result);
}
Paolo
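To illustrate why a constant MemOp matters, a minimal sketch (assuming ldm_p() is made a static inline function as suggested; the wrapper name is purely illustrative):

static inline uint16_t example_lduw_le(const void *ptr)
{
    /* The MemOp is a compile-time constant here, so the size and byteswap
     * tests inside ldm_p() fold away and this should reduce to a single
     * 2-byte load (plus a swap only on a big-endian host). */
    return (uint16_t)ldm_p(ptr, MO_LEUW);
}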