[PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers

Posted by Philippe Mathieu-Daudé 1 month ago
Introduce load/store helpers which take a MemOp argument.

Inspired-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 MAINTAINERS                   |  1 +
 include/qemu/ldst_unaligned.h | 25 +++++++++++++
 util/ldst.c                   | 69 +++++++++++++++++++++++++++++++++++
 util/meson.build              |  1 +
 4 files changed, 96 insertions(+)
 create mode 100644 util/ldst.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2d8583cbb52..4eb4e34ce75 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3272,6 +3272,7 @@ F: system/physmem.c
 F: system/memory-internal.h
 F: system/ram-block-attributes.c
 F: scripts/coccinelle/memory-region-housekeeping.cocci
+F: util/ldst.c
 
 Memory devices
 M: David Hildenbrand <david@kernel.org>
diff --git a/include/qemu/ldst_unaligned.h b/include/qemu/ldst_unaligned.h
index 63a091ad401..2c5c6723802 100644
--- a/include/qemu/ldst_unaligned.h
+++ b/include/qemu/ldst_unaligned.h
@@ -6,6 +6,8 @@
 #ifndef QEMU_LDST_UNALIGNED_H
 #define QEMU_LDST_UNALIGNED_H
 
+#include "exec/memop.h"
+
 /*
  * Any compiler worth its salt will turn these memcpy into native unaligned
  * operations.  Thus we don't need to play games with packed attributes, or
@@ -64,4 +66,27 @@ static inline void stq_unaligned_p(void *ptr, uint64_t v)
     __builtin_memcpy(ptr, &v, sizeof(v));
 }
 
+/**
+ * ldm_p: Load value from host memory (byteswapping if necessary)
+ *
+ * @ptr: the host pointer to be accessed
+ * @mop: #MemOp mask containing access size and optional byteswapping
+ *
+ * Load the value stored at @ptr in host memory, byteswapping if necessary.
+ *
+ * Returns: the converted value.
+ */
+uint64_t ldm_p(const void *ptr, MemOp mop);
+
+/**
+ * stm_p: Store value to host memory (byteswapping if necessary)
+ *
+ * @ptr: the host pointer to be accessed
+ * @mop: #MemOp mask containing access size and optional byteswapping
+ * @val: the value to store
+ *
+ * Convert @val (byteswapping if necessary) and store it at @ptr in host memory.
+ */
+void stm_p(void *ptr, MemOp mop, uint64_t val);
+
 #endif
diff --git a/util/ldst.c b/util/ldst.c
new file mode 100644
index 00000000000..fb11c073bcb
--- /dev/null
+++ b/util/ldst.c
@@ -0,0 +1,69 @@
+/*
+ * Load/Store helpers for QEMU
+ *
+ * Copyright (c) Linaro Ltd.
+ *
+ * Authors:
+ *   Philippe Mathieu-Daudé
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/ldst_unaligned.h"
+
+uint64_t ldm_p(const void *ptr, MemOp mop)
+{
+    const unsigned size = memop_size(mop);
+    uint64_t val;
+    uint8_t *pval = (uint8_t *)&val;
+
+    if (HOST_BIG_ENDIAN) {
+        pval += sizeof(val) - size;
+    }
+
+    __builtin_memcpy(pval, ptr, size);
+    if (unlikely(mop & MO_BSWAP)) {
+        switch (size) {
+        case sizeof(uint16_t):
+            val = __builtin_bswap16(val);
+            break;
+        case sizeof(uint32_t):
+            val = __builtin_bswap32(val);
+            break;
+        case sizeof(uint64_t):
+            val = __builtin_bswap64(val);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
+    return val;
+}
+
+void stm_p(void *ptr, MemOp mop, uint64_t val)
+{
+    const unsigned size = memop_size(mop);
+    const uint8_t *pval = (const uint8_t *)&val;
+
+    if (HOST_BIG_ENDIAN) {
+        pval += sizeof(val) - size;
+    }
+
+    if (unlikely(mop & MO_BSWAP)) {
+        switch (size) {
+        case sizeof(uint16_t):
+            val = __builtin_bswap16(val);
+            break;
+        case sizeof(uint32_t):
+            val = __builtin_bswap32(val);
+            break;
+        case sizeof(uint64_t):
+            val = __builtin_bswap64(val);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
+    __builtin_memcpy(ptr, pval, size);
+}
diff --git a/util/meson.build b/util/meson.build
index 35029380a37..b695b6e4728 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -38,6 +38,7 @@ util_ss.add(files('envlist.c', 'path.c', 'module.c'))
 util_ss.add(files('event.c'))
 util_ss.add(files('host-utils.c'))
 util_ss.add(files('bitmap.c', 'bitops.c'))
+util_ss.add(files('ldst.c'))
 util_ss.add(files('fifo8.c'))
 util_ss.add(files('cacheflush.c'))
 util_ss.add(files('error.c', 'error-report.c'))
-- 
2.52.0
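
As a usage sketch (illustrative, not part of the patch; note the
(ptr, mop, val) argument order), the helpers round-trip a value through
a host buffer regardless of host endianness:

    uint8_t buf[8];

    /* Store 0x12345678 in big-endian byte order: buf holds 12 34 56 78. */
    stm_p(buf, MO_BEUL, 0x12345678);

    /* Load it back; MO_BEUL implies MO_BSWAP on little-endian hosts. */
    uint32_t v = ldm_p(buf, MO_BEUL);    /* v == 0x12345678 */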


Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Richard Henderson 1 month ago
On 1/10/26 03:50, Philippe Mathieu-Daudé wrote:
> +uint64_t ldm_p(const void *ptr, MemOp mop)
> +{
> +    const unsigned size = memop_size(mop);
> +    uint64_t val;
> +    uint8_t *pval = (uint8_t *)&val;
> +
> +    if (HOST_BIG_ENDIAN) {
> +        pval += sizeof(val) - size;
> +    }
> +
> +    __builtin_memcpy(pval, ptr, size);
> +    if (unlikely(mop & MO_BSWAP)) {
> +        switch (size) {
> +        case sizeof(uint16_t):
> +            val = __builtin_bswap16(val);
> +            break;
> +        case sizeof(uint32_t):
> +            val = __builtin_bswap32(val);
> +            break;
> +        case sizeof(uint64_t):
> +            val = __builtin_bswap64(val);
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +    }
> +    return val;
> +}

I'm not fond of the pointer arithmetic or the code structure.

Perhaps better as

     switch (mop & (MO_BSWAP | MO_SIZE)) {
     case MO_LEUW:
         return lduw_le_p(ptr);
     case MO_BEUW:
         return lduw_be_p(ptr);
     ...
     default:
         g_assert_not_reached();
     }

which would hopefully compile to host endian-swapping load insns like

.L1:
	mov	(ptr), %eax
	ret
.L2:
	movbe	(ptr), %eax
	ret
.L3:
	mov	(ptr), %rax
	ret
.L4:
	movbe	(ptr), %rax
	ret

etc.

And the default-case assert makes sure that we're not passing garbage or a too-large MO_SIZE.


r~
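
Fleshed out, that dispatch might look like the sketch below (not from
the thread; it uses the typed helpers from qemu/bswap.h, casts the
32-bit loads because ldl_*_p return a signed int, and assumes byte
accesses never carry MO_BSWAP):

    uint64_t ldm_p(const void *ptr, MemOp mop)
    {
        switch (mop & (MO_BSWAP | MO_SIZE)) {
        case MO_8:
            return ldub_p(ptr);
        case MO_LEUW:
            return lduw_le_p(ptr);
        case MO_BEUW:
            return lduw_be_p(ptr);
        case MO_LEUL:
            return (uint32_t)ldl_le_p(ptr);   /* ldl_*_p return int */
        case MO_BEUL:
            return (uint32_t)ldl_be_p(ptr);
        case MO_LEUQ:
            return ldq_le_p(ptr);
        case MO_BEUQ:
            return ldq_be_p(ptr);
        default:
            g_assert_not_reached();
        }
    }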

Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Paolo Bonzini 4 weeks ago
On 1/10/26 00:23, Richard Henderson wrote:
> I'm not fond of the pointer arithmetic or the code structure.
> 
> Perhaps better as
> 
>      switch (mop & (MO_BSWAP | MO_SIZE)) {
>      case MO_LEUW:
>          return lduw_le_p(ptr);
>      case MO_BEUW:
>          return lduw_be_p(ptr);
>      ...
>      default:
>          g_assert_not_reached();
>      }
> 
> which would hopefully compile to host endian-swapping load insns like
> 
> .L1:
> 	mov	(ptr), %eax
> 	ret
> .L2:
> 	movbe	(ptr), %eax
> 	ret

It might only do so for 32 bits, because movbe also bundles a free
32->64-bit zero extension, but not for the smaller sizes.  Thinking
about it, I think ldm_p also needs to handle MO_SIGN?  It can be done
all in one with

static inline uint64_t ldm_p(const void *ptr, MemOp mop)
{
     const unsigned size = memop_size(mop);
     uint64_t val;
     uint8_t *pval = (uint8_t *)&val;

     if (HOST_BIG_ENDIAN) {
         pval += sizeof(val) - size;
     }

     assert(size < 8);
     __builtin_memcpy(pval, ptr, size);

     if (mop & MO_BSWAP) {
         val = __builtin_bswap64(val);
     } else if (mop & MO_SIGN) {
         val <<= (64 - 8 * size);
     } else {
         return val;
     }

     if (mop & MO_SIGN) {
         return ((int64_t) val) >> (64 - 8 * size);
     } else {
         return val >> (64 - 8 * size);
     }
}

static inline void stm_p(void *ptr, uint64_t val, MemOp mop)
{
     const unsigned size = memop_size(mop);
     uint8_t *pval = (uint8_t *)&val;

     assert(size < 8);
     if ((mop & MO_BSWAP)) {
         val = __builtin_bswap64(val) >> (64 - size * 8);
     }

     if (HOST_BIG_ENDIAN) {
         pval += sizeof(val) - size;
     }

     __builtin_memcpy(ptr, pval, size);
}

When inlining ldm_p, GCC is able to generate movzx/movsx instructions
but doesn't recognize bswap64 + right shift as a smaller-width bswap +
zero extension; clang does.  Neither is able to generate movbe
instructions, though.

I attach a standalone file I played with.

Paolo
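
For instance (an illustrative call against the sketch above, assuming
buf starts with the bytes 80 00), the MO_SIGN handling gives:

    uint64_t u = ldm_p(buf, MO_BEUW);    /* 0x8000 */
    int64_t  s = ldm_p(buf, MO_BESW);    /* -32768: swapped and sign-extended */
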
Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Philippe Mathieu-Daudé 5 days, 17 hours ago
On 12/1/26 09:56, Paolo Bonzini wrote:
> On 1/10/26 00:23, Richard Henderson wrote:
>> I'm not fond of the pointer arithmetic or the code structure.
>>
>> Perhaps better as
>>
>>      switch (mop & (MO_BSWAP | MO_SIZE)) {
>>      case MO_LEUW:
>>          return lduw_le_p(ptr);
>>      case MO_BEUW:
>>          return lduw_be_p(ptr);
>>      ...
>>      default:
>>          g_assert_not_reached();
>>      }
>>
>> which would hopefully compile to host endian-swapping load insns like
>>
>> .L1:
>>     mov    (ptr), %eax
>>     ret
>> .L2:
>>     movbe    (ptr), %eax
>>     ret
> 
> It might only do so for 32 bits, because movbe also bundles a free
> 32->64-bit zero extension, but not for the smaller sizes.  Thinking
> about it, I think ldm_p also needs to handle MO_SIGN?  It can be done
> all in one with
> 
> static inline uint64_t ldm_p(const void *ptr, MemOp mop)
> {
>      const unsigned size = memop_size(mop);
>      uint64_t val;
>      uint8_t *pval = (uint8_t *)&val;
> 
>      if (HOST_BIG_ENDIAN) {
>          pval += sizeof(val) - size;
>      }
> 
>      assert(size < 8);
>      __builtin_memcpy(pval, ptr, size);
> 
>      if (mop & MO_BSWAP) {
>          val = __builtin_bswap64(val);
>      } else if (mop & MO_SIGN) {

Can't we have both MO_BSWAP && MO_SIGN bits set?

>          val <<= (64 - 8 * size);
>      } else {
>          return val;
>      }
> 
>      if (mop & MO_SIGN) {
>          return ((int64_t) val) >> (64 - 8 * size);
>      } else {
>          return val >> (64 - 8 * size);
>      }
> }

Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Richard Henderson 5 days, 14 hours ago
On 2/4/26 08:44, Philippe Mathieu-Daudé wrote:
> On 12/1/26 09:56, Paolo Bonzini wrote:
>> On 1/10/26 00:23, Richard Henderson wrote:
>>> I'm not fond of the pointer arithmetic or the code structure.
>>>
>>> Perhaps better as
>>>
>>>      switch (mop & (MO_BSWAP | MO_SIZE)) {
>>>      case MO_LEUW:
>>>          return lduw_le_p(ptr);
>>>      case MO_BEUW:
>>>          return lduw_be_p(ptr);
>>>      ...
>>>      default:
>>>          g_assert_not_reached();
>>>      }
>>>
>>> which would hopefully compile to host endian-swapping load insns like
>>>
>>> .L1:
>>>     mov    (ptr), %eax
>>>     ret
>>> .L2:
>>>     movbe    (ptr), %eax
>>>     ret
>>
>> It might only do so for 32 bits, because movbe also bundles a free
>> 32->64-bit zero extension, but not for the smaller sizes.  Thinking
>> about it, I think ldm_p also needs to handle MO_SIGN?  It can be done
>> all in one with
>>
>> static inline uint64_t ldm_p(const void *ptr, MemOp mop)
>> {
>>      const unsigned size = memop_size(mop);
>>      uint64_t val;
>>      uint8_t *pval = (uint8_t *)&val;
>>
>>      if (HOST_BIG_ENDIAN) {
>>          pval += sizeof(val) - size;
>>      }
>>
>>      assert(size < 8);
>>      __builtin_memcpy(pval, ptr, size);
>>
>>      if (mop & MO_BSWAP) {
>>          val = __builtin_bswap64(val);
>>      } else if (mop & MO_SIGN) {
> 
> Can't we have both MO_BSWAP && MO_SIGN bits set?
> 
>>          val <<= (64 - 8 * size);
>>      } else {
>>          return val;
>>      }
>>
>>      if (mop & MO_SIGN) {
>>          return ((int64_t) val) >> (64 - 8 * size);
>>      } else {
>>          return val >> (64 - 8 * size);
>>      }

Yes, but that's handled by always using bswap64 so that the lsb ends up at the msb, and 
the shift takes care of the sign extend.


r~
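
As a worked example (not from the thread): for a MO_BESW load of the
bytes 80 00 on a little-endian host,

    val after memcpy           = 0xGGGG_GGGG_GGGG_0080    /* G = uninitialized */
    val = bswap64(val)         = 0x8000_GGGG_GGGG_GGGG
    (int64_t)val >> (64 - 16)  = 0xffff_ffff_ffff_8000    /* -32768 */

so the uninitialized high bytes are shifted out; only the plain
no-swap, no-sign path returns them as-is.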

Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Paolo Bonzini 1 month ago
On Fri, Jan 9, 2026 at 17:51, Philippe Mathieu-Daudé <philmd@linaro.org> wrote:

> Introduce load/store helpers which take a MemOp argument.
>
> Inspired-by: Paolo Bonzini <pbonzini@redhat.com>
>

That's a new one. :)

I think they should be inline and so should be
address_space_{ld,st}m_internal (maybe even always_inline). The amount of
optimization enabled by constant MemOp is huge.

One other small change:

+    const unsigned size = memop_size(mop);
> +    uint64_t val;
>

This must be initialized to zero; otherwise, for sizes below 8, the
bytes above @size stay uninitialized and are returned as-is on the
no-swap path.

Otherwise the whole series looks great, thanks. Sorry again for delaying my
review.

Paolo

Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Richard Henderson 1 month ago
On 1/10/26 04:23, Paolo Bonzini wrote:
> 
> 
> On Fri, Jan 9, 2026 at 17:51, Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
> 
>     Introduce load/store helpers which take a MemOp argument.
> 
>     Inspired-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> 
> That's a new one. :)
> 
> I think they should be inline and so should be address_space_{ld,st}m_internal (maybe even 
> always_inline). The amount of optimization enabled by constant MemOp is huge.

When are we going to have constant MemOp?


r~

Re: [PATCH v5 02/22] util: Introduce ldm_p() and stm_p() load/store helpers
Posted by Paolo Bonzini 1 month ago
On 1/10/26 00:26, Richard Henderson wrote:
> On 1/10/26 04:23, Paolo Bonzini wrote:
>>
>>
>> On Fri, Jan 9, 2026 at 17:51, Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
>>
>>     Introduce load/store helpers which take a MemOp argument.
>>
>>     Inspired-by: Paolo Bonzini <pbonzini@redhat.com>
>>
>>
>> That's a new one. :)
>>
>> I think they should be inline and so should be
>> address_space_{ld,st}m_internal (maybe even always_inline). The amount
>> of optimization enabled by constant MemOp is huge.
> 
> When are we going to have constant MemOp?

Every time address_space_{ld,st}m_internal is called, it's done with 
constant MemOp.  For example:

uint16_t ADDRESS_SPACE_LD(uw)(ARG1_DECL, hwaddr addr,
                               MemTxAttrs attrs, MemTxResult *result)
{
     return ADDRESS_SPACE_LD_INTERNAL(m)(ARG1, MO_ENDIAN | MO_16,
                                         addr, attrs, result);
}

Paolo
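
A sketch of the folding Paolo describes (load_le16 is an illustrative
name, not an existing QEMU function): once ldm_p is inline, a literal
MemOp makes the size and swap checks compile-time constant:

    static inline uint16_t load_le16(const void *ptr)
    {
        /* MO_LEUW is constant here, so the size dispatch and the
         * MO_BSWAP test fold away, leaving a bare two-byte load
         * (plus a bswap on big-endian hosts). */
        return ldm_p(ptr, MO_LEUW);
    }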