Endianness access is constant between device resets.
Use the field instead of calling the same function.
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
include/hw/virtio/virtio-access.h | 24 ++++++++++++------------
hw/virtio/virtio.c | 4 ++--
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
index cd17d0c87eb..f3b4d0075b5 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -42,7 +42,7 @@ static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
stw_be_p(ptr, v);
} else {
stw_le_p(ptr, v);
@@ -51,7 +51,7 @@ static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
stl_be_p(ptr, v);
} else {
stl_le_p(ptr, v);
@@ -60,7 +60,7 @@ static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
stq_be_p(ptr, v);
} else {
stq_le_p(ptr, v);
@@ -69,7 +69,7 @@ static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
return lduw_be_p(ptr);
} else {
return lduw_le_p(ptr);
@@ -78,7 +78,7 @@ static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
return ldl_be_p(ptr);
} else {
return ldl_le_p(ptr);
@@ -87,7 +87,7 @@ static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
return ldq_be_p(ptr);
} else {
return ldq_le_p(ptr);
@@ -97,9 +97,9 @@ static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
{
#if HOST_BIG_ENDIAN
- return virtio_access_is_big_endian(vdev) ? s : bswap16(s);
+ return vdev->access_is_big_endian ? s : bswap16(s);
#else
- return virtio_access_is_big_endian(vdev) ? bswap16(s) : s;
+ return vdev->access_is_big_endian ? bswap16(s) : s;
#endif
}
@@ -111,9 +111,9 @@ static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
static inline uint32_t virtio_tswap32(VirtIODevice *vdev, uint32_t s)
{
#if HOST_BIG_ENDIAN
- return virtio_access_is_big_endian(vdev) ? s : bswap32(s);
+ return vdev->access_is_big_endian ? s : bswap32(s);
#else
- return virtio_access_is_big_endian(vdev) ? bswap32(s) : s;
+ return vdev->access_is_big_endian ? bswap32(s) : s;
#endif
}
@@ -125,9 +125,9 @@ static inline void virtio_tswap32s(VirtIODevice *vdev, uint32_t *s)
static inline uint64_t virtio_tswap64(VirtIODevice *vdev, uint64_t s)
{
#if HOST_BIG_ENDIAN
- return virtio_access_is_big_endian(vdev) ? s : bswap64(s);
+ return vdev->access_is_big_endian ? s : bswap64(s);
#else
- return virtio_access_is_big_endian(vdev) ? bswap64(s) : s;
+ return vdev->access_is_big_endian ? bswap64(s) : s;
#endif
}
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 242c207a591..1dc60d37cb4 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -220,7 +220,7 @@ static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
MemoryRegionCache *cache,
hwaddr pa)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
return lduw_be_phys_cached(cache, pa);
}
return lduw_le_phys_cached(cache, pa);
@@ -230,7 +230,7 @@ static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
MemoryRegionCache *cache,
hwaddr pa, uint16_t value)
{
- if (virtio_access_is_big_endian(vdev)) {
+ if (vdev->access_is_big_endian) {
stw_be_phys_cached(cache, pa, value);
} else {
stw_le_phys_cached(cache, pa, value);
--
2.52.0
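For readers following the patch on its own: these hunks only consume vdev->access_is_big_endian; the field itself is populated elsewhere in the series. Going by the commit message ("constant between device resets"), a minimal sketch of the caching step assumed here could look like the following. The helper name and the reset hook are assumptions, not part of this patch:

#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-access.h"

/* Hypothetical sketch: cache the endianness once per device reset so the
 * hot-path accessors above can test a plain bool field instead of
 * re-evaluating the helper on every load/store.  Where exactly this runs
 * (reset vs. feature negotiation) is an assumption. */
static void virtio_cache_access_endianness(VirtIODevice *vdev)
{
    vdev->access_is_big_endian = virtio_access_is_big_endian(vdev);
}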
On Mon, Feb 02, 2026 at 12:29:16AM +0100, Philippe Mathieu-Daudé wrote:
> Endianness access is constant between device resets.
> Use the field instead of calling the same function.
yes but the function was inlined and often a NOP. worth mentioning.
>
> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> ---
> include/hw/virtio/virtio-access.h | 24 ++++++++++++------------
> hw/virtio/virtio.c | 4 ++--
> 2 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
> index cd17d0c87eb..f3b4d0075b5 100644
> --- a/include/hw/virtio/virtio-access.h
> +++ b/include/hw/virtio/virtio-access.h
> @@ -42,7 +42,7 @@ static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
>
> static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> stw_be_p(ptr, v);
> } else {
> stw_le_p(ptr, v);
So this is the main extra cost: on an LE host, we now have this branch on every
access where previously it was all optimized out.
Is it measurable? It is worth testing.
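For context on the "often a NOP" remark: on a target that is always little-endian, the helper folds to a compile-time constant, so the old if () vanished at build time, while the field read always costs a load plus a branch. A simplified sketch of that shape (illustrative only, not the exact QEMU source; the real helper also handles legacy cross-endian devices):

static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
{
#if TARGET_BIG_ENDIAN
    /* Illustrative: legacy devices follow the (big-endian) target,
     * VIRTIO 1.0 devices are always little-endian. */
    return !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1);
#else
    /* Little-endian-only target: constant false, so the caller's branch
     * and the big-endian path are eliminated by the compiler. */
    return false;
#endif
}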
> @@ -51,7 +51,7 @@ static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
>
> static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> stl_be_p(ptr, v);
> } else {
> stl_le_p(ptr, v);
> @@ -60,7 +60,7 @@ static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
>
> static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> stq_be_p(ptr, v);
> } else {
> stq_le_p(ptr, v);
> @@ -69,7 +69,7 @@ static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
>
> static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> return lduw_be_p(ptr);
> } else {
> return lduw_le_p(ptr);
and this one.
> @@ -78,7 +78,7 @@ static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
>
> static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> return ldl_be_p(ptr);
> } else {
> return ldl_le_p(ptr);
> @@ -87,7 +87,7 @@ static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
>
> static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> return ldq_be_p(ptr);
> } else {
> return ldq_le_p(ptr);
> @@ -97,9 +97,9 @@ static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
> static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
> {
> #if HOST_BIG_ENDIAN
> - return virtio_access_is_big_endian(vdev) ? s : bswap16(s);
> + return vdev->access_is_big_endian ? s : bswap16(s);
> #else
> - return virtio_access_is_big_endian(vdev) ? bswap16(s) : s;
> + return vdev->access_is_big_endian ? bswap16(s) : s;
> #endif
> }
>
> @@ -111,9 +111,9 @@ static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
> static inline uint32_t virtio_tswap32(VirtIODevice *vdev, uint32_t s)
> {
> #if HOST_BIG_ENDIAN
> - return virtio_access_is_big_endian(vdev) ? s : bswap32(s);
> + return vdev->access_is_big_endian ? s : bswap32(s);
> #else
> - return virtio_access_is_big_endian(vdev) ? bswap32(s) : s;
> + return vdev->access_is_big_endian ? bswap32(s) : s;
> #endif
> }
>
> @@ -125,9 +125,9 @@ static inline void virtio_tswap32s(VirtIODevice *vdev, uint32_t *s)
> static inline uint64_t virtio_tswap64(VirtIODevice *vdev, uint64_t s)
> {
> #if HOST_BIG_ENDIAN
> - return virtio_access_is_big_endian(vdev) ? s : bswap64(s);
> + return vdev->access_is_big_endian ? s : bswap64(s);
> #else
> - return virtio_access_is_big_endian(vdev) ? bswap64(s) : s;
> + return vdev->access_is_big_endian ? bswap64(s) : s;
> #endif
> }
>
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 242c207a591..1dc60d37cb4 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -220,7 +220,7 @@ static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
> MemoryRegionCache *cache,
> hwaddr pa)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> return lduw_be_phys_cached(cache, pa);
> }
> return lduw_le_phys_cached(cache, pa);
> @@ -230,7 +230,7 @@ static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
> MemoryRegionCache *cache,
> hwaddr pa, uint16_t value)
> {
> - if (virtio_access_is_big_endian(vdev)) {
> + if (vdev->access_is_big_endian) {
> stw_be_phys_cached(cache, pa, value);
> } else {
> stw_le_phys_cached(cache, pa, value);
> --
> 2.52.0
"Michael S. Tsirkin" <mst@redhat.com> writes:
> On Mon, Feb 02, 2026 at 12:29:16AM +0100, Philippe Mathieu-Daudé wrote:
>> Endianness access is constant between device resets.
>> Use the field instead of calling the same function.
>
> yes but the function was inlined and often a NOP. worth mentioning.
>
>>
>> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
>> ---
>> include/hw/virtio/virtio-access.h | 24 ++++++++++++------------
>> hw/virtio/virtio.c | 4 ++--
>> 2 files changed, 14 insertions(+), 14 deletions(-)
>>
>> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
>> index cd17d0c87eb..f3b4d0075b5 100644
>> --- a/include/hw/virtio/virtio-access.h
>> +++ b/include/hw/virtio/virtio-access.h
>> @@ -42,7 +42,7 @@ static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
>>
>> static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> stw_be_p(ptr, v);
>> } else {
>> stw_le_p(ptr, v);
>
>
> So this is the main extra cost: on an LE host, we now have this branch on every
> access where previously it was all optimized out.
>
> Is it measureable? It is worth testing.
I suspect not given this is a well predicted branch on the control
plane. But we can certainly try an measure it. Do we have any virtio
performance benchmarks?
We only have a few in meson:
./pyvenv/bin/meson test --benchmark --list
bufferiszero-bench
benchmark-crypto-hash
benchmark-crypto-hmac
benchmark-crypto-cipher
benchmark-crypto-akcipher
>
>
>
>> @@ -51,7 +51,7 @@ static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
>>
>> static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> stl_be_p(ptr, v);
>> } else {
>> stl_le_p(ptr, v);
>> @@ -60,7 +60,7 @@ static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
>>
>> static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> stq_be_p(ptr, v);
>> } else {
>> stq_le_p(ptr, v);
>> @@ -69,7 +69,7 @@ static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
>>
>> static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> return lduw_be_p(ptr);
>> } else {
>> return lduw_le_p(ptr);
>
> and this one.
>
>
>> @@ -78,7 +78,7 @@ static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
>>
>> static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> return ldl_be_p(ptr);
>> } else {
>> return ldl_le_p(ptr);
>> @@ -87,7 +87,7 @@ static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
>>
>> static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> return ldq_be_p(ptr);
>> } else {
>> return ldq_le_p(ptr);
>> @@ -97,9 +97,9 @@ static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
>> static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
>> {
>> #if HOST_BIG_ENDIAN
>> - return virtio_access_is_big_endian(vdev) ? s : bswap16(s);
>> + return vdev->access_is_big_endian ? s : bswap16(s);
>> #else
>> - return virtio_access_is_big_endian(vdev) ? bswap16(s) : s;
>> + return vdev->access_is_big_endian ? bswap16(s) : s;
>> #endif
>> }
>>
>> @@ -111,9 +111,9 @@ static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
>> static inline uint32_t virtio_tswap32(VirtIODevice *vdev, uint32_t s)
>> {
>> #if HOST_BIG_ENDIAN
>> - return virtio_access_is_big_endian(vdev) ? s : bswap32(s);
>> + return vdev->access_is_big_endian ? s : bswap32(s);
>> #else
>> - return virtio_access_is_big_endian(vdev) ? bswap32(s) : s;
>> + return vdev->access_is_big_endian ? bswap32(s) : s;
>> #endif
>> }
>>
>> @@ -125,9 +125,9 @@ static inline void virtio_tswap32s(VirtIODevice *vdev, uint32_t *s)
>> static inline uint64_t virtio_tswap64(VirtIODevice *vdev, uint64_t s)
>> {
>> #if HOST_BIG_ENDIAN
>> - return virtio_access_is_big_endian(vdev) ? s : bswap64(s);
>> + return vdev->access_is_big_endian ? s : bswap64(s);
>> #else
>> - return virtio_access_is_big_endian(vdev) ? bswap64(s) : s;
>> + return vdev->access_is_big_endian ? bswap64(s) : s;
>> #endif
>> }
>>
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 242c207a591..1dc60d37cb4 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -220,7 +220,7 @@ static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
>> MemoryRegionCache *cache,
>> hwaddr pa)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> return lduw_be_phys_cached(cache, pa);
>> }
>> return lduw_le_phys_cached(cache, pa);
>> @@ -230,7 +230,7 @@ static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
>> MemoryRegionCache *cache,
>> hwaddr pa, uint16_t value)
>> {
>> - if (virtio_access_is_big_endian(vdev)) {
>> + if (vdev->access_is_big_endian) {
>> stw_be_phys_cached(cache, pa, value);
>> } else {
>> stw_le_phys_cached(cache, pa, value);
>> --
>> 2.52.0
--
Alex Bennée
Virtualisation Tech Lead @ Linaro
On Mon, Feb 02, 2026 at 01:08:25PM +0000, Alex Bennée wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
>
> > On Mon, Feb 02, 2026 at 12:29:16AM +0100, Philippe Mathieu-Daudé wrote:
> >> Endianness access is constant between device resets.
> >> Use the field instead of calling the same function.
> >
> > yes but the function was inlined and often a NOP. worth mentioning.
> >
> >>
> >> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> >> ---
> >> include/hw/virtio/virtio-access.h | 24 ++++++++++++------------
> >> hw/virtio/virtio.c | 4 ++--
> >> 2 files changed, 14 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
> >> index cd17d0c87eb..f3b4d0075b5 100644
> >> --- a/include/hw/virtio/virtio-access.h
> >> +++ b/include/hw/virtio/virtio-access.h
> >> @@ -42,7 +42,7 @@ static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
> >>
> >> static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> stw_be_p(ptr, v);
> >> } else {
> >> stw_le_p(ptr, v);
> >
> >
> > So this is the main extra cost: on an LE host, we now have this branch on every
> > access where previously it was all optimized out.
> >
> > Is it measureable? It is worth testing.
>
> I suspect not given this is a well predicted branch on the control
> plane.
Actually you are right. I meant the cached variants below. Not this one.
> But we can certainly try an measure it. Do we have any virtio
> performance benchmarks?
>
> We only have a few in meson:
>
> ./pyvenv/bin/meson test --benchmark --list
> bufferiszero-bench
> benchmark-crypto-hash
> benchmark-crypto-hmac
> benchmark-crypto-cipher
> benchmark-crypto-akcipher
I usually use fio but whatever blk maintainers use is fine by me.
> >
> >
> >
> >> @@ -51,7 +51,7 @@ static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
> >>
> >> static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> stl_be_p(ptr, v);
> >> } else {
> >> stl_le_p(ptr, v);
> >> @@ -60,7 +60,7 @@ static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
> >>
> >> static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> stq_be_p(ptr, v);
> >> } else {
> >> stq_le_p(ptr, v);
> >> @@ -69,7 +69,7 @@ static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
> >>
> >> static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> return lduw_be_p(ptr);
> >> } else {
> >> return lduw_le_p(ptr);
> >
> > and this one.
> >
> >
> >> @@ -78,7 +78,7 @@ static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
> >>
> >> static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> return ldl_be_p(ptr);
> >> } else {
> >> return ldl_le_p(ptr);
> >> @@ -87,7 +87,7 @@ static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
> >>
> >> static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> return ldq_be_p(ptr);
> >> } else {
> >> return ldq_le_p(ptr);
> >> @@ -97,9 +97,9 @@ static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
> >> static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
> >> {
> >> #if HOST_BIG_ENDIAN
> >> - return virtio_access_is_big_endian(vdev) ? s : bswap16(s);
> >> + return vdev->access_is_big_endian ? s : bswap16(s);
> >> #else
> >> - return virtio_access_is_big_endian(vdev) ? bswap16(s) : s;
> >> + return vdev->access_is_big_endian ? bswap16(s) : s;
> >> #endif
> >> }
> >>
> >> @@ -111,9 +111,9 @@ static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
> >> static inline uint32_t virtio_tswap32(VirtIODevice *vdev, uint32_t s)
> >> {
> >> #if HOST_BIG_ENDIAN
> >> - return virtio_access_is_big_endian(vdev) ? s : bswap32(s);
> >> + return vdev->access_is_big_endian ? s : bswap32(s);
> >> #else
> >> - return virtio_access_is_big_endian(vdev) ? bswap32(s) : s;
> >> + return vdev->access_is_big_endian ? bswap32(s) : s;
> >> #endif
> >> }
> >>
> >> @@ -125,9 +125,9 @@ static inline void virtio_tswap32s(VirtIODevice *vdev, uint32_t *s)
> >> static inline uint64_t virtio_tswap64(VirtIODevice *vdev, uint64_t s)
> >> {
> >> #if HOST_BIG_ENDIAN
> >> - return virtio_access_is_big_endian(vdev) ? s : bswap64(s);
> >> + return vdev->access_is_big_endian ? s : bswap64(s);
> >> #else
> >> - return virtio_access_is_big_endian(vdev) ? bswap64(s) : s;
> >> + return vdev->access_is_big_endian ? bswap64(s) : s;
> >> #endif
> >> }
> >>
> >> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> >> index 242c207a591..1dc60d37cb4 100644
> >> --- a/hw/virtio/virtio.c
> >> +++ b/hw/virtio/virtio.c
> >> @@ -220,7 +220,7 @@ static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
> >> MemoryRegionCache *cache,
> >> hwaddr pa)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> return lduw_be_phys_cached(cache, pa);
> >> }
> >> return lduw_le_phys_cached(cache, pa);
> >> @@ -230,7 +230,7 @@ static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
> >> MemoryRegionCache *cache,
> >> hwaddr pa, uint16_t value)
> >> {
> >> - if (virtio_access_is_big_endian(vdev)) {
> >> + if (vdev->access_is_big_endian) {
> >> stw_be_phys_cached(cache, pa, value);
> >> } else {
> >> stw_le_phys_cached(cache, pa, value);
> >> --
> >> 2.52.0
I meant these ones.
> --
> Alex Bennée
> Virtualisation Tech Lead @ Linaro
On Mon, Feb 02, 2026 at 11:04:08AM -0500, Michael S. Tsirkin wrote:
> On Mon, Feb 02, 2026 at 01:08:25PM +0000, Alex Bennée wrote:
> > "Michael S. Tsirkin" <mst@redhat.com> writes:
> >
> > > On Mon, Feb 02, 2026 at 12:29:16AM +0100, Philippe Mathieu-Daudé wrote:
> > >> Endianness access is constant between device resets.
> > >> Use the field instead of calling the same function.
> > >
> > > yes but the function was inlined and often a NOP. worth mentioning.
> > >
> > >>
> > >> Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
> > >> ---
> > >> include/hw/virtio/virtio-access.h | 24 ++++++++++++------------
> > >> hw/virtio/virtio.c | 4 ++--
> > >> 2 files changed, 14 insertions(+), 14 deletions(-)
> > >>
> > >> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
> > >> index cd17d0c87eb..f3b4d0075b5 100644
> > >> --- a/include/hw/virtio/virtio-access.h
> > >> +++ b/include/hw/virtio/virtio-access.h
> > >> @@ -42,7 +42,7 @@ static inline bool virtio_access_is_big_endian(VirtIODevice *vdev)
> > >>
> > >> static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> stw_be_p(ptr, v);
> > >> } else {
> > >> stw_le_p(ptr, v);
> > >
> > >
> > > So this is the main extra cost: on an LE host, we now have this branch on every
> > > access where previously it was all optimized out.
> > >
> > > Is it measureable? It is worth testing.
> >
> > I suspect not given this is a well predicted branch on the control
> > plane.
>
> Actually you are right. I meant the cached variants below. Not this one.
>
>
>
>
>
> > But we can certainly try an measure it. Do we have any virtio
> > performance benchmarks?
> >
> > We only have a few in meson:
> >
> > ./pyvenv/bin/meson test --benchmark --list
> > bufferiszero-bench
> > benchmark-crypto-hash
> > benchmark-crypto-hmac
> > benchmark-crypto-cipher
> > benchmark-crypto-akcipher
>
> I usually use fio but whatever blk maintainers use is fine by me.
This command-line lets you benchmark virtio-blk without actual I/O
slowing down the request processing:
qemu-system-x86_64 \
-M accel=kvm \
-cpu host \
-m 4G \
--blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
--blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
--object iothread,id=iothread0 \
--device virtio-blk-pci,drive=drive0,iothread=iothread0 \
--device virtio-blk-pci,drive=drive1,iothread=iothread0
Here is a fio command-line for 4 KiB random reads:
fio \
--ioengine=libaio \
--direct=1 \
--runtime=30 \
--ramp_time=10 \
--rw=randread \
--bs=4k \
--iodepth=128 \
--filename=/dev/vdb \
--name=randread
This is just a single vCPU, but it should be enough to see if there is
any difference in I/O Operations Per Second (IOPS) or efficiency
(IOPS/CPU utilization).
>
>
> > >
> > >
> > >
> > >> @@ -51,7 +51,7 @@ static inline void virtio_stw_p(VirtIODevice *vdev, void *ptr, uint16_t v)
> > >>
> > >> static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> stl_be_p(ptr, v);
> > >> } else {
> > >> stl_le_p(ptr, v);
> > >> @@ -60,7 +60,7 @@ static inline void virtio_stl_p(VirtIODevice *vdev, void *ptr, uint32_t v)
> > >>
> > >> static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> stq_be_p(ptr, v);
> > >> } else {
> > >> stq_le_p(ptr, v);
> > >> @@ -69,7 +69,7 @@ static inline void virtio_stq_p(VirtIODevice *vdev, void *ptr, uint64_t v)
> > >>
> > >> static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> return lduw_be_p(ptr);
> > >> } else {
> > >> return lduw_le_p(ptr);
> > >
> > > and this one.
> > >
> > >
> > >> @@ -78,7 +78,7 @@ static inline int virtio_lduw_p(VirtIODevice *vdev, const void *ptr)
> > >>
> > >> static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> return ldl_be_p(ptr);
> > >> } else {
> > >> return ldl_le_p(ptr);
> > >> @@ -87,7 +87,7 @@ static inline int virtio_ldl_p(VirtIODevice *vdev, const void *ptr)
> > >>
> > >> static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> return ldq_be_p(ptr);
> > >> } else {
> > >> return ldq_le_p(ptr);
> > >> @@ -97,9 +97,9 @@ static inline uint64_t virtio_ldq_p(VirtIODevice *vdev, const void *ptr)
> > >> static inline uint16_t virtio_tswap16(VirtIODevice *vdev, uint16_t s)
> > >> {
> > >> #if HOST_BIG_ENDIAN
> > >> - return virtio_access_is_big_endian(vdev) ? s : bswap16(s);
> > >> + return vdev->access_is_big_endian ? s : bswap16(s);
> > >> #else
> > >> - return virtio_access_is_big_endian(vdev) ? bswap16(s) : s;
> > >> + return vdev->access_is_big_endian ? bswap16(s) : s;
> > >> #endif
> > >> }
> > >>
> > >> @@ -111,9 +111,9 @@ static inline void virtio_tswap16s(VirtIODevice *vdev, uint16_t *s)
> > >> static inline uint32_t virtio_tswap32(VirtIODevice *vdev, uint32_t s)
> > >> {
> > >> #if HOST_BIG_ENDIAN
> > >> - return virtio_access_is_big_endian(vdev) ? s : bswap32(s);
> > >> + return vdev->access_is_big_endian ? s : bswap32(s);
> > >> #else
> > >> - return virtio_access_is_big_endian(vdev) ? bswap32(s) : s;
> > >> + return vdev->access_is_big_endian ? bswap32(s) : s;
> > >> #endif
> > >> }
> > >>
> > >> @@ -125,9 +125,9 @@ static inline void virtio_tswap32s(VirtIODevice *vdev, uint32_t *s)
> > >> static inline uint64_t virtio_tswap64(VirtIODevice *vdev, uint64_t s)
> > >> {
> > >> #if HOST_BIG_ENDIAN
> > >> - return virtio_access_is_big_endian(vdev) ? s : bswap64(s);
> > >> + return vdev->access_is_big_endian ? s : bswap64(s);
> > >> #else
> > >> - return virtio_access_is_big_endian(vdev) ? bswap64(s) : s;
> > >> + return vdev->access_is_big_endian ? bswap64(s) : s;
> > >> #endif
> > >> }
> > >>
> > >> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > >> index 242c207a591..1dc60d37cb4 100644
> > >> --- a/hw/virtio/virtio.c
> > >> +++ b/hw/virtio/virtio.c
> > >> @@ -220,7 +220,7 @@ static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
> > >> MemoryRegionCache *cache,
> > >> hwaddr pa)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> return lduw_be_phys_cached(cache, pa);
> > >> }
> > >> return lduw_le_phys_cached(cache, pa);
> > >> @@ -230,7 +230,7 @@ static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
> > >> MemoryRegionCache *cache,
> > >> hwaddr pa, uint16_t value)
> > >> {
> > >> - if (virtio_access_is_big_endian(vdev)) {
> > >> + if (vdev->access_is_big_endian) {
> > >> stw_be_phys_cached(cache, pa, value);
> > >> } else {
> > >> stw_le_phys_cached(cache, pa, value);
> > >> --
> > >> 2.52.0
>
>
> I meant these ones.
>
> > --
> > Alex Bennée
> > Virtualisation Tech Lead @ Linaro
>
On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
>
> This command-line lets you benchmark virtio-blk without actual I/O
> slowing down the request processing:
>
> qemu-system-x86_64 \
> -M accel=kvm \
> -cpu host \
> -m 4G \
> --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
> --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
> --object iothread,id=iothread0 \
> --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
> --device virtio-blk-pci,drive=drive1,iothread=iothread0
>
> Here is a fio command-line for 4 KiB random reads:
>
> fio \
> --ioengine=libaio \
> --direct=1 \
> --runtime=30 \
> --ramp_time=10 \
> --rw=randread \
> --bs=4k \
> --iodepth=128 \
> --filename=/dev/vdb \
> --name=randread
>
> This is just a single vCPU, but it should be enough to see if there is
> any difference in I/O Operations Per Second (IOPS) or efficiency
> (IOPS/CPU utilization).
>

Thanks very much for the info Stefan. I didn't even know null-co
blockdev, so it definitely would have taken some time to find all this.

For what it's worth, I automated the benchmark here (need podman for
build), so it can be reused for future changes:
https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark

./build.sh && ./run.sh path/to/qemu-system-x86_64

My initial testing showed a 50% slow down, which was more than
surprising. After profiling, the extra time is spent here:
https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30

When we merged target-info, there have been several versions over a long
time, and I was 100% sure we were updating the target_info structure,
instead of reparsing the target_name every time. Unfortunately, that's
not the case. I'll fix that.

With that fix, there is no difference in performance (<1%).

I'll respin a v4 with the target info fix, initial v1 changes and
benchmark results.

Thanks for pointing the performance issue, there was one for sure.

Regards,
Pierrick
On 2/2/26 20:25, Pierrick Bouvier wrote:
> My initial testing showed a 50% slow down, which was more than
> surprising. After profiling, the extra time is spent here:
> https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
>
> When we merged target-info, there have been several versions over a long
> time, and I was 100% sure we were updating the target_info structure,
> instead of reparsing the target_name every time. Unfortunately, that's
> not the case. I'll fix that.

We (Richard, you, myself) knew this patch was bad performance wise, but
back then it was decided to be good enough for the first conversion of
build-time target specific definitions to runtime methods.

As a second step, Richard asked for the target-info.c file to get
absorbed with page-vary-common.c, compiling it with CFLAG=-fno-lto.

This wasn't a priority until TargetInfo API got involved in hot path.
On 2/2/26 11:25 AM, Pierrick Bouvier wrote:
> On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
>>
>> This command-line lets you benchmark virtio-blk without actual I/O
>> slowing down the request processing:
>>
>> qemu-system-x86_64 \
>> -M accel=kvm \
>> -cpu host \
>> -m 4G \
>> --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
>> --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
>> --object iothread,id=iothread0 \
>> --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
>> --device virtio-blk-pci,drive=drive1,iothread=iothread0
>>
>> Here is a fio command-line for 4 KiB random reads:
>>
>> fio \
>> --ioengine=libaio \
>> --direct=1 \
>> --runtime=30 \
>> --ramp_time=10 \
>> --rw=randread \
>> --bs=4k \
>> --iodepth=128 \
>> --filename=/dev/vdb \
>> --name=randread
>>
>> This is just a single vCPU, but it should be enough to see if there is
>> any difference in I/O Operations Per Second (IOPS) or efficiency
>> (IOPS/CPU utilization).
>>
> Thanks very much for the info Stefan. I didn't even know null-co
> blockdev, so it definitely would have taken some time to find all this.
>
> For what it's worth, I automated the benchmark here (need podman for
> build), so it can be reused for future changes:
> https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark
>
> ./build.sh && ./run.sh path/to/qemu-system-x86_64
>
> My initial testing showed a 50% slow down, which was more than
> surprising. After profiling, the extra time is spent here:
> https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
>
> When we merged target-info, there have been several versions over a long
> time, and I was 100% sure we were updating the target_info structure,
> instead of reparsing the target_name every time. Unfortunately, that's
> not the case. I'll fix that.
>
> With that fix, there is no difference in performance (<1%).
>
> I'll respin a v4 with the target info fix, initial v1 changes and
> benchmark results.
>
> Thanks for pointing the performance issue, there was one for sure.
>
> Regards,
> Pierrick

After proper benchmarking, I get those results (in kIOPS) over 20 runs,
all including target_arch fix I mentioned.

reference: mean=239.2 var=12.16
v1: mean=232.2 var=37.06
v4-wip (optimized virtio_access_is_big_endian): mean=235.05 var=18.64

So basically, we have a 1.7% performance hit on a torture benchmark.
Is that acceptable for you, or should we dig more?

Regards,
Pierrick
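(For clarity on the figures above, taking the means: v4-wip is (239.2 - 235.05) / 239.2, roughly 1.7% below the reference, which appears to be where the 1.7% hit comes from, while v1 sits about (239.2 - 232.2) / 239.2, roughly 2.9% below it.)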
On Mon, Feb 02, 2026 at 07:22:16PM -0800, Pierrick Bouvier wrote:
> On 2/2/26 11:25 AM, Pierrick Bouvier wrote:
> > On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
> > >
> > > This command-line lets you benchmark virtio-blk without actual I/O
> > > slowing down the request processing:
> > >
> > > qemu-system-x86_64 \
> > > -M accel=kvm \
> > > -cpu host \
> > > -m 4G \
> > > --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
> > > --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
> > > --object iothread,id=iothread0 \
> > > --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
> > > --device virtio-blk-pci,drive=drive1,iothread=iothread0
> > >
> > > Here is a fio command-line for 4 KiB random reads:
> > >
> > > fio \
> > > --ioengine=libaio \
> > > --direct=1 \
> > > --runtime=30 \
> > > --ramp_time=10 \
> > > --rw=randread \
> > > --bs=4k \
> > > --iodepth=128 \
> > > --filename=/dev/vdb \
> > > --name=randread
> > >
> > > This is just a single vCPU, but it should be enough to see if there is
> > > any difference in I/O Operations Per Second (IOPS) or efficiency
> > > (IOPS/CPU utilization).
> > >
> > Thanks very much for the info Stefan. I didn't even know null-co
> > blockdev, so it definitely would have taken some time to find all this.
> >
> > For what it's worth, I automated the benchmark here (need podman for
> > build), so it can be reused for future changes:
> > https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark
> >
> > ./build.sh && ./run.sh path/to/qemu-system-x86_64
> >
> > My initial testing showed a 50% slow down, which was more than
> > surprising. After profiling, the extra time is spent here:
> > https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
> >
> > When we merged target-info, there have been several versions over a long
> > time, and I was 100% sure we were updating the target_info structure,
> > instead of reparsing the target_name every time. Unfortunately, that's
> > not the case. I'll fix that.
> >
> > With that fix, there is no difference in performance (<1%).
> >
> > I'll respin a v4 with the target info fix, initial v1 changes and
> > benchmark results.
> >
> > Thanks for pointing the performance issue, there was one for sure.
> >
> > Regards,
> > Pierrick
>
> After proper benchmarking, I get those results (in kIOPS) over 20 runs, all
> including target_arch fix I mentioned.
>
> reference: mean=239.2 var=12.16
> v1: mean=232.2 var=37.06
> v4-wip (optimized virtio_access_is_big_endian): mean=235.05 var=18.64
>
> So basically, we have a 1.7% performance hit on a torture benchmark.
> Is that acceptable for you, or should we dig more?
>
> Regards,
> Pierrick
Could we use some kind of linker trick?
Split just the ring manipulation code from
virtio.c and keep it target specific.
Or it could be that even just these two would be enough:
static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
MemoryRegionCache *cache,
hwaddr pa)
{
if (virtio_access_is_big_endian(vdev)) {
return lduw_be_phys_cached(cache, pa);
}
return lduw_le_phys_cached(cache, pa);
}
static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
MemoryRegionCache *cache,
hwaddr pa, uint16_t value)
{
if (virtio_access_is_big_endian(vdev)) {
stw_be_phys_cached(cache, pa, value);
} else {
stw_le_phys_cached(cache, pa, value);
}
}
--
MST
On 2/3/26 3:07 AM, Michael S. Tsirkin wrote:
> On Mon, Feb 02, 2026 at 07:22:16PM -0800, Pierrick Bouvier wrote:
>> On 2/2/26 11:25 AM, Pierrick Bouvier wrote:
>>> On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
>>>>
>>>> This command-line lets you benchmark virtio-blk without actual I/O
>>>> slowing down the request processing:
>>>>
>>>> qemu-system-x86_64 \
>>>> -M accel=kvm \
>>>> -cpu host \
>>>> -m 4G \
>>>> --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
>>>> --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
>>>> --object iothread,id=iothread0 \
>>>> --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
>>>> --device virtio-blk-pci,drive=drive1,iothread=iothread0
>>>>
>>>> Here is a fio command-line for 4 KiB random reads:
>>>>
>>>> fio \
>>>> --ioengine=libaio \
>>>> --direct=1 \
>>>> --runtime=30 \
>>>> --ramp_time=10 \
>>>> --rw=randread \
>>>> --bs=4k \
>>>> --iodepth=128 \
>>>> --filename=/dev/vdb \
>>>> --name=randread
>>>>
>>>> This is just a single vCPU, but it should be enough to see if there is
>>>> any difference in I/O Operations Per Second (IOPS) or efficiency
>>>> (IOPS/CPU utilization).
>>>>
>>> Thanks very much for the info Stefan. I didn't even know null-co
>>> blockdev, so it definitely would have taken some time to find all this.
>>>
>>> For what it's worth, I automated the benchmark here (need podman for
>>> build), so it can be reused for future changes:
>>> https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark
>>>
>>> ./build.sh && ./run.sh path/to/qemu-system-x86_64
>>>
>>> My initial testing showed a 50% slow down, which was more than
>>> surprising. After profiling, the extra time is spent here:
>>> https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
>>>
>>> When we merged target-info, there have been several versions over a long
>>> time, and I was 100% sure we were updating the target_info structure,
>>> instead of reparsing the target_name every time. Unfortunately, that's
>>> not the case. I'll fix that.
>>>
>>> With that fix, there is no difference in performance (<1%).
>>>
>>> I'll respin a v4 with the target info fix, initial v1 changes and
>>> benchmark results.
>>>
>>> Thanks for pointing the performance issue, there was one for sure.
>>>
>>> Regards,
>>> Pierrick
>>
>> After proper benchmarking, I get those results (in kIOPS) over 20 runs, all
>> including target_arch fix I mentioned.
>>
>> reference: mean=239.2 var=12.16
>> v1: mean=232.2 var=37.06
>> v4-wip (optimized virtio_access_is_big_endian): mean=235.05 var=18.64
>>
>> So basically, we have a 1.7% performance hit on a torture benchmark.
>> Is that acceptable for you, or should we dig more?
>>
>> Regards,
>> Pierrick
>
> Could we use some kind of linker trick?
> Split just the ring manipulation code from
> virtio.c and keep it target specific.
>
While it would allow us to extract the common part, unfortunately, as long
as we keep a target-specific part, we still have a problem to solve.

Indeed, some symbols (maybe the ring manipulation code, or just the functions
mentioned below) will still be duplicated, and in the end, when we link
two targets needing different versions, we'll have a conflict.
Since our goal for a single-binary is to have at least arm and another
base architecture, we are in this situation now.
So the only solutions left are:
- accept the 1.7% regression, and we can provide more "real world"
benchmarks to show that it gets absorbed in all existing layers
- compute virtio_access_is_big_endian once and reuse this everywhere:
that's what Philippe implemented in his patches.
What would you prefer?
>
> Or it could be that even just these two would be enough:
>
> static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
> MemoryRegionCache *cache,
> hwaddr pa)
> {
> if (virtio_access_is_big_endian(vdev)) {
> return lduw_be_phys_cached(cache, pa);
> }
> return lduw_le_phys_cached(cache, pa);
> }
>
> static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
> MemoryRegionCache *cache,
> hwaddr pa, uint16_t value)
> {
> if (virtio_access_is_big_endian(vdev)) {
> stw_be_phys_cached(cache, pa, value);
> } else {
> stw_le_phys_cached(cache, pa, value);
> }
> }
>
>
>
>
>
>
Regards,
Pierrick
On Tue, Feb 03, 2026 at 09:31:04AM -0800, Pierrick Bouvier wrote:
> On 2/3/26 3:07 AM, Michael S. Tsirkin wrote:
> > On Mon, Feb 02, 2026 at 07:22:16PM -0800, Pierrick Bouvier wrote:
> > > On 2/2/26 11:25 AM, Pierrick Bouvier wrote:
> > > > On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
> > > > >
> > > > > This command-line lets you benchmark virtio-blk without actual I/O
> > > > > slowing down the request processing:
> > > > >
> > > > > qemu-system-x86_64 \
> > > > > -M accel=kvm \
> > > > > -cpu host \
> > > > > -m 4G \
> > > > > --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
> > > > > --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
> > > > > --object iothread,id=iothread0 \
> > > > > --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
> > > > > --device virtio-blk-pci,drive=drive1,iothread=iothread0
> > > > >
> > > > > Here is a fio command-line for 4 KiB random reads:
> > > > >
> > > > > fio \
> > > > > --ioengine=libaio \
> > > > > --direct=1 \
> > > > > --runtime=30 \
> > > > > --ramp_time=10 \
> > > > > --rw=randread \
> > > > > --bs=4k \
> > > > > --iodepth=128 \
> > > > > --filename=/dev/vdb \
> > > > > --name=randread
> > > > >
> > > > > This is just a single vCPU, but it should be enough to see if there is
> > > > > any difference in I/O Operations Per Second (IOPS) or efficiency
> > > > > (IOPS/CPU utilization).
> > > > >
> > > > Thanks very much for the info Stefan. I didn't even know null-co
> > > > blockdev, so it definitely would have taken some time to find all this.
> > > >
> > > > For what it's worth, I automated the benchmark here (need podman for
> > > > build), so it can be reused for future changes:
> > > > https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark
> > > >
> > > > ./build.sh && ./run.sh path/to/qemu-system-x86_64
> > > >
> > > > My initial testing showed a 50% slow down, which was more than
> > > > surprising. After profiling, the extra time is spent here:
> > > > https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
> > > >
> > > > When we merged target-info, there have been several versions over a long
> > > > time, and I was 100% sure we were updating the target_info structure,
> > > > instead of reparsing the target_name every time. Unfortunately, that's
> > > > not the case. I'll fix that.
> > > >
> > > > With that fix, there is no difference in performance (<1%).
> > > >
> > > > I'll respin a v4 with the target info fix, initial v1 changes and
> > > > benchmark results.
> > > >
> > > > Thanks for pointing the performance issue, there was one for sure.
> > > >
> > > > Regards,
> > > > Pierrick
> > >
> > > After proper benchmarking, I get those results (in kIOPS) over 20 runs, all
> > > including target_arch fix I mentioned.
> > >
> > > reference: mean=239.2 var=12.16
> > > v1: mean=232.2 var=37.06
> > > v4-wip (optimized virtio_access_is_big_endian): mean=235.05 var=18.64
> > >
> > > So basically, we have a 1.7% performance hit on a torture benchmark.
> > > Is that acceptable for you, or should we dig more?
> > >
> > > Regards,
> > > Pierrick
> >
> > Could we use some kind of linker trick?
> > Split just the ring manipulation code from
> > virtio.c and keep it target specific.
> >
>
> While it would allow to extract the common part, unfortunately, as long as
> we keep a target specific part, we still have a problem to solve.
>
>
> Indeed, some symbols (may be ring manipulation code, or just functions
> mentioned below) will still be duplicated, and in the end, when we'll link
> two targets needing different versions, we'll have a conflict.
Shrug. The generic slower one can be made stronger and be used then.
This is not a new or unsolvable problem.
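If "made stronger" refers to weak/strong symbol overriding, the mechanism being alluded to looks roughly like this: the common code provides a weak, runtime-checked definition, and a target-specific object may provide a strong one that the linker prefers, so a little-endian-only build keeps the constant version while a multi-target build falls back to the generic one. A generic illustration with made-up names (vring-endian.h, VDev, vring_is_big_endian are all hypothetical, this is not QEMU code):

/* common/vring-endian.c -- weak, generic fallback decided at runtime. */
#include <stdbool.h>
#include "vring-endian.h"   /* hypothetical header declaring VDev and the prototype */

bool __attribute__((weak)) vring_is_big_endian(const VDev *v)
{
    return v->access_is_big_endian;   /* generic, slower path */
}

/* target-le/vring-endian.c -- optional strong override for a build that
 * only links little-endian targets: the linker prefers this definition
 * over the weak one, and callers' endianness branches can then fold. */
#include <stdbool.h>
#include "vring-endian.h"

bool vring_is_big_endian(const VDev *v)
{
    (void)v;
    return false;
}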
> Since our goal for a single-binary is to have at least arm and another base
> architecture, we are in this situation now.
>
> So the only solutions left are:
> - accept the 1.7% regression, and we can provide more "real world"
> benchmarks to show that it gets absorbed in all existing layers
> - compute virtio_access_is_big_endian once and reuse this everywhere: that's
> what Philippe implemented in his patches.
>
> What would you prefer?
Let's find a way where people who are not interested in a single binary
do not have to pay the price, please.
> >
> > Or it could be that even just these two would be enough:
> >
> > static inline uint16_t virtio_lduw_phys_cached(VirtIODevice *vdev,
> > MemoryRegionCache *cache,
> > hwaddr pa)
> > {
> > if (virtio_access_is_big_endian(vdev)) {
> > return lduw_be_phys_cached(cache, pa);
> > }
> > return lduw_le_phys_cached(cache, pa);
> > }
> >
> > static inline void virtio_stw_phys_cached(VirtIODevice *vdev,
> > MemoryRegionCache *cache,
> > hwaddr pa, uint16_t value)
> > {
> > if (virtio_access_is_big_endian(vdev)) {
> > stw_be_phys_cached(cache, pa, value);
> > } else {
> > stw_le_phys_cached(cache, pa, value);
> > }
> > }
> >
> >
> >
> >
> >
> >
>
> Regards,
> Pierrick
On 2/3/26 11:06 AM, Michael S. Tsirkin wrote:
> On Tue, Feb 03, 2026 at 09:31:04AM -0800, Pierrick Bouvier wrote:
>> On 2/3/26 3:07 AM, Michael S. Tsirkin wrote:
>>> On Mon, Feb 02, 2026 at 07:22:16PM -0800, Pierrick Bouvier wrote:
>>>> On 2/2/26 11:25 AM, Pierrick Bouvier wrote:
>>>>> On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
>>>>>>
>>>>>> This command-line lets you benchmark virtio-blk without actual I/O
>>>>>> slowing down the request processing:
>>>>>>
>>>>>> qemu-system-x86_64 \
>>>>>> -M accel=kvm \
>>>>>> -cpu host \
>>>>>> -m 4G \
>>>>>> --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
>>>>>> --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
>>>>>> --object iothread,id=iothread0 \
>>>>>> --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
>>>>>> --device virtio-blk-pci,drive=drive1,iothread=iothread0
>>>>>>
>>>>>> Here is a fio command-line for 4 KiB random reads:
>>>>>>
>>>>>> fio \
>>>>>> --ioengine=libaio \
>>>>>> --direct=1 \
>>>>>> --runtime=30 \
>>>>>> --ramp_time=10 \
>>>>>> --rw=randread \
>>>>>> --bs=4k \
>>>>>> --iodepth=128 \
>>>>>> --filename=/dev/vdb \
>>>>>> --name=randread
>>>>>>
>>>>>> This is just a single vCPU, but it should be enough to see if there is
>>>>>> any difference in I/O Operations Per Second (IOPS) or efficiency
>>>>>> (IOPS/CPU utilization).
>>>>>>
>>>>> Thanks very much for the info Stefan. I didn't even know null-co
>>>>> blockdev, so it definitely would have taken some time to find all this.
>>>>>
>>>>> For what it's worth, I automated the benchmark here (need podman for
>>>>> build), so it can be reused for future changes:
>>>>> https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark
>>>>>
>>>>> ./build.sh && ./run.sh path/to/qemu-system-x86_64
>>>>>
>>>>> My initial testing showed a 50% slow down, which was more than
>>>>> surprising. After profiling, the extra time is spent here:
>>>>> https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
>>>>>
>>>>> When we merged target-info, there have been several versions over a long
>>>>> time, and I was 100% sure we were updating the target_info structure,
>>>>> instead of reparsing the target_name every time. Unfortunately, that's
>>>>> not the case. I'll fix that.
>>>>>
>>>>> With that fix, there is no difference in performance (<1%).
>>>>>
>>>>> I'll respin a v4 with the target info fix, initial v1 changes and
>>>>> benchmark results.
>>>>>
>>>>> Thanks for pointing the performance issue, there was one for sure.
>>>>>
>>>>> Regards,
>>>>> Pierrick
>>>>
>>>> After proper benchmarking, I get those results (in kIOPS) over 20 runs, all
>>>> including target_arch fix I mentioned.
>>>>
>>>> reference: mean=239.2 var=12.16
>>>> v1: mean=232.2 var=37.06
>>>> v4-wip (optimized virtio_access_is_big_endian): mean=235.05 var=18.64
>>>>
>>>> So basically, we have a 1.7% performance hit on a torture benchmark.
>>>> Is that acceptable for you, or should we dig more?
>>>>
>>>> Regards,
>>>> Pierrick
>>>
>>> Could we use some kind of linker trick?
>>> Split just the ring manipulation code from
>>> virtio.c and keep it target specific.
>>>
>>
>> While it would allow to extract the common part, unfortunately, as long as
>> we keep a target specific part, we still have a problem to solve.
>>
>>
>> Indeed, some symbols (may be ring manipulation code, or just functions
>> mentioned below) will still be duplicated, and in the end, when we'll link
>> two targets needing different versions, we'll have a conflict.
>
> Shrug. The generic slower one can be made stronger and be used then.
> This is not an new or unsolveable problem.
>
>
>> Since our goal for a single-binary is to have at least arm and another base
>> architecture, we are in this situation now.
>>
>> So the only solutions left are:
>> - accept the 1.7% regression, and we can provide more "real world"
>> benchmarks to show that it gets absorbed in all existing layers
>> - compute virtio_access_is_big_endian once and reuse this everywhere: that's
>> what Philippe implemented in his patches.
>>
>> What would you prefer?
>
>
> Let's find a way where people who are not interested in a single binary
> do not have to pay the price, please.
>

Sure, I'll try and benchmark with Philippe additional patches, it should
remove the overhead completely.

Thanks,
Pierrick
On 2/3/26 11:10 AM, Pierrick Bouvier wrote:
> On 2/3/26 11:06 AM, Michael S. Tsirkin wrote:
>> On Tue, Feb 03, 2026 at 09:31:04AM -0800, Pierrick Bouvier wrote:
>>> On 2/3/26 3:07 AM, Michael S. Tsirkin wrote:
>>>> On Mon, Feb 02, 2026 at 07:22:16PM -0800, Pierrick Bouvier wrote:
>>>>> On 2/2/26 11:25 AM, Pierrick Bouvier wrote:
>>>>>> On 2/2/26 10:52 AM, Stefan Hajnoczi wrote:
>>>>>>>
>>>>>>> This command-line lets you benchmark virtio-blk without actual I/O
>>>>>>> slowing down the request processing:
>>>>>>>
>>>>>>> qemu-system-x86_64 \
>>>>>>> -M accel=kvm \
>>>>>>> -cpu host \
>>>>>>> -m 4G \
>>>>>>> --blockdev file,node-name=drive0,filename=boot.img,cache.direct=on,aio=native \
>>>>>>> --blockdev null-co,node-name=drive1,size=$((10 * 1024 * 1024 * 1024)) \
>>>>>>> --object iothread,id=iothread0 \
>>>>>>> --device virtio-blk-pci,drive=drive0,iothread=iothread0 \
>>>>>>> --device virtio-blk-pci,drive=drive1,iothread=iothread0
>>>>>>>
>>>>>>> Here is a fio command-line for 4 KiB random reads:
>>>>>>>
>>>>>>> fio \
>>>>>>> --ioengine=libaio \
>>>>>>> --direct=1 \
>>>>>>> --runtime=30 \
>>>>>>> --ramp_time=10 \
>>>>>>> --rw=randread \
>>>>>>> --bs=4k \
>>>>>>> --iodepth=128 \
>>>>>>> --filename=/dev/vdb \
>>>>>>> --name=randread
>>>>>>>
>>>>>>> This is just a single vCPU, but it should be enough to see if there is
>>>>>>> any difference in I/O Operations Per Second (IOPS) or efficiency
>>>>>>> (IOPS/CPU utilization).
>>>>>>>
>>>>>> Thanks very much for the info Stefan. I didn't even know null-co
>>>>>> blockdev, so it definitely would have taken some time to find all this.
>>>>>>
>>>>>> For what it's worth, I automated the benchmark here (need podman for
>>>>>> build), so it can be reused for future changes:
>>>>>> https://github.com/pbo-linaro/qemu-linux-stack/tree/x86_64_io_benchmark
>>>>>>
>>>>>> ./build.sh && ./run.sh path/to/qemu-system-x86_64
>>>>>>
>>>>>> My initial testing showed a 50% slow down, which was more than
>>>>>> surprising. After profiling, the extra time is spent here:
>>>>>> https://github.com/qemu/qemu/blob/587f4a1805c83a4e1d59dd43cb14e0a834843d1d/target-info.c#L30
>>>>>>
>>>>>> When we merged target-info, there have been several versions over a long
>>>>>> time, and I was 100% sure we were updating the target_info structure,
>>>>>> instead of reparsing the target_name every time. Unfortunately, that's
>>>>>> not the case. I'll fix that.
>>>>>>
>>>>>> With that fix, there is no difference in performance (<1%).
>>>>>>
>>>>>> I'll respin a v4 with the target info fix, initial v1 changes and
>>>>>> benchmark results.
>>>>>>
>>>>>> Thanks for pointing the performance issue, there was one for sure.
>>>>>>
>>>>>> Regards,
>>>>>> Pierrick
>>>>>
>>>>> After proper benchmarking, I get those results (in kIOPS) over 20 runs, all
>>>>> including target_arch fix I mentioned.
>>>>>
>>>>> reference: mean=239.2 var=12.16
>>>>> v1: mean=232.2 var=37.06
>>>>> v4-wip (optimized virtio_access_is_big_endian): mean=235.05 var=18.64
>>>>>
>>>>> So basically, we have a 1.7% performance hit on a torture benchmark.
>>>>> Is that acceptable for you, or should we dig more?
>>>>>
>>>>> Regards,
>>>>> Pierrick
>>>>
>>>> Could we use some kind of linker trick?
>>>> Split just the ring manipulation code from
>>>> virtio.c and keep it target specific.
>>>>
>>>
>>> While it would allow to extract the common part, unfortunately, as long as
>>> we keep a target specific part, we still have a problem to solve.
>>>
>>>
>>> Indeed, some symbols (may be ring manipulation code, or just functions
>>> mentioned below) will still be duplicated, and in the end, when we'll link
>>> two targets needing different versions, we'll have a conflict.
>>
>> Shrug. The generic slower one can be made stronger and be used then.
>> This is not an new or unsolveable problem.
>>
>>
>>> Since our goal for a single-binary is to have at least arm and another base
>>> architecture, we are in this situation now.
>>>
>>> So the only solutions left are:
>>> - accept the 1.7% regression, and we can provide more "real world"
>>> benchmarks to show that it gets absorbed in all existing layers
>>> - compute virtio_access_is_big_endian once and reuse this everywhere: that's
>>> what Philippe implemented in his patches.
>>>
>>> What would you prefer?
>>
>>
>> Let's find a way where people who are not interested in a single binary
>> do not have to pay the price, please.
>>
>
> Sure, I'll try and benchmark with Philippe additional patches, it should
> remove the overhead completely.
>
> Thanks,
> Pierrick

This resulted in a +1.4% speedup vs existing code, so it sounds like we
have a solution.

I'll assemble and post a v4 for this.

Regards,
Pierrick