[Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Yongji Xie posted 1 patch 142 weeks ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/1487659615-15820-1-git-send-email-xyjxie@linux.vnet.ibm.com
Test checkpatch passed
Test docker passed
Test s390x passed
memory.c |   14 +++++++-------
1 files changed, 7 insertions(+), 7 deletions(-)

[Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Yongji Xie 142 weeks ago
At the moment ram device's memory regions are NATIVE_ENDIAN. This does
not work on PPC64 because VFIO PCI device is little endian but PPC64
always defines static macro TARGET_WORDS_BIGENDIAN.

This fixes endianness for ram device the same way as it is done
for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.

Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
---
 memory.c |   14 +++++++-------
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/memory.c b/memory.c
index 6c58373..1ccb99f 100644
--- a/memory.c
+++ b/memory.c
@@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
         data = *(uint8_t *)(mr->ram_block->host + addr);
         break;
     case 2:
-        data = *(uint16_t *)(mr->ram_block->host + addr);
+        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
         break;
     case 4:
-        data = *(uint32_t *)(mr->ram_block->host + addr);
+        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
         break;
     case 8:
-        data = *(uint64_t *)(mr->ram_block->host + addr);
+        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
         break;
     }
 
@@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
         *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
         break;
     case 2:
-        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
+        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
         break;
     case 4:
-        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
+        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
         break;
     case 8:
-        *(uint64_t *)(mr->ram_block->host + addr) = data;
+        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
         break;
     }
 }
@@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
 static const MemoryRegionOps ram_device_mem_ops = {
     .read = memory_region_ram_device_read,
     .write = memory_region_ram_device_write,
-    .endianness = DEVICE_NATIVE_ENDIAN,
+    .endianness = DEVICE_LITTLE_ENDIAN,
     .valid = {
         .min_access_size = 1,
         .max_access_size = 8,
-- 
1.7.1


Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alex Williamson 142 weeks ago
On Tue, 21 Feb 2017 14:46:55 +0800
Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:

> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
> not work on PPC64 because VFIO PCI device is little endian but PPC64
> always defines static macro TARGET_WORDS_BIGENDIAN.
> 
> This fixes endianness for ram device the same way as it is done
> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.

The referenced commit was to vfio code where the endianness is fixed,
here you're modifying shared generic code to assume the same
endianness as vfio.  That seems wrong.  Should
memory_region_init_ram_device_ptr() instead take an endian option and
ram_device_mem_ops should take the backing endianness into account?
Thanks,

Alex
 
> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> ---
>  memory.c |   14 +++++++-------
>  1 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/memory.c b/memory.c
> index 6c58373..1ccb99f 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
>          data = *(uint8_t *)(mr->ram_block->host + addr);
>          break;
>      case 2:
> -        data = *(uint16_t *)(mr->ram_block->host + addr);
> +        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
>          break;
>      case 4:
> -        data = *(uint32_t *)(mr->ram_block->host + addr);
> +        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
>          break;
>      case 8:
> -        data = *(uint64_t *)(mr->ram_block->host + addr);
> +        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
>          break;
>      }
>  
> @@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>          *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
>          break;
>      case 2:
> -        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
> +        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
>          break;
>      case 4:
> -        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
> +        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
>          break;
>      case 8:
> -        *(uint64_t *)(mr->ram_block->host + addr) = data;
> +        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
>          break;
>      }
>  }
> @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>  static const MemoryRegionOps ram_device_mem_ops = {
>      .read = memory_region_ram_device_read,
>      .write = memory_region_ram_device_write,
> -    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
>      .valid = {
>          .min_access_size = 1,
>          .max_access_size = 8,


Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 21/02/2017 17:21, Alex Williamson wrote:
> On Tue, 21 Feb 2017 14:46:55 +0800
> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> 
>> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
>> not work on PPC64 because VFIO PCI device is little endian but PPC64
>> always defines static macro TARGET_WORDS_BIGENDIAN.
>>
>> This fixes endianness for ram device the same way as it is done
>> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
> 
> The referenced commit was to vfio code where the endianness is fixed,
> here you're modifying shared generic code to assume the same
> endianness as vfio.  That seems wrong.

Is the goal to have the same endianness as VFIO?  Or is it just a trick
to ensure the number of swaps is always 0 or 2, so that they cancel away?

In other words, would Yongji's patch just work if it used
DEVICE_BIG_ENDIAN and beNN_to_cpu/cpu_to_beNN?  If so, then I think the
patch is okay.

Paolo

> 
> Alex
>  
>> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
>> ---
>>  memory.c |   14 +++++++-------
>>  1 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/memory.c b/memory.c
>> index 6c58373..1ccb99f 100644
>> --- a/memory.c
>> +++ b/memory.c
>> @@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
>>          data = *(uint8_t *)(mr->ram_block->host + addr);
>>          break;
>>      case 2:
>> -        data = *(uint16_t *)(mr->ram_block->host + addr);
>> +        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
>>          break;
>>      case 4:
>> -        data = *(uint32_t *)(mr->ram_block->host + addr);
>> +        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
>>          break;
>>      case 8:
>> -        data = *(uint64_t *)(mr->ram_block->host + addr);
>> +        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
>>          break;
>>      }
>>  
>> @@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>>          *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
>>          break;
>>      case 2:
>> -        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
>> +        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
>>          break;
>>      case 4:
>> -        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
>> +        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
>>          break;
>>      case 8:
>> -        *(uint64_t *)(mr->ram_block->host + addr) = data;
>> +        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
>>          break;
>>      }
>>  }
>> @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>>  static const MemoryRegionOps ram_device_mem_ops = {
>>      .read = memory_region_ram_device_read,
>>      .write = memory_region_ram_device_write,
>> -    .endianness = DEVICE_NATIVE_ENDIAN,
>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>>      .valid = {
>>          .min_access_size = 1,
>>          .max_access_size = 8,
> 

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 21 February 2017 at 16:34, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 21/02/2017 17:21, Alex Williamson wrote:
>> On Tue, 21 Feb 2017 14:46:55 +0800
>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>
>>> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
>>> not work on PPC64 because VFIO PCI device is little endian but PPC64
>>> always defines static macro TARGET_WORDS_BIGENDIAN.
>>>
>>> This fixes endianness for ram device the same way as it is done
>>> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
>>
>> The referenced commit was to vfio code where the endianness is fixed,
>> here you're modifying shared generic code to assume the same
>> endianness as vfio.  That seems wrong.
>
> Is the goal to have the same endianness as VFIO?  Or is it just a trick
> to ensure the number of swaps is always 0 or 2, so that they cancel away?
>
> In other words, would Yongji's patch just work if it used
> DEVICE_BIG_ENDIAN and beNN_to_cpu/cpu_to_beNN?  If so, then I think the
> patch is okay.

I think any patch that proposes adding or removing one or more
endianness related swaps should come with a commit message that
states very clearly why this exact point in the stack is the
correct place to do these swaps, from a design point of view.
(This is so we can avoid the pitfall of putting in enough swaps
to cancel each other out but at the wrong point in the design.)

In this instance I don't understand the patch. The ram_device
mem-ops are there to deal with memory regions backed by a
lump of RAM, right? Lumps of memory are always the endianness
of the host CPU by definition, so DEVICE_NATIVE_ENDIAN and
no swapping in the accessors seems like it ought to be the right
thing...

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alex Williamson 142 weeks ago
On Tue, 21 Feb 2017 18:09:04 +0000
Peter Maydell <peter.maydell@linaro.org> wrote:

> On 21 February 2017 at 16:34, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >
> > On 21/02/2017 17:21, Alex Williamson wrote:  
> >> On Tue, 21 Feb 2017 14:46:55 +0800
> >> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
> >>  
> >>> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
> >>> not work on PPC64 because VFIO PCI device is little endian but PPC64
> >>> always defines static macro TARGET_WORDS_BIGENDIAN.
> >>>
> >>> This fixes endianness for ram device the same way as it is done
> >>> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.  
> >>
> >> The referenced commit was to vfio code where the endianness is fixed,
> >> here you're modifying shared generic code to assume the same
> >> endianness as vfio.  That seems wrong.  
> >
> > Is the goal to have the same endianness as VFIO?  Or is it just a trick
> > to ensure the number of swaps is always 0 or 2, so that they cancel away?
> >
> > In other words, would Yongji's patch just work if it used
> > DEVICE_BIG_ENDIAN and beNN_to_cpu/cpu_to_beNN?  If so, then I think the
> > patch is okay.  
> 
> I think any patch that proposes adding or removing one or more
> endianness related swaps should come with a commit message that
> states very clearly why this exact point in the stack is the
> correct place to do these swaps, from a design point of view.
> (This is so we can avoid the pitfall of putting in enough swaps
> to cancel each other out but at the wrong point in the design.)
> 
> In this instance I don't understand the patch. The ram_device
> mem-ops are there to deal with memory regions backed by a
> lump of RAM, right? Lumps of memory are always the endianness
> of the host CPU by definition, so DEVICE_NATIVE_ENDIAN and
> no swapping in the accessors seems like it ought to be the right
> thing...

I'm glad I'm not the only one confused.  The clarity I can add is that
for vfio, we define that any read/write to the vfio file descriptor is
little endian.  The kernel does cpu_to_leXX/leXX_to_cpu around the
ioreadXX/iowriteXX calls.  For better or worse, this gets us around the
implicit CPU endianness of the data there and gives us a standard
endianness.  mmaps are of course device native endian, but the point of
the ram device was that some devices (realtek NICs) don't respond well
to using memcpy through the mmap and we're better off using explicit
reads and writes.

The part where I get lost is that if PPC64 always sets the target
to big endian, then adjust_endianness() we will always bswap after a
read and before a write.  I don't follow how that bswap necessarily
gets us to QEMU CPU endian such that the cpu_to_leXX/leXX_to_cpu in the
access functions always do the right thing.  I know we do this
elsewhere in vfio, perhaps I've deferred to someone convincing me it
was the right thing at the time, but I'm not able to derive it myself
currently.

To answer Paolo's question, if PPC64 sets TARGET_WORDS_BIGENDIAN, which
is a compile time setting, then using DEVICE_BIG_ENDIAN would
deterministically remove the bswap from adjust_endianness(), but I
don't know how anything is deterministic about CPU-relative byte swaps
since (I think) PPC64 could actually be running in either.  +1 to being
very explicit about which swaps are occurring where.  Thanks,

Alex

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Yongji Xie 142 weeks ago
on 2017/2/22 2:44, Alex Williamson wrote:

> On Tue, 21 Feb 2017 18:09:04 +0000
> Peter Maydell <peter.maydell@linaro.org> wrote:
>
>> On 21 February 2017 at 16:34, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>> On 21/02/2017 17:21, Alex Williamson wrote:
>>>> On Tue, 21 Feb 2017 14:46:55 +0800
>>>> Yongji Xie <xyjxie@linux.vnet.ibm.com> wrote:
>>>>   
>>>>> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
>>>>> not work on PPC64 because VFIO PCI device is little endian but PPC64
>>>>> always defines static macro TARGET_WORDS_BIGENDIAN.
>>>>>
>>>>> This fixes endianness for ram device the same way as it is done
>>>>> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
>>>> The referenced commit was to vfio code where the endianness is fixed,
>>>> here you're modifying shared generic code to assume the same
>>>> endianness as vfio.  That seems wrong.
>>> Is the goal to have the same endianness as VFIO?  Or is it just a trick
>>> to ensure the number of swaps is always 0 or 2, so that they cancel away?
>>>
>>> In other words, would Yongji's patch just work if it used
>>> DEVICE_BIG_ENDIAN and beNN_to_cpu/cpu_to_beNN?  If so, then I think the
>>> patch is okay.
>> I think any patch that proposes adding or removing one or more
>> endianness related swaps should come with a commit message that
>> states very clearly why this exact point in the stack is the
>> correct place to do these swaps, from a design point of view.
>> (This is so we can avoid the pitfall of putting in enough swaps
>> to cancel each other out but at the wrong point in the design.)
>>
>> In this instance I don't understand the patch. The ram_device
>> mem-ops are there to deal with memory regions backed by a
>> lump of RAM, right? Lumps of memory are always the endianness
>> of the host CPU by definition, so DEVICE_NATIVE_ENDIAN and
>> no swapping in the accessors seems like it ought to be the right
>> thing...
> I'm glad I'm not the only one confused.  The clarity I can add is that
> for vfio, we define that any read/write to the vfio file descriptor is
> little endian.  The kernel does cpu_to_leXX/leXX_to_cpu around the
> ioreadXX/iowriteXX calls.  For better or worse, this gets us around the
> implicit CPU endianness of the data there and gives us a standard
> endianness.  mmaps are of course device native endian, but the point of
> the ram device was that some devices (realtek NICs) don't respond well
> to using memcpy through the mmap and we're better off using explicit
> reads and writes.
>
> The part where I get lost is that if PPC64 always sets the target
> to big endian, then adjust_endianness() we will always bswap after a
> read and before a write.  I don't follow how that bswap necessarily
> gets us to QEMU CPU endian such that the cpu_to_leXX/leXX_to_cpu in the
> access functions always do the right thing.  I know we do this
> elsewhere in vfio, perhaps I've deferred to someone convincing me it
> was the right thing at the time, but I'm not able to derive it myself
> currently.
>
> To answer Paolo's question, if PPC64 sets TARGET_WORDS_BIGENDIAN, which
> is a compile time setting, then using DEVICE_BIG_ENDIAN would
> deterministically remove the bswap from adjust_endianness(), but I
> don't know how anything is deterministic about CPU-relative byte swaps
> since (I think) PPC64 could actually be running in either.  +1 to being
> very explicit about which swaps are occurring where.  Thanks,
>

Actually I also have some confusion about the endianness while fixing
this issue...

I assume that kvm_run->mmio.data is stored in the endianness of the device
that the guest tries to access. For one MMIO write, we may do the *first*
bswap in address_space_write_continue(): the ldX_p() will do the bswap when
the target and host endianness are different. Then adjust_endianness() may
do the *second* bswap when the target and device endianness are different.

Here are all possible cases:

run->mmio.data    device    target    host    for mr->ops->write()
============================================================
little        native    big    big    little
little        native    little    little    little
little        native    big    little    big (this is the wrong case we found)
little        native    little    big    big
==============================================================
little        little    big    big    big (could be fixed by cpu_to_leXX)
little        little    little    little    little
little        little    big    little    little
little        little    little    big    big (could be fixed by cpu_to_leXX)
==============================================================
big        native    big    big    big
big        native    little    little    big
big        native    big    little    little
big        native    little    big    little
==============================================================
big        big    big    big    big
big        big    little    little    little (could be fixed by cpu_to_beXX)
big        big    big    little    little (could be fixed by cpu_to_beXX)
big        big    little    big    big

We can easily see that a native-endian device doesn't work when the target
and host endianness are different. So I fixed it the way this patch does.
But I was wrong to assume the same endianness as VFIO in this patch, as Alex
said. So maybe we should consider an endian option in
memory_region_init_ram_device_ptr().

Back to my confusion, I failed to understand the objective of those two
swaps. Is it possible to pass run->mmio.data directly to mr->ops->write()?
It seems those two swaps are based on the fact that run->mmio.data is stored
as host-endian rather than in the endianness of the device that the guest
tries to access. Have I missed something important here?

Thanks,
Yongji


Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 21/02/2017 19:44, Alex Williamson wrote:
>>> In other words, would Yongji's patch just work if it used
>>> DEVICE_BIG_ENDIAN and beNN_to_cpu/cpu_to_beNN?  If so, then I think the
>>> patch is okay.  
>
> The part where I get lost is that if PPC64 always sets the target
> to big endian, then adjust_endianness() we will always bswap after a
> read and before a write.  I don't follow how that bswap necessarily
> gets us to QEMU CPU endian such that the cpu_to_leXX/leXX_to_cpu in the
> access functions always do the right thing.  I know we do this
> elsewhere in vfio, perhaps I've deferred to someone convincing me it
> was the right thing at the time, but I'm not able to derive it myself
> currently.
> 
> To answer Paolo's question, if PPC64 sets TARGET_WORDS_BIGENDIAN, which
> is a compile time setting, then using DEVICE_BIG_ENDIAN would
> deterministically remove the bswap from adjust_endianness(), 

And beNN_to_cpu/cpu_to_beNN would add a swap in 
memory_region_ram_device_read/write, so it would work just as well.

KVM expects data in host endianness:

static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu)
{
        return (kvmppc_get_msr(vcpu) & MSR_LE) != (MSR_KERNEL & MSR_LE);
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^      ^^^^^^^^^^^^^^^^^^^
                 endianness of the target              endianness of
                                                      struct kvm_run
}

while QEMU hardcodes bigendian for DEVICE_NATIVE_ENDIAN, even if 
running on PPC64LE host.

However, I'm confused as to what would happen with TCG. :(  The best 
thing to do, IMO, is to add a RAM device MR to pc-testdev (e.g. at 0xec)

diff --git a/hw/misc/pc-testdev.c b/hw/misc/pc-testdev.c
index b81d820..5ea444f 100644
--- a/hw/misc/pc-testdev.c
+++ b/hw/misc/pc-testdev.c
@@ -47,11 +47,13 @@ typedef struct PCTestdev {
 
     MemoryRegion ioport;
     MemoryRegion ioport_byte;
+    MemoryRegion ioport_ramd;
     MemoryRegion flush;
     MemoryRegion irq;
     MemoryRegion iomem;
     uint32_t ioport_data;
     char iomem_buf[IOMEM_LEN];
+    char ioport_ramd_bytes[4];
 } PCTestdev;
 
 #define TYPE_TESTDEV "pc-testdev"
@@ -167,6 +169,10 @@ static void testdev_realizefn(DeviceState *d, Error **errp)
     memory_region_init_io(&dev->ioport_byte, OBJECT(dev),
                           &test_ioport_byte_ops, dev,
                           "pc-testdev-ioport-byte", 4);
+    memory_region_init_ram_device_ptr(&dev->ioport_ramd, OBJECT(dev),
+                                      "pc-testdev-ioport-ramd",
+                                      ARRAY_SIZE(dev->ioport_ramd_bytes),
+                                      &dev->ioport_ramd_bytes);
     memory_region_init_io(&dev->flush, OBJECT(dev), &test_flush_ops, dev,
                           "pc-testdev-flush-page", 4);
     memory_region_init_io(&dev->irq, OBJECT(dev), &test_irq_ops, dev,
@@ -177,6 +183,7 @@ static void testdev_realizefn(DeviceState *d, Error **errp)
     memory_region_add_subregion(io,  0xe0,       &dev->ioport);
     memory_region_add_subregion(io,  0xe4,       &dev->flush);
     memory_region_add_subregion(io,  0xe8,       &dev->ioport_byte);
+    memory_region_add_subregion(io,  0xec,       &dev->ioport_ramd);
     memory_region_add_subregion(io,  0x2000,     &dev->irq);
     memory_region_add_subregion(mem, 0xff000000, &dev->iomem);
 }

and a testcase to tests/endianness-test.c.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 21/02/2017 19:09, Peter Maydell wrote:
> In this instance I don't understand the patch. The ram_device
> mem-ops are there to deal with memory regions backed by a
> lump of RAM, right? Lumps of memory are always the endianness
> of the host CPU by definition, so DEVICE_NATIVE_ENDIAN and
> no swapping in the accessors seems like it ought to be the right
> thing...

DEVICE_NATIVE_ENDIAN is the endianness of the target CPU however.

So perhaps we need to add DEVICE_HOST_ENDIAN instead?

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 21 February 2017 at 18:53, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 21/02/2017 19:09, Peter Maydell wrote:
>> In this instance I don't understand the patch. The ram_device
>> mem-ops are there to deal with memory regions backed by a
>> lump of RAM, right? Lumps of memory are always the endianness
>> of the host CPU by definition, so DEVICE_NATIVE_ENDIAN and
>> no swapping in the accessors seems like it ought to be the right
>> thing...
>
> DEVICE_NATIVE_ENDIAN is the endianness of the target CPU however.

...I meant 'target CPU' there (at any rate, endianness
of the bus interface between target CPU and its RAM; CPUs
like ARM with runtime configurable endianness do it by swapping
data before it hits the bus interface, conceptually speaking).

Endianness always confuses me, though (I think I understand
it and then I realize I don't; I also have trouble telling
whether I understand it and am discussing it with somebody who
doesn't understand it, or whether I don't understand it and they
do...)

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alexey Kardashevskiy 142 weeks ago
On 21/02/17 17:46, Yongji Xie wrote:
> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
> not work on PPC64 because VFIO PCI device is little endian but PPC64
> always defines static macro TARGET_WORDS_BIGENDIAN.
> 
> This fixes endianness for ram device the same way as it is done
> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
> 
> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> ---
>  memory.c |   14 +++++++-------
>  1 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/memory.c b/memory.c
> index 6c58373..1ccb99f 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
>          data = *(uint8_t *)(mr->ram_block->host + addr);
>          break;
>      case 2:
> -        data = *(uint16_t *)(mr->ram_block->host + addr);
> +        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
>          break;
>      case 4:
> -        data = *(uint32_t *)(mr->ram_block->host + addr);
> +        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
>          break;
>      case 8:
> -        data = *(uint64_t *)(mr->ram_block->host + addr);
> +        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
>          break;
>      }
>  
> @@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>          *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
>          break;
>      case 2:
> -        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
> +        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
>          break;
>      case 4:
> -        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
> +        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
>          break;
>      case 8:
> -        *(uint64_t *)(mr->ram_block->host + addr) = data;
> +        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
>          break;
>      }
>  }
> @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>  static const MemoryRegionOps ram_device_mem_ops = {
>      .read = memory_region_ram_device_read,
>      .write = memory_region_ram_device_write,
> -    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
>      .valid = {
>          .min_access_size = 1,
>          .max_access_size = 8,
>
	

I did some debugging today.

First, Paolo is right and ram_device_mem_ops::endianness should be
host-endian which happens to be little in our test case (ppc64le) so
changes to .read/.write are actually no-op (I believe so, have not checked).


But I was wondering why this gets executed at all.

The test case is:

qemu-system-ppc64 ...
-device "vfio-pci,id=vfio0001_03_00_0,host=0001:03:00.0
-drive id=DRIVE0,if=none,file=./test.qcow2,format=qcow2
-device virtio-blk-pci,id=vblk0,drive=DRIVE0

The host kernel is v4.10, ppc64le (little endian), 64K system page size,
QEMU is v2.8.0.

When this boots, lspci shows:

00:00.0 Ethernet controller: Chelsio Communications Inc T310 10GbE Single
Port Adapter
        Subsystem: IBM Device 038c
        Flags: bus master, fast devsel, latency 0, IRQ 18
        Memory at 210000004000 (64-bit, non-prefetchable) [size=4K]
        Memory at 210000800000 (64-bit, non-prefetchable) [size=8M]
        Memory at 210001000000 (64-bit, non-prefetchable) [size=4K]
        Expansion ROM at 2000c0080000 [disabled] [size=512K]
        Capabilities: [40] Power Management version 3
        Capabilities: [48] MSI: Enable- Count=1/32 Maskable- 64bit+
        Capabilities: [58] Express Endpoint, MSI 00
        Capabilities: [94] Vital Product Data
        Capabilities: [9c] MSI-X: Enable+ Count=32 Masked-
        Kernel driver in use: cxgb3

00:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
        Subsystem: Red Hat, Inc Device 0002
        Physical Slot: C16
        Flags: bus master, fast devsel, latency 0, IRQ 17
        I/O ports at 0040 [size=64]
        Memory at 2000c0000000 (32-bit, non-prefetchable) [size=4K]
        Memory at 210000000000 (64-bit, prefetchable) [size=16K]
        Capabilities: [98] MSI-X: Enable+ Count=2 Masked-
        Capabilities: [84] Vendor Specific Information: Len=14 <?>
        Capabilities: [70] Vendor Specific Information: Len=14 <?>
        Capabilities: [60] Vendor Specific Information: Len=10 <?>
        Capabilities: [50] Vendor Specific Information: Len=10 <?>
        Capabilities: [40] Vendor Specific Information: Len=10 <?>
        Kernel driver in use: virtio-pci


As we can see, BAR0 of Chelsio is 210000004000 - not aligned (it should
have been aligned but it is not - this is another bug, in QEMU). Normally
such a BAR would be emulated by VFIO. However, since 95251725e335 "vfio:
Add support for mmapping sub-page MMIO BARs" we mmap such BARs to QEMU in a
hope they will be registered later via KVM_SET_USER_MEMORY_REGION - and
this fails as the guest address is not host page size aligned.

So we end up having the following memory tree:

memory-region: pci@800000020000000.mmio
  0000000000000000-ffffffffffffffff (prio 0, RW): pci@800000020000000.mmio
    00000000c0000000-00000000c0000fff (prio 1, RW): virtio-blk-pci-msix
      00000000c0000000-00000000c000001f (prio 0, RW): msix-table
      00000000c0000800-00000000c0000807 (prio 0, RW): msix-pba
    0000210000000000-0000210000003fff (prio 1, RW): virtio-pci
      0000210000000000-0000210000000fff (prio 0, RW): virtio-pci-common
      0000210000001000-0000210000001fff (prio 0, RW): virtio-pci-isr
      0000210000002000-0000210000002fff (prio 0, RW): virtio-pci-device
      0000210000003000-0000210000003fff (prio 0, RW): virtio-pci-notify
    0000210000004000-0000210000004fff (prio 1, RW): 0001:03:00.0 BAR 0
      0000210000004000-0000210000004fff (prio 0, RW): 0001:03:00.0 BAR 0
mmaps[0]
    0000210000800000-0000210000ffffff (prio 1, RW): 0001:03:00.0 BAR 2
      0000210000800000-0000210000ffffff (prio 0, RW): 0001:03:00.0 BAR 2
mmaps[0]
    0000210001000000-0000210001000fff (prio 1, RW): 0001:03:00.0 BAR 4
      0000210001000000-00002100010001ff (prio 0, RW): msix-table
      0000210001000800-0000210001000807 (prio 0, RW): msix-pba [disabled]

The problem region which this patch is fixing is "0001:03:00.0 BAR 0
mmaps[0]". It is mmaped to QEMU and not registered in KVM. So when the
guest accesses this BAR, we trap into KVM, KVM passes it to QEMU and QEMU
read/writes this memory region.

A simple hack like below fixes it - it basically removes mmap'd memory
region from the tree and MMIO starts being handled by the parent MR -
"0001:03:00.0 BAR 0" (which is "little endian" - vfio_region_ops).


I am wondering now what would be a correct approach here?

Add/remove mmap'd MRs once we detect aligned/unaligned BARs?

Keep things where they are in the VFIO department and just fix
ram_device_mem_ops::endianness?

Thanks.



diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d7dbe0e3e0..0657a27623 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1109,7 +1109,10 @@ static void
vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
     memory_region_transaction_begin();

     memory_region_set_size(mr, size);
-    memory_region_set_size(mmap_mr, size);
+    if (bar_addr & qemu_real_host_page_mask)
+        memory_region_del_subregion(mr, mmap_mr);
+    else
+        memory_region_set_size(mmap_mr, size);
     if (size != region->size && memory_region_is_mapped(mr)) {
         memory_region_del_subregion(r->address_space, mr);
         memory_region_add_subregion_overlap(r->address_space,



-- 
Alexey

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 05:20, Alexey Kardashevskiy wrote:
> First, Paolo is right and ram_device_mem_ops::endianness should be
> host-endian which happens to be little in our test case (ppc64le)

So you tested a ppc64 BE guest and it works?

> Keep things where they are in the VFIO department and just fix
> ram_device_mem_ops::endianness?

I would fix the ram_device_mem_ops.  Either by introducing
DEVICE_HOST_ENDIAN(*) or with Yongji's patch.

(*) DEVICE_NATIVE_ENDIAN is special cased all over the place
    because the same device (in a file that's compiled just once)
    can be either little- or big-endian.  DEVICE_HOST_ENDIAN can
    be a simple #define to either DEVICE_LITTLE_ENDIAN or
    DEVICE_BIG_ENDIAN, because host endianness is the same for
    all QEMU binaries.  It's literally half a dozen lines of code.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 08:35, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 23/02/2017 05:20, Alexey Kardashevskiy wrote:
>> First, Paolo is right and ram_device_mem_ops::endianness should be
>> host-endian which happens to be little in our test case (ppc64le)
>
> So you tested a ppc64 BE guest and it works?
>
>> Keep things where they are in the VFIO department and just fix
>> ram_device_mem_ops::endianness?
>
> I would fix the ram_device_mem_ops.  Either by introducing
> DEVICE_HOST_ENDIAN(*) or with Yongji's patch.
>
> (*) DEVICE_NATIVE_ENDIAN is special cased all over the place
>     because the same device (in a file that's compiled just once)
>     can be either little- or big-endian.  DEVICE_HOST_ENDIAN can
>     be a simple #define to either DEVICE_LITTLE_ENDIAN or
>     DEVICE_BIG_ENDIAN, because host endianness is the same for
>     all QEMU binaries.  It's literally half a dozen lines of code.

I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
areas should be target-endian (you can probably define
"target endianness" as "the endianness that RAM areas have".)

-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 11:02, Peter Maydell wrote:
> On 23 February 2017 at 08:35, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 23/02/2017 05:20, Alexey Kardashevskiy wrote:
>>> First, Paolo is right and ram_device_mem_ops::endianness should be
>>> host-endian which happens to be little in our test case (ppc64le)
>>
>> So you tested a ppc64 BE guest and it works?
>>
>>> Keep things where they are in the VFIO department and just fix
>>> ram_device_mem_ops::endianness?
>>
>> I would fix the ram_device_mem_ops.  Either by introducing
>> DEVICE_HOST_ENDIAN(*) or with Yongji's patch.
>>
>> (*) DEVICE_NATIVE_ENDIAN is special cased all over the place
>>     because the same device (in a file that's compiled just once)
>>     can be either little- or big-endian.  DEVICE_HOST_ENDIAN can
>>     be a simple #define to either DEVICE_LITTLE_ENDIAN or
>>     DEVICE_BIG_ENDIAN, because host endianness is the same for
>>     all QEMU binaries.  It's literally half a dozen lines of code.
> 
> I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
> areas should be target-endian (you can probably define
> "target endianness" as "the endianness that RAM areas have".)

This is not RAM.  This is MMIO, backed by a MMIO area in the host.  The
MemoryRegionOps read from the MMIO area (so the data has host
endianness) and do not do any further swap:

        data = *(uint16_t *)(mr->ram_block->host + addr);

Here, the dereference is basically the same as ldl_he_p.

If you wanted to make the MemoryRegion use DEVICE_NATIVE_ENDIAN, you'd
need to tswap around the access.  Or you can use ldl_le_p and
DEVICE_LITTLE_ENDIAN (this is what Yongji's patch open codes), or
ldl_be_p and DEVICE_BIG_ENDIAN.  They are all the same in the end.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 10:10, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 23/02/2017 11:02, Peter Maydell wrote:
>> I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
>> areas should be target-endian (you can probably define
>> "target endianness" as "the endianness that RAM areas have".)
>
> This is not RAM.  This is MMIO, backed by a MMIO area in the host.

Hmm, I see...the naming is a bit unfortunate if it's not RAM.

> The
> MemoryRegionOps read from the MMIO area (so the data has host
> endianness) and do not do any further swap:
>
>         data = *(uint16_t *)(mr->ram_block->host + addr);
>
> Here, the dereference is basically the same as ldl_he_p.
>
> If you wanted to make the MemoryRegion use DEVICE_NATIVE_ENDIAN, you'd
> need to tswap around the access.  Or you can use ldl_le_p and
> DEVICE_LITTLE_ENDIAN (this is what Yongji's patch open codes), or
> ldl_be_p and DEVICE_BIG_ENDIAN.  They are all the same in the end.

Using stl_p &c in a DEVICE_NATIVE_ENDIAN MR would work too, right?
(This is how all the NATIVE_ENDIAN MRs in exec.c work.)

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 11:23, Peter Maydell wrote:
> On 23 February 2017 at 10:10, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 23/02/2017 11:02, Peter Maydell wrote:
>>> I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
>>> areas should be target-endian (you can probably define
>>> "target endianness" as "the endianness that RAM areas have".)
>>
>> This is not RAM.  This is MMIO, backed by a MMIO area in the host.
> 
> Hmm, I see...the naming is a bit unfortunate if it's not RAM.

Yeah, it's called like that because it is backed by a RAMBlock but it
returns false for memory_access_is_direct.

>> The
>> MemoryRegionOps read from the MMIO area (so the data has host
>> endianness) and do not do any further swap:
>>
>>         data = *(uint16_t *)(mr->ram_block->host + addr);
>>
>> Here, the dereference is basically the same as ldl_he_p.
>>
>> If you wanted to make the MemoryRegion use DEVICE_NATIVE_ENDIAN, you'd
>> need to tswap around the access.  Or you can use ldl_le_p and
>> DEVICE_LITTLE_ENDIAN (this is what Yongji's patch open codes), or
>> ldl_be_p and DEVICE_BIG_ENDIAN.  They are all the same in the end.
> 
> Using stl_p &c in a DEVICE_NATIVE_ENDIAN MR would work too, right?
> (This is how all the NATIVE_ENDIAN MRs in exec.c work.)

Yes, it should, as long as the memcpy(...) of {ld,st}*_he_p is compiled
to a single access, which should be the case.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 10:33, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 23/02/2017 11:23, Peter Maydell wrote:
>> On 23 February 2017 at 10:10, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> On 23/02/2017 11:02, Peter Maydell wrote:
>>>> I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
>>>> areas should be target-endian (you can probably define
>>>> "target endianness" as "the endianness that RAM areas have".)
>>>
>>> This is not RAM.  This is MMIO, backed by a MMIO area in the host.
>>
>> Hmm, I see...the naming is a bit unfortunate if it's not RAM.
>
> Yeah, it's called like that because it is backed by a RAMBlock but it
> returns false for memory_access_is_direct.

We should probably update the doc comment to note that the
pointer is to host-endianness memory (and that this is not
like normal RAM which is target-endian)...

>>> The
>>> MemoryRegionOps read from the MMIO area (so the data has host
>>> endianness) and do not do any further swap:
>>>
>>>         data = *(uint16_t *)(mr->ram_block->host + addr);
>>>
>>> Here, the dereference is basically the same as ldl_he_p.
>>>
>>> If you wanted to make the MemoryRegion use DEVICE_NATIVE_ENDIAN, you'd
>>> need to tswap around the access.  Or you can use ldl_le_p and
>>> DEVICE_LITTLE_ENDIAN (this is what Yongji's patch open codes), or
>>> ldl_be_p and DEVICE_BIG_ENDIAN.  They are all the same in the end.
>>
>> Using stl_p &c in a DEVICE_NATIVE_ENDIAN MR would work too, right?
>> (This is how all the NATIVE_ENDIAN MRs in exec.c work.)
>
> Yes, it should, as long as the memcpy(...) of {ld,st}*_he_p is compiled
> to a single access, which should be the case.

...and whichever of these approaches we take, we should have
a comment which notes that we are converting from the host
endianness memory to the endianness specified by the MemoryRegion
endianness attribute.

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 12:34, Peter Maydell wrote:
> On 23 February 2017 at 10:33, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 23/02/2017 11:23, Peter Maydell wrote:
>>> On 23 February 2017 at 10:10, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>> On 23/02/2017 11:02, Peter Maydell wrote:
>>>>> I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
>>>>> areas should be target-endian (you can probably define
>>>>> "target endianness" as "the endianness that RAM areas have".)
>>>>
>>>> This is not RAM.  This is MMIO, backed by a MMIO area in the host.
>>>
>>> Hmm, I see...the naming is a bit unfortunate if it's not RAM.
>>
>> Yeah, it's called like that because it is backed by a RAMBlock but it
>> returns false for memory_access_is_direct.
> 
> We should probably update the doc comment to note that the
> pointer is to host-endianness memory (and that this is not
> like normal RAM which is target-endian)...

I wouldn't call it host-endianness memory, and I disagree that normal
RAM is target-endian---in both cases it's just a bunch of bytes.

However, the access done by the MemoryRegionOps callbacks needs to match
the endianness declared by the MemoryRegionOps themselves.

Paolo

>>>> The
>>>> MemoryRegionOps read from the MMIO area (so the data has host
>>>> endianness) and do not do any further swap:
>>>>
>>>>         data = *(uint16_t *)(mr->ram_block->host + addr);
>>>>
>>>> Here, the dereference is basically the same as ldl_he_p.
>>>>
>>>> If you wanted to make the MemoryRegion use DEVICE_NATIVE_ENDIAN, you'd
>>>> need to tswap around the access.  Or you can use ldl_le_p and
>>>> DEVICE_LITTLE_ENDIAN (this is what Yongji's patch open codes), or
>>>> ldl_be_p and DEVICE_BIG_ENDIAN.  They are all the same in the end.
>>>
>>> Using stl_p &c in a DEVICE_NATIVE_ENDIAN MR would work too, right?
>>> (This is how all the NATIVE_ENDIAN MRs in exec.c work.)
>>
>> Yes, it should, as long as the memcpy(...) of {ld,st}*_he_p is compiled
>> to a single access, which should be the case.
> 
> ...and whichever of these approaches we take, we should have
> a comment which notes that we are converting from the host
> endianness memory to the endianness specified by the MemoryRegion
> endianness attribute.
> 
> thanks
> -- PMM
> 

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
> On 23/02/2017 12:34, Peter Maydell wrote:
>> We should probably update the doc comment to note that the
>> pointer is to host-endianness memory (and that this is not
>> like normal RAM which is target-endian)...
>
> I wouldn't call it host-endianness memory, and I disagree that normal
> RAM is target-endian---in both cases it's just a bunch of bytes.
>
> However, the access done by the MemoryRegionOps callbacks needs to match
> the endianness declared by the MemoryRegionOps themselves.

Well, if the guest stores a bunch of integers to the memory, which
way round do you see them when you look at the bunch of bytes? The
answer is different for the two things, so something is different
about their endianness...

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 13:26, Peter Maydell wrote:
> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 23/02/2017 12:34, Peter Maydell wrote:
>>> We should probably update the doc comment to note that the
>>> pointer is to host-endianness memory (and that this is not
>>> like normal RAM which is target-endian)...
>>
>> I wouldn't call it host-endianness memory, and I disagree that normal
>> RAM is target-endian---in both cases it's just a bunch of bytes.
>>
>> However, the access done by the MemoryRegionOps callbacks needs to match
>> the endianness declared by the MemoryRegionOps themselves.
> 
> Well, if the guest stores a bunch of integers to the memory, which
> way round do you see them when you look at the bunch of bytes?

You see them in whatever endianness the guest used.  So let's say a
little-endian guest is writing 0x12345678 to a register.  With
DEVICE_HOST_ENDIAN, some examples are:

- little-endian host, little-endian device.  The guest will write
0x12345678, QEMU will write 0x12345678 which becomes 0x78 0x56 0x34
0x12.  The little-endian device is happy because it sees it as 0x12345678.

- little-endian host, big-endian device.  The guest will write
0x78563412 to account for the different endianness of the host.  QEMU
will also write 0x78563412 which becomes 0x12 0x34 0x56 0x78.  The
big-endian device is happy because it sees it as 0x12345678.

- big-endian host, little-endian device.  The guest will write
0x12345678, memory.c will flip it and QEMU will write 0x78563412.  In
bytes this becomes 0x78 0x56 0x34 0x12.  The little-endian device is
happy because it sees it as 0x12345678.

- big-endian host, big-endian device.  The guest will write 0x78563412
to account for the different endianness of the host.  Memory.c will flip
it and QEMU will write 0x12345678, which becomes 0x12 0x34 0x56 0x78.
The big-endian device is happy because it sees it as 0x12345678.

With DEVICE_LITTLE_ENDIAN you add two flips on big-endian hosts; with
DEVICE_BIG_ENDIAN you add two flips on little-endian hosts.

Paolo

> The answer is different for the two things, so something is different
> about their endianness...


Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 23/02/2017 13:26, Peter Maydell wrote:
>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> On 23/02/2017 12:34, Peter Maydell wrote:
>>>> We should probably update the doc comment to note that the
>>>> pointer is to host-endianness memory (and that this is not
>>>> like normal RAM which is target-endian)...
>>>
>>> I wouldn't call it host-endianness memory, and I disagree that normal
>>> RAM is target-endian---in both cases it's just a bunch of bytes.
>>>
>>> However, the access done by the MemoryRegionOps callbacks needs to match
>>> the endianness declared by the MemoryRegionOps themselves.
>>
>> Well, if the guest stores a bunch of integers to the memory, which
>> way round do you see them when you look at the bunch of bytes?
>
> You see them in whatever endianness the guest used.

I'm confused. I said "normal RAM and this ramdevice memory are
different", and you seem to be saying they're the same. I don't
think they are (in particular I think with a BE guest on an
LE host they'll look different).

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 15:35, Peter Maydell wrote:
> On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>
>>
>> On 23/02/2017 13:26, Peter Maydell wrote:
>>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>> On 23/02/2017 12:34, Peter Maydell wrote:
>>>>> We should probably update the doc comment to note that the
>>>>> pointer is to host-endianness memory (and that this is not
>>>>> like normal RAM which is target-endian)...
>>>>
>>>> I wouldn't call it host-endianness memory, and I disagree that normal
>>>> RAM is target-endian---in both cases it's just a bunch of bytes.
>>>>
>>>> However, the access done by the MemoryRegionOps callbacks needs to match
>>>> the endianness declared by the MemoryRegionOps themselves.
>>>
>>> Well, if the guest stores a bunch of integers to the memory, which
>>> way round do you see them when you look at the bunch of bytes?
>>
>> You see them in whatever endianness the guest used.
> 
> I'm confused. I said "normal RAM and this ramdevice memory are
> different", and you seem to be saying they're the same. I don't
> think they are (in particular I think with a BE guest on an
> LE host they'll look different).

No, they look entirely the same.  The only difference is that they go
through MemoryRegionOps instead of memcpy.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 15:21, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 23/02/2017 15:35, Peter Maydell wrote:
>> On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>
>>> On 23/02/2017 13:26, Peter Maydell wrote:
>>>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>>> On 23/02/2017 12:34, Peter Maydell wrote:
>>>>>> We should probably update the doc comment to note that the
>>>>>> pointer is to host-endianness memory (and that this is not
>>>>>> like normal RAM which is target-endian)...
>>>>>
>>>>> I wouldn't call it host-endianness memory, and I disagree that normal
>>>>> RAM is target-endian---in both cases it's just a bunch of bytes.
>>>>>
>>>>> However, the access done by the MemoryRegionOps callbacks needs to match
>>>>> the endianness declared by the MemoryRegionOps themselves.
>>>>
>>>> Well, if the guest stores a bunch of integers to the memory, which
>>>> way round do you see them when you look at the bunch of bytes?
>>>
>>> You see them in whatever endianness the guest used.
>>
>> I'm confused. I said "normal RAM and this ramdevice memory are
>> different", and you seem to be saying they're the same. I don't
>> think they are (in particular I think with a BE guest on an
>> LE host they'll look different).
>
> No, they look entirely the same.  The only difference is that they go
> through MemoryRegionOps instead of memcpy.

Then we have a different problem, because the thing this patch
is claiming to fix is that the memory the device is backed by
(from vfio) is little-endian and we're not accessing it right.

RAM of the usual sort is target-endian (by which I mean "when the guest
does a write of 32-bits 0x12345678, and you look at the memory byte
by byte then the order of bytes is either 0x12 0x34 0x56 0x78 if
TARGET_LITTLE_ENDIAN or 0x78 0x56 0x34 0x12 if TARGET_BIG_ENDIAN").

AIUI what we want for this VFIO case is "when the guest does
a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
regardless of whether TARGET_BIG_ENDIAN or not".

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 16:29, Peter Maydell wrote:
>> No, they look entirely the same.  The only difference is that they go
>> through MemoryRegionOps instead of memcpy.
> Then we have a different problem, because the thing this patch
> is claiming to fix is that the memory the device is backed by
> (from vfio) is little-endian and we're not accessing it right.
> 
> RAM of the usual sort is target-endian (by which I mean "when the guest
> does a write of 32-bits 0x12345678, and you look at the memory byte
> by byte then the order of bytes is either 0x12 0x34 0x56 0x78 if
> TARGET_LITTLE_ENDIAN or 0x78 0x56 0x34 0x12 if TARGET_BIG_ENDIAN").

Okay, I think I see what you mean.

MMIO regions do not expect their datum in any endianness; they just get
a uint64_t representing an 8/16/32/64-bit value.  The datum undergoes a
conversion if necessary, according to target and MemoryRegionOps
endiannesses, before the callback is invoked.

In the ramd case, the ops convert between
bunch-of-bytes-stored-into-RAM-by-host and uint64_t (to uint64_t for
read, from uint64_t for write).  The conversion from/to target
endianness has been done already by the memory.c core code.  The ops'
task is to ensure that this bunch of bytes has the exact order that the
guest desired.  There's more than one way to do it---all that's needed
is that the order used by the ops matches the one specified by the ops'
endianness, resulting in an even number of swaps.

However, DEVICE_NATIVE_ENDIAN would have to be paired with tswap, which
the current code does not do, hence the bug.  To have no swap at all,
you'd need DEVICE_HOST_ENDIAN.

> AIUI what we want for this VFIO case is "when the guest does
> a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
> regardless of whether TARGET_BIG_ENDIAN or not".

No, I don't think so.  This is not specific to VFIO.  You can do it with
any device, albeit VFIO is currently the only one using ramd regions.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Peter Maydell 142 weeks ago
On 23 February 2017 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
> On 23/02/2017 16:29, Peter Maydell wrote:
>>> No, they look entirely the same.  The only difference is that they go
>>> through MemoryRegionOps instead of memcpy.
>> Then we have a different problem, because the thing this patch
>> is claiming to fix is that the memory the device is backed by
>> (from vfio) is little-endian and we're not accessing it right.
>>
>> RAM of the usual sort is target-endian (by which I mean "when the guest
>> does a write of 32-bits 0x12345678, and you look at the memory byte
>> by byte then the order of bytes is either 0x12 0x34 0x56 0x78 if
>> TARGET_LITTLE_ENDIAN or 0x78 0x56 0x34 0x12 if TARGET_BIG_ENDIAN").
>
> Okay, I think I see what you mean.
>
> MMIO regions do not expect their datum in any endianness; they just get
> a uint64_t representing an 8/16/32/64-bit value.  The datum undergoes a
> conversion if necessary, according to target and MemoryRegionOps
> endiannesses, before the callback is invoked.

Yep. However, how they implement 8/16/32/64 bit transfers
implies an endianness model (assuming it is consistent and
the device provides enough of the operations to determine one).

> In the ramd case, the ops convert between
> bunch-of-bytes-stored-into-RAM-by-host and uint64_t (to uint64_t for
> read, from uint64_t for write).  The conversion from/to target
> endianness has been done already by the memory.c core code.  The ops'
> task is to ensure that this bunch of bytes has the exact order that the
> guest desired.  There's more than one way to do it---all that's needed
> is that the order used by the ops matches the one specified by the ops'
> endianness, resulting in an even number of swaps.
>
> However, DEVICE_NATIVE_ENDIAN would have to be paired with tswap, which
> the current code does not do, hence the bug.  To have no swap at all,
> you'd need DEVICE_HOST_ENDIAN.

Yes, I agree that the current ramdevice code has this bug (and
that we can fix it by any of the various options).

>> AIUI what we want for this VFIO case is "when the guest does
>> a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
>> regardless of whether TARGET_BIG_ENDIAN or not".
>
> No, I don't think so.  This is not specific to VFIO.  You can do it with
> any device, albeit VFIO is currently the only one using ramd regions.

The commit message in the patch that started this thread off
says specifically that "VFIO PCI device is little endian".
Is that wrong?

thanks
-- PMM

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 17:08, Peter Maydell wrote:
> On 23 February 2017 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> However, DEVICE_NATIVE_ENDIAN would have to be paired with tswap, which
>> the current code does not do, hence the bug.  To have no swap at all,
>> you'd need DEVICE_HOST_ENDIAN.
> 
> Yes, I agree that the current ramdevice code has this bug (and
> that we can fix it by any of the various options).

Good. :)

>>> AIUI what we want for this VFIO case is "when the guest does
>>> a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
>>> regardless of whether TARGET_BIG_ENDIAN or not".
>>
>> No, I don't think so.  This is not specific to VFIO.  You can do it with
>> any device, albeit VFIO is currently the only one using ramd regions.
> 
> The commit message in the patch that started this thread off
> says specifically that "VFIO PCI device is little endian".
> Is that wrong?

Yes, I think it's a red herring.  Hence my initial confusion, when I
asked "would Yongji's patch just work if it used DEVICE_BIG_ENDIAN and
beNN_to_cpu/cpu_to_beNN".

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Yongji Xie 142 weeks ago
on 2017/2/24 0:15, Paolo Bonzini wrote:

>
> On 23/02/2017 17:08, Peter Maydell wrote:
>> On 23 February 2017 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>> However, DEVICE_NATIVE_ENDIAN would have to be paired with tswap, which
>>> the current code does not do, hence the bug.  To have no swap at all,
>>> you'd need DEVICE_HOST_ENDIAN.
>> Yes, I agree that the current ramdevice code has this bug (and
>> that we can fix it by any of the various options).
> Good. :)
>
>>>> AIUI what we want for this VFIO case is "when the guest does
>>>> a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
>>>> regardless of whether TARGET_BIG_ENDIAN or not".
>>> No, I don't think so.  This is not specific to VFIO.  You can do it with
>>> any device, albeit VFIO is currently the only one using ramd regions.
>> The commit message in the patch that started this thread off
>> says specifically that "VFIO PCI device is little endian".
>> Is that wrong?
> Yes, I think it's a red herring.  Hence my initial confusion, when I
> asked "would Yongji's patch just work if it used DEVICE_BIG_ENDIAN and
> beNN_to_cpu/cpu_to_beNN".
>

Thank you for the great discussion. I have a better understanding of the
endianness now. :-)

And for the commit message, I was wrong to assume the same endianness
as vfio. That's my fault. This bug should happen when target and host
endianness are different regardless of the device endianness.

To fix it, introducing DEVICE_HOST_ENDIAN for the ram device seems to be
more reasonable than other ways. I think I'll update the patch with this 
way.

diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index bd15853..eef74df 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -36,6 +36,12 @@ enum device_endian {
      DEVICE_LITTLE_ENDIAN,
  };

+#if defined(HOST_WORDS_BIGENDIAN)
+#define DEVICE_HOST_ENDIAN DEVICE_BIG_ENDIAN
+#else
+#define DEVICE_HOST_ENDIAN DEVICE_LITTLE_ENDIAN
+#endif
+
  /* address in the RAM (different from a physical address) */
  #if defined(CONFIG_XEN_BACKEND)
  typedef uint64_t ram_addr_t;
diff --git a/memory.c b/memory.c
index ed8b5aa..17cfada 100644
--- a/memory.c
+++ b/memory.c
@@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void 
*opaque, hwaddr addr,
  static const MemoryRegionOps ram_device_mem_ops = {
      .read = memory_region_ram_device_read,
      .write = memory_region_ram_device_write,
-    .endianness = DEVICE_NATIVE_ENDIAN,
+    .endianness = DEVICE_HOST_ENDIAN,
      .valid = {
          .min_access_size = 1,
          .max_access_size = 8,

Thanks,
Yongji


Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by David Gibson 142 weeks ago
On Fri, Feb 24, 2017 at 01:14:09AM +0800, Yongji Xie wrote:
> on 2017/2/24 0:15, Paolo Bonzini wrote:
> 
> > 
> > On 23/02/2017 17:08, Peter Maydell wrote:
> > > On 23 February 2017 at 15:58, Paolo Bonzini <pbonzini@redhat.com> wrote:
> > > > However, DEVICE_NATIVE_ENDIAN would have to be paired with tswap, which
> > > > the current code does not do, hence the bug.  To have no swap at all,
> > > > you'd need DEVICE_HOST_ENDIAN.
> > > Yes, I agree that the current ramdevice code has this bug (and
> > > that we can fix it by any of the various options).
> > Good. :)
> > 
> > > > > AIUI what we want for this VFIO case is "when the guest does
> > > > > a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
> > > > > regardless of whether TARGET_BIG_ENDIAN or not".
> > > > No, I don't think so.  This is not specific to VFIO.  You can do it with
> > > > any device, albeit VFIO is currently the only one using ramd regions.
> > > The commit message in the patch that started this thread off
> > > says specifically that "VFIO PCI device is little endian".
> > > Is that wrong?
> > Yes, I think it's a red herring.  Hence my initial confusion, when I
> > asked "would Yongji's patch just work if it used DEVICE_BIG_ENDIAN and
> > beNN_to_cpu/cpu_to_beNN".
> > 
> 
> Thank you for the great discussion. I have a better understanding of the
> endianness now.:-)
> 
> And for the commit message, I was wrong to assume the same endianness
> as vfio. That's my fault. This bug should happen when target and host
> endianness are different regardless of the device endianness.
> 
> To fix it, introducing DEVICE_HOST_ENDIAN for the ram device seems to be
> more reasonable than other ways. I think I'll update the patch with this
> way.

I think this is basically the right approach, with the only caveat
being whether we want to call it "host endian" or something else.

> diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
> index bd15853..eef74df 100644
> --- a/include/exec/cpu-common.h
> +++ b/include/exec/cpu-common.h
> @@ -36,6 +36,12 @@ enum device_endian {
>      DEVICE_LITTLE_ENDIAN,
>  };
> 
> +#if defined(HOST_WORDS_BIGENDIAN)
> +#define DEVICE_HOST_ENDIAN DEVICE_BIG_ENDIAN
> +#else
> +#define DEVICE_HOST_ENDIAN DEVICE_LITTLE_ENDIAN
> +#endif
> +
>  /* address in the RAM (different from a physical address) */
>  #if defined(CONFIG_XEN_BACKEND)
>  typedef uint64_t ram_addr_t;
> diff --git a/memory.c b/memory.c
> index ed8b5aa..17cfada 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void
> *opaque, hwaddr addr,
>  static const MemoryRegionOps ram_device_mem_ops = {
>      .read = memory_region_ram_device_read,
>      .write = memory_region_ram_device_write,
> -    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .endianness = DEVICE_HOST_ENDIAN,
>      .valid = {
>          .min_access_size = 1,
>          .max_access_size = 8,
> 
> Thanks,
> Yongji
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paul Mackerras 142 weeks ago
On Thu, Feb 23, 2017 at 03:29:53PM +0000, Peter Maydell wrote:
> On 23 February 2017 at 15:21, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >
> > On 23/02/2017 15:35, Peter Maydell wrote:
> >> On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >>>
> >>>
> >>> On 23/02/2017 13:26, Peter Maydell wrote:
> >>>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >>>>> On 23/02/2017 12:34, Peter Maydell wrote:
> >>>>>> We should probably update the doc comment to note that the
> >>>>>> pointer is to host-endianness memory (and that this is not
> >>>>>> like normal RAM which is target-endian)...
> >>>>>
> >>>>> I wouldn't call it host-endianness memory, and I disagree that normal
> >>>>> RAM is target-endian---in both cases it's just a bunch of bytes.
> >>>>>
> >>>>> However, the access done by the MemoryRegionOps callbacks needs to match
> >>>>> the endianness declared by the MemoryRegionOps themselves.
> >>>>
> >>>> Well, if the guest stores a bunch of integers to the memory, which
> >>>> way round do you see them when you look at the bunch of bytes?
> >>>
> >>> You see them in whatever endianness the guest used.
> >>
> >> I'm confused. I said "normal RAM and this ramdevice memory are
> >> different", and you seem to be saying they're the same. I don't
> >> think they are (in particular I think with a BE guest on an
> >> LE host they'll look different).
> >
> > No, they look entirely the same.  The only difference is that they go
> > through MemoryRegionOps instead of memcpy.
> 
> Then we have a different problem, because the thing this patch
> is claiming to fix is that the memory the device is backed by
> (from vfio) is little-endian and we're not accessing it right.
> 
> RAM of the usual sort is target-endian (by which I mean "when the guest
> does a write of 32-bits 0x12345678, and you look at the memory byte
> by byte then the order of bytes is either 0x12 0x34 0x56 0x78 if
> TARGET_LITTLE_ENDIAN or 0x78 0x56 0x34 0x12 if TARGET_BIG_ENDIAN").
> 
> AIUI what we want for this VFIO case is "when the guest does
> a 32-bit write of 0x12345678 then the bytes are 0x12 0x34 0x56 0x78
> regardless of whether TARGET_BIG_ENDIAN or not".

At least in the case of KVM and MMIO emulation (which is the case
here), run->mmio.data should be considered as a byte stream without
endianness, and what is needed is for QEMU to transfer data between
run->mmio.data and the device (or whatever is backing the MMIO region)
without any net byte swap.

So if QEMU is doing a 32-bit host-endian load from run->mmio.data
(for an MMIO store), then it needs to do a 32-bit host-endian store
into the device memory.  In other words, the RAM memory region needs
to be considered host endian to match run->mmio.data being considered
host endian.

Paul.

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alex Williamson 142 weeks ago
On Thu, 23 Feb 2017 16:21:47 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 23/02/2017 15:35, Peter Maydell wrote:
> > On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:  
> >>
> >>
> >> On 23/02/2017 13:26, Peter Maydell wrote:  
> >>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:  
> >>>> On 23/02/2017 12:34, Peter Maydell wrote:  
> >>>>> We should probably update the doc comment to note that the
> >>>>> pointer is to host-endianness memory (and that this is not
> >>>>> like normal RAM which is target-endian)...  
> >>>>
> >>>> I wouldn't call it host-endianness memory, and I disagree that normal
> >>>> RAM is target-endian---in both cases it's just a bunch of bytes.
> >>>>
> >>>> However, the access done by the MemoryRegionOps callbacks needs to match
> >>>> the endianness declared by the MemoryRegionOps themselves.  
> >>>
> >>> Well, if the guest stores a bunch of integers to the memory, which
> >>> way round do you see them when you look at the bunch of bytes?  
> >>
> >> You see them in whatever endianness the guest used.  
> > 
> > I'm confused. I said "normal RAM and this ramdevice memory are
> > different", and you seem to be saying they're the same. I don't
> > think they are (in particular I think with a BE guest on an
> > LE host they'll look different).  
> 
> No, they look entirely the same.  The only difference is that they go
> through MemoryRegionOps instead of memcpy.

Is this true for vfio use case?  If we use memcpy we're talking directly
to the device with no endian conversions.  If we use read/write then
there is an endian conversion in the host kernel.

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Paolo Bonzini 142 weeks ago

On 23/02/2017 16:39, Alex Williamson wrote:
> On Thu, 23 Feb 2017 16:21:47 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>> On 23/02/2017 15:35, Peter Maydell wrote:
>>> On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:  
>>>>
>>>>
>>>> On 23/02/2017 13:26, Peter Maydell wrote:  
>>>>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:  
>>>>>> On 23/02/2017 12:34, Peter Maydell wrote:  
>>>>>>> We should probably update the doc comment to note that the
>>>>>>> pointer is to host-endianness memory (and that this is not
>>>>>>> like normal RAM which is target-endian)...  
>>>>>>
>>>>>> I wouldn't call it host-endianness memory, and I disagree that normal
>>>>>> RAM is target-endian---in both cases it's just a bunch of bytes.
>>>>>>
>>>>>> However, the access done by the MemoryRegionOps callbacks needs to match
>>>>>> the endianness declared by the MemoryRegionOps themselves.  
>>>>>
>>>>> Well, if the guest stores a bunch of integers to the memory, which
>>>>> way round do you see them when you look at the bunch of bytes?  
>>>>
>>>> You see them in whatever endianness the guest used.  
>>>
>>> I'm confused. I said "normal RAM and this ramdevice memory are
>>> different", and you seem to be saying they're the same. I don't
>>> think they are (in particular I think with a BE guest on an
>>> LE host they'll look different).  
>>
>> No, they look entirely the same.  The only difference is that they go
>> through MemoryRegionOps instead of memcpy.
> 
> Is this true for vfio use case?  If we use memcpy we're talking directly
> to the device with no endian conversions.  If we use read/write then
> there is an endian conversion in the host kernel.

But ramd MemoryRegionOps do not use file read/write, they use memory
read/write, so they talk directly to the device.

Paolo

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alex Williamson 142 weeks ago
On Thu, 23 Feb 2017 16:47:23 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 23/02/2017 16:39, Alex Williamson wrote:
> > On Thu, 23 Feb 2017 16:21:47 +0100
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> >   
> >> On 23/02/2017 15:35, Peter Maydell wrote:  
> >>> On 23 February 2017 at 12:53, Paolo Bonzini <pbonzini@redhat.com> wrote:    
> >>>>
> >>>>
> >>>> On 23/02/2017 13:26, Peter Maydell wrote:    
> >>>>> On 23 February 2017 at 11:43, Paolo Bonzini <pbonzini@redhat.com> wrote:    
> >>>>>> On 23/02/2017 12:34, Peter Maydell wrote:    
> >>>>>>> We should probably update the doc comment to note that the
> >>>>>>> pointer is to host-endianness memory (and that this is not
> >>>>>>> like normal RAM which is target-endian)...    
> >>>>>>
> >>>>>> I wouldn't call it host-endianness memory, and I disagree that normal
> >>>>>> RAM is target-endian---in both cases it's just a bunch of bytes.
> >>>>>>
> >>>>>> However, the access done by the MemoryRegionOps callbacks needs to match
> >>>>>> the endianness declared by the MemoryRegionOps themselves.    
> >>>>>
> >>>>> Well, if the guest stores a bunch of integers to the memory, which
> >>>>> way round do you see them when you look at the bunch of bytes?    
> >>>>
> >>>> You see them in whatever endianness the guest used.    
> >>>
> >>> I'm confused. I said "normal RAM and this ramdevice memory are
> >>> different", and you seem to be saying they're the same. I don't
> >>> think they are (in particular I think with a BE guest on an
> >>> LE host they'll look different).    
> >>
> >> No, they look entirely the same.  The only difference is that they go
> >> through MemoryRegionOps instead of memcpy.  
> > 
> > Is this true for vfio use case?  If we use memcpy we're talking directly
> > to the device with no endian conversions.  If we use read/write then
> > there is an endian conversion in the host kernel.  
> 
> But ramd MemoryRegionOps do not use file read/write, they use memory
> read/write, so they talk directly to the device.

Ah ha!  Ding! ;)  Sorry, I forgot that.

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by David Gibson 142 weeks ago
On Thu, Feb 23, 2017 at 12:43:37PM +0100, Paolo Bonzini wrote:
> 
> 
> On 23/02/2017 12:34, Peter Maydell wrote:
> > On 23 February 2017 at 10:33, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >>
> >>
> >> On 23/02/2017 11:23, Peter Maydell wrote:
> >>> On 23 February 2017 at 10:10, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >>>> On 23/02/2017 11:02, Peter Maydell wrote:
> >>>>> I'm really not convinced we need DEVICE_HOST_ENDIAN. RAM
> >>>>> areas should be target-endian (you can probably define
> >>>>> "target endianness" as "the endianness that RAM areas have".)
> >>>>
> >>>> This is not RAM.  This is MMIO, backed by a MMIO area in the host.
> >>>
> >>> Hmm, I see...the naming is a bit unfortunate if it's not RAM.
> >>
> >> Yeah, it's called like that because it is backed by a RAMBlock but it
> >> returns false for memory_access_is_direct.
> > 
> > We should probably update the doc comment to note that the
> > pointer is to host-endianness memory (and that this is not
> > like normal RAM which is target-endian)...
> 
> I wouldn't call it host-endianness memory, and I disagree that normal
> RAM is target-endian---in both cases it's just a bunch of bytes.

Right.  Really, endianness is not a property of the device - even less
so of RAM.  It's a property of an individual multibyte value and how
it is interpreted relative to individual byte addresses.

Really the declarations in the MRs are saying: assuming the guest
(software + hardware) does (big|little|target native) accesses on this
device, do the right swaps so we get host native multi-byte values
which match the original multibyte values the guest had in mind.

What we want for memory (both RAM and VFIO MMIO) is: don't even try to
preserve multi-byte values between host and guest views, just preserve
byte address order.  Tastes may vary as to whether you call that "host
endian" or "no endian" or "bag-o'-bytesian" or whatever.

> However, the access done by the MemoryRegionOps callbacks needs to match
> the endianness declared by the MemoryRegionOps themselves.
> 
> Paolo
> 
> >>>> The
> >>>> MemoryRegionOps read from the MMIO area (so the data has host
> >>>> endianness) and do not do any further swap:
> >>>>
> >>>>         data = *(uint16_t *)(mr->ram_block->host + addr);
> >>>>
> >>>> Here, the dereference is basically the same as ldl_he_p.
> >>>>
> >>>> If you wanted to make the MemoryRegion use DEVICE_NATIVE_ENDIAN, you'd
> >>>> need to tswap around the access.  Or you can use ldl_le_p and
> >>>> DEVICE_LITTLE_ENDIAN (this is what Yongji's patch open codes), or
> >>>> ldl_be_p and DEVICE_BIG_ENDIAN.  They are all the same in the end.
> >>>
> >>> Using stl_p &c in a DEVICE_NATIVE_ENDIAN MR would work too, right?
> >>> (This is how all the NATIVE_ENDIAN MRs in exec.c work.)
> >>
> >> Yes, it should, as long as the memcpy(...) of {ld,st}*_he_p is compiled
> >> to a single access, which should be the case.
> > 
> > ...and whichever of these approaches we take, we should have
> > a comment which notes that we are converting from the host
> > endianness memory to the endianness specified by the MemoryRegion
> > endianness attribute.
> > 
> > thanks
> > -- PMM
> > 
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alexey Kardashevskiy 142 weeks ago
On 23/02/17 19:35, Paolo Bonzini wrote:
> 
> 
> On 23/02/2017 05:20, Alexey Kardashevskiy wrote:
>> First, Paolo is right and ram_device_mem_ops::endianness should be
>> host-endian which happens to be little in our test case (ppc64le)
> 
> So you tested a ppc64 BE guest and it works?

Yes. Setting ram_device_mem_ops.endianness = DEVICE_LITTLE_ENDIAN does the
job for both LE and BE guests on the LE host.



-- 
Alexey

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Michael Roth 142 weeks ago
Quoting Alexey Kardashevskiy (2017-02-22 22:20:25)
> On 21/02/17 17:46, Yongji Xie wrote:
> > At the moment ram device's memory regions are NATIVE_ENDIAN. This does
> > not work on PPC64 because VFIO PCI device is little endian but PPC64
> > always defines static macro TARGET_WORDS_BIGENDIAN.
> > 
> > This fixes endianness for ram device the same way as it is done
> > for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
> > 
> > Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
> > ---
> >  memory.c |   14 +++++++-------
> >  1 files changed, 7 insertions(+), 7 deletions(-)
> > 
> > diff --git a/memory.c b/memory.c
> > index 6c58373..1ccb99f 100644
> > --- a/memory.c
> > +++ b/memory.c
> > @@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
> >          data = *(uint8_t *)(mr->ram_block->host + addr);
> >          break;
> >      case 2:
> > -        data = *(uint16_t *)(mr->ram_block->host + addr);
> > +        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
> >          break;
> >      case 4:
> > -        data = *(uint32_t *)(mr->ram_block->host + addr);
> > +        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
> >          break;
> >      case 8:
> > -        data = *(uint64_t *)(mr->ram_block->host + addr);
> > +        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
> >          break;
> >      }
> >  
> > @@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
> >          *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
> >          break;
> >      case 2:
> > -        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
> > +        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
> >          break;
> >      case 4:
> > -        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
> > +        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
> >          break;
> >      case 8:
> > -        *(uint64_t *)(mr->ram_block->host + addr) = data;
> > +        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
> >          break;
> >      }
> >  }
> > @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
> >  static const MemoryRegionOps ram_device_mem_ops = {
> >      .read = memory_region_ram_device_read,
> >      .write = memory_region_ram_device_write,
> > -    .endianness = DEVICE_NATIVE_ENDIAN,
> > +    .endianness = DEVICE_LITTLE_ENDIAN,
> >      .valid = {
> >          .min_access_size = 1,
> >          .max_access_size = 8,
> >
>         
> 
> I did some debugging today.
> 
> First, Paolo is right and ram_device_mem_ops::endianness should be
> host-endian which happens to be little in our test case (ppc64le) so
> changes to .read/.write are actually no-op (I believe so, have not checked).
> 
> 
> But I was wondering why this gets executed at all.
> 
> The test case is:
> 
> qemu-system-ppc64 ...
> -device "vfio-pci,id=vfio0001_03_00_0,host=0001:03:00.0
> -drive id=DRIVE0,if=none,file=./test.qcow2,format=qcow2
> -device virtio-blk-pci,id=vblk0,drive=DRIVE0
> 
> The host kernel is v4.10, ppc64le (little endian), 64K system page size,
> QEMU is v2.8.0.
> 
> When this boots, lspci shows:
> 
> 00:00.0 Ethernet controller: Chelsio Communications Inc T310 10GbE Single
> Port Adapter
>         Subsystem: IBM Device 038c
>         Flags: bus master, fast devsel, latency 0, IRQ 18
>         Memory at 210000004000 (64-bit, non-prefetchable) [size=4K]
>         Memory at 210000800000 (64-bit, non-prefetchable) [size=8M]
>         Memory at 210001000000 (64-bit, non-prefetchable) [size=4K]
>         Expansion ROM at 2000c0080000 [disabled] [size=512K]
>         Capabilities: [40] Power Management version 3
>         Capabilities: [48] MSI: Enable- Count=1/32 Maskable- 64bit+
>         Capabilities: [58] Express Endpoint, MSI 00
>         Capabilities: [94] Vital Product Data
>         Capabilities: [9c] MSI-X: Enable+ Count=32 Masked-
>         Kernel driver in use: cxgb3
> 
> 00:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
>         Subsystem: Red Hat, Inc Device 0002
>         Physical Slot: C16
>         Flags: bus master, fast devsel, latency 0, IRQ 17
>         I/O ports at 0040 [size=64]
>         Memory at 2000c0000000 (32-bit, non-prefetchable) [size=4K]
>         Memory at 210000000000 (64-bit, prefetchable) [size=16K]
>         Capabilities: [98] MSI-X: Enable+ Count=2 Masked-
>         Capabilities: [84] Vendor Specific Information: Len=14 <?>
>         Capabilities: [70] Vendor Specific Information: Len=14 <?>
>         Capabilities: [60] Vendor Specific Information: Len=10 <?>
>         Capabilities: [50] Vendor Specific Information: Len=10 <?>
>         Capabilities: [40] Vendor Specific Information: Len=10 <?>
>         Kernel driver in use: virtio-pci
> 
> 
> As we can see, BAR0 of Chelsio is 210000004000 - not aligned (it should
> have been aligned but it is not - this is another bug, in QEMU). Normally

I think SLOF is the culprit in this case. The patch below enforces
64k alignment for mmio/mem regions at boot-time. I'm not sure if this
should be done for all devices, or limited specifically to VFIO though
(controlled perhaps via a per-device property? or at the machine-level
at least?):

https://github.com/mdroth/SLOF/commit/6ff6827740aba39d8db9bebcc3ae069bdf934d06

I've only sniff-tested it with virtio so far. Does this fix things
for Chelsio?

aligned (enabled):
  Bus  0, device   2, function 0:
    Ethernet controller: PCI device 1af4:1000
      IRQ 0.
      BAR0: I/O at 0x0040 [0x005f].
      BAR1: 32 bit memory at 0xc0080000 [0xc0080fff].
      BAR4: 64 bit prefetchable memory at 0x210000010000 [0x210000013fff].
      BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
      id ""
  Bus  0, device   3, function 0:
    Ethernet controller: PCI device 1af4:1000
      IRQ 0.
      BAR0: I/O at 0x0020 [0x003f].
      BAR1: 32 bit memory at 0xc0000000 [0xc0000fff].
      BAR4: 64 bit prefetchable memory at 0x210000000000 [0x210000003fff].
      BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
      id ""

aligned (disabled):
  Bus  0, device   2, function 0:
    Ethernet controller: PCI device 1af4:1000
      IRQ 0.
      BAR0: I/O at 0x0040 [0x005f].
      BAR1: 32 bit memory at 0xc0080000 [0xc0080fff].
      BAR4: 64 bit prefetchable memory at 0x210000004000 [0x210000007fff].
      BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
      id ""
  Bus  0, device   3, function 0:
    Ethernet controller: PCI device 1af4:1000
      IRQ 0.
      BAR0: I/O at 0x0020 [0x003f].
      BAR1: 32 bit memory at 0xc0000000 [0xc0000fff].
      BAR4: 64 bit prefetchable memory at 0x210000000000 [0x210000003fff].
      BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
      id ""

upstream:
  Bus  0, device   2, function 0:
    Ethernet controller: PCI device 1af4:1000
      IRQ 0.
      BAR0: I/O at 0x0040 [0x005f].
      BAR1: 32 bit memory at 0xc0080000 [0xc0080fff].
      BAR4: 64 bit prefetchable memory at 0x210000004000 [0x210000007fff].
      BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
      id ""
  Bus  0, device   3, function 0:
    Ethernet controller: PCI device 1af4:1000
      IRQ 0.
      BAR0: I/O at 0x0020 [0x003f].
      BAR1: 32 bit memory at 0xc0000000 [0xc0000fff].
      BAR4: 64 bit prefetchable memory at 0x210000000000 [0x210000003fff].
      BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
      id ""


> such a BAR would be emulated by VFIO. However, since 95251725e335 "vfio:
> Add support for mmapping sub-page MMIO BARs" we mmap such BARs to QEMU in a
> hope they will be registered later via KVM_SET_USER_MEMORY_REGION - and
> this fails as the guest address is not host page size aligned.
> 
> So we end up having the following memory tree:
> 
> memory-region: pci@800000020000000.mmio
>   0000000000000000-ffffffffffffffff (prio 0, RW): pci@800000020000000.mmio
>     00000000c0000000-00000000c0000fff (prio 1, RW): virtio-blk-pci-msix
>       00000000c0000000-00000000c000001f (prio 0, RW): msix-table
>       00000000c0000800-00000000c0000807 (prio 0, RW): msix-pba
>     0000210000000000-0000210000003fff (prio 1, RW): virtio-pci
>       0000210000000000-0000210000000fff (prio 0, RW): virtio-pci-common
>       0000210000001000-0000210000001fff (prio 0, RW): virtio-pci-isr
>       0000210000002000-0000210000002fff (prio 0, RW): virtio-pci-device
>       0000210000003000-0000210000003fff (prio 0, RW): virtio-pci-notify
>     0000210000004000-0000210000004fff (prio 1, RW): 0001:03:00.0 BAR 0
>       0000210000004000-0000210000004fff (prio 0, RW): 0001:03:00.0 BAR 0
> mmaps[0]
>     0000210000800000-0000210000ffffff (prio 1, RW): 0001:03:00.0 BAR 2
>       0000210000800000-0000210000ffffff (prio 0, RW): 0001:03:00.0 BAR 2
> mmaps[0]
>     0000210001000000-0000210001000fff (prio 1, RW): 0001:03:00.0 BAR 4
>       0000210001000000-00002100010001ff (prio 0, RW): msix-table
>       0000210001000800-0000210001000807 (prio 0, RW): msix-pba [disabled]
> 
> The problem region which this patch is fixing is "0001:03:00.0 BAR 0
> mmaps[0]". It is mmaped to QEMU and not registered in KVM. So when the
> guest accesses this BAR, we trap into KVM, KVM passes it to QEMU and QEMU
> read/writes this memory region.
> 
> A simple hack like below fixes it - it basically removes mmap'd memory
> region from the tree and MMIO starts being handled by the parent MR -
> "0001:03:00.0 BAR 0" (which is "little endian" - vfio_region_ops).
> 
> 
> I am wondering now what would be a correct approach here?
> 
> Add/remove mmap'd MRs once we detect aligned/unaligned BARs?
> 
> Keep things where they are in the VFIO department and just fix
> ram_device_mem_ops::endianness?

VFIO tries to report similar scenarios in realize():

  vfio_bar_setup:
    if (vfio_region_mmap(&bar->region)) {
      error_report("Failed to mmap %s BAR %d. Performance may be slow",
                   vdev->vbasedev.name, nr);
    }

maybe we should at least be reporting it in this case as well? In our
case we'll see it every reset/hotplug though, so maybe a trace makes
more sense. Even with the patch for SLOF this would continue to be an
issue for hotplug until BAR assignment is moved to QEMU for pseries,
so would be good to have a simple way to check for it. Can send a patch
if that makes sense.

> 
> Thanks.
> 
> 
> 
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index d7dbe0e3e0..0657a27623 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -1109,7 +1109,10 @@ static void
> vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
>      memory_region_transaction_begin();
> 
>      memory_region_set_size(mr, size);
> -    memory_region_set_size(mmap_mr, size);
> +    if (bar_addr & qemu_real_host_page_mask)
> +          memory_region_del_subregion(mr, mmap_mr);
> +    else
> +        memory_region_set_size(mmap_mr, size);
>      if (size != region->size && memory_region_is_mapped(mr)) {
>          memory_region_del_subregion(r->address_space, mr);
>          memory_region_add_subregion_overlap(r->address_space,
> 
> 
> 
> -- 
> Alexey
> 


Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Alexey Kardashevskiy 142 weeks ago
On 27/02/17 13:25, Michael Roth wrote:
> Quoting Alexey Kardashevskiy (2017-02-22 22:20:25)
>> On 21/02/17 17:46, Yongji Xie wrote:
>>> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
>>> not work on PPC64 because VFIO PCI device is little endian but PPC64
>>> always defines static macro TARGET_WORDS_BIGENDIAN.
>>>
>>> This fixes endianness for ram device the same way as it is done
>>> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
>>>
>>> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
>>> ---
>>>  memory.c |   14 +++++++-------
>>>  1 files changed, 7 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/memory.c b/memory.c
>>> index 6c58373..1ccb99f 100644
>>> --- a/memory.c
>>> +++ b/memory.c
>>> @@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
>>>          data = *(uint8_t *)(mr->ram_block->host + addr);
>>>          break;
>>>      case 2:
>>> -        data = *(uint16_t *)(mr->ram_block->host + addr);
>>> +        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
>>>          break;
>>>      case 4:
>>> -        data = *(uint32_t *)(mr->ram_block->host + addr);
>>> +        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
>>>          break;
>>>      case 8:
>>> -        data = *(uint64_t *)(mr->ram_block->host + addr);
>>> +        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
>>>          break;
>>>      }
>>>  
>>> @@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>>>          *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
>>>          break;
>>>      case 2:
>>> -        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
>>> +        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
>>>          break;
>>>      case 4:
>>> -        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
>>> +        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
>>>          break;
>>>      case 8:
>>> -        *(uint64_t *)(mr->ram_block->host + addr) = data;
>>> +        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
>>>          break;
>>>      }
>>>  }
>>> @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>>>  static const MemoryRegionOps ram_device_mem_ops = {
>>>      .read = memory_region_ram_device_read,
>>>      .write = memory_region_ram_device_write,
>>> -    .endianness = DEVICE_NATIVE_ENDIAN,
>>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>>>      .valid = {
>>>          .min_access_size = 1,
>>>          .max_access_size = 8,
>>>
>>         
>>
>> I did some debugging today.
>>
>> First, Paolo is right and ram_device_mem_ops::endianness should be
>> host-endian which happens to be little in our test case (ppc64le) so
>> changes to .read/.write are actually no-op (I believe so, have not checked).
>>
>>
>> But I was wondering why this gets executed at all.
>>
>> The test case is:
>>
>> qemu-system-ppc64 ...
>> -device "vfio-pci,id=vfio0001_03_00_0,host=0001:03:00.0
>> -drive id=DRIVE0,if=none,file=./test.qcow2,format=qcow2
>> -device virtio-blk-pci,id=vblk0,drive=DRIVE0
>>
>> The host kernel is v4.10, ppc64le (little endian), 64K system page size,
>> QEMU is v2.8.0.
>>
>> When this boots, lspci shows:
>>
>> 00:00.0 Ethernet controller: Chelsio Communications Inc T310 10GbE Single
>> Port Adapter
>>         Subsystem: IBM Device 038c
>>         Flags: bus master, fast devsel, latency 0, IRQ 18
>>         Memory at 210000004000 (64-bit, non-prefetchable) [size=4K]
>>         Memory at 210000800000 (64-bit, non-prefetchable) [size=8M]
>>         Memory at 210001000000 (64-bit, non-prefetchable) [size=4K]
>>         Expansion ROM at 2000c0080000 [disabled] [size=512K]
>>         Capabilities: [40] Power Management version 3
>>         Capabilities: [48] MSI: Enable- Count=1/32 Maskable- 64bit+
>>         Capabilities: [58] Express Endpoint, MSI 00
>>         Capabilities: [94] Vital Product Data
>>         Capabilities: [9c] MSI-X: Enable+ Count=32 Masked-
>>         Kernel driver in use: cxgb3
>>
>> 00:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
>>         Subsystem: Red Hat, Inc Device 0002
>>         Physical Slot: C16
>>         Flags: bus master, fast devsel, latency 0, IRQ 17
>>         I/O ports at 0040 [size=64]
>>         Memory at 2000c0000000 (32-bit, non-prefetchable) [size=4K]
>>         Memory at 210000000000 (64-bit, prefetchable) [size=16K]
>>         Capabilities: [98] MSI-X: Enable+ Count=2 Masked-
>>         Capabilities: [84] Vendor Specific Information: Len=14 <?>
>>         Capabilities: [70] Vendor Specific Information: Len=14 <?>
>>         Capabilities: [60] Vendor Specific Information: Len=10 <?>
>>         Capabilities: [50] Vendor Specific Information: Len=10 <?>
>>         Capabilities: [40] Vendor Specific Information: Len=10 <?>
>>         Kernel driver in use: virtio-pci
>>
>>
>> As we can see, BAR0 of Chelsio is 210000004000 - not aligned (it should
>> have been aligned but it is not - this is another bug, in QEMU). Normally
> 
> I think SLOF is the culprit in this case. The patch below enforces
> 64k alignment for mmio/mem regions at boot-time. I'm not sure if this
> should be done for all devices, or limited specifically to VFIO though
> (controlled perhaps via a per-device property? or at the machine-level
> at least?):


I was sure we have this in SLOF and now I see that we don't. Hm :)



> https://github.com/mdroth/SLOF/commit/6ff6827740aba39d8db9bebcc3ae069bdf934d06
> 
> I've only sniff-tested it with virtio so far. Does this fix things
> for Chelsio?
> 
> aligned (enabled):
>   Bus  0, device   2, function 0:
>     Ethernet controller: PCI device 1af4:1000
>       IRQ 0.
>       BAR0: I/O at 0x0040 [0x005f].
>       BAR1: 32 bit memory at 0xc0080000 [0xc0080fff].
>       BAR4: 64 bit prefetchable memory at 0x210000010000 [0x210000013fff].
>       BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
>       id ""
>   Bus  0, device   3, function 0:
>     Ethernet controller: PCI device 1af4:1000
>       IRQ 0.
>       BAR0: I/O at 0x0020 [0x003f].
>       BAR1: 32 bit memory at 0xc0000000 [0xc0000fff].
>       BAR4: 64 bit prefetchable memory at 0x210000000000 [0x210000003fff].
>       BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
>       id ""
> 
> aligned (disabled):
>   Bus  0, device   2, function 0:
>     Ethernet controller: PCI device 1af4:1000
>       IRQ 0.
>       BAR0: I/O at 0x0040 [0x005f].
>       BAR1: 32 bit memory at 0xc0080000 [0xc0080fff].
>       BAR4: 64 bit prefetchable memory at 0x210000004000 [0x210000007fff].
>       BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
>       id ""
>   Bus  0, device   3, function 0:
>     Ethernet controller: PCI device 1af4:1000
>       IRQ 0.
>       BAR0: I/O at 0x0020 [0x003f].
>       BAR1: 32 bit memory at 0xc0000000 [0xc0000fff].
>       BAR4: 64 bit prefetchable memory at 0x210000000000 [0x210000003fff].
>       BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
>       id ""
> 
> upstream:
>   Bus  0, device   2, function 0:
>     Ethernet controller: PCI device 1af4:1000
>       IRQ 0.
>       BAR0: I/O at 0x0040 [0x005f].
>       BAR1: 32 bit memory at 0xc0080000 [0xc0080fff].
>       BAR4: 64 bit prefetchable memory at 0x210000004000 [0x210000007fff].
>       BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
>       id ""
>   Bus  0, device   3, function 0:
>     Ethernet controller: PCI device 1af4:1000
>       IRQ 0.
>       BAR0: I/O at 0x0020 [0x003f].
>       BAR1: 32 bit memory at 0xc0000000 [0xc0000fff].
>       BAR4: 64 bit prefetchable memory at 0x210000000000 [0x210000003fff].
>       BAR6: 32 bit memory at 0xffffffffffffffff [0x0003fffe].
>       id ""
> 
> 
>> such a BAR would be emulated by VFIO. However, since 95251725e335 "vfio:
>> Add support for mmapping sub-page MMIO BARs" we mmap such BARs into QEMU in
>> the hope that they will be registered later via KVM_SET_USER_MEMORY_REGION - and
>> this fails as the guest address is not host page size aligned.
>>
>> So we end up having the following memory tree:
>>
>> memory-region: pci@800000020000000.mmio
>>   0000000000000000-ffffffffffffffff (prio 0, RW): pci@800000020000000.mmio
>>     00000000c0000000-00000000c0000fff (prio 1, RW): virtio-blk-pci-msix
>>       00000000c0000000-00000000c000001f (prio 0, RW): msix-table
>>       00000000c0000800-00000000c0000807 (prio 0, RW): msix-pba
>>     0000210000000000-0000210000003fff (prio 1, RW): virtio-pci
>>       0000210000000000-0000210000000fff (prio 0, RW): virtio-pci-common
>>       0000210000001000-0000210000001fff (prio 0, RW): virtio-pci-isr
>>       0000210000002000-0000210000002fff (prio 0, RW): virtio-pci-device
>>       0000210000003000-0000210000003fff (prio 0, RW): virtio-pci-notify
>>     0000210000004000-0000210000004fff (prio 1, RW): 0001:03:00.0 BAR 0
>>       0000210000004000-0000210000004fff (prio 0, RW): 0001:03:00.0 BAR 0
>> mmaps[0]
>>     0000210000800000-0000210000ffffff (prio 1, RW): 0001:03:00.0 BAR 2
>>       0000210000800000-0000210000ffffff (prio 0, RW): 0001:03:00.0 BAR 2
>> mmaps[0]
>>     0000210001000000-0000210001000fff (prio 1, RW): 0001:03:00.0 BAR 4
>>       0000210001000000-00002100010001ff (prio 0, RW): msix-table
>>       0000210001000800-0000210001000807 (prio 0, RW): msix-pba [disabled]
>>
>> The problem region which this patch is fixing is "0001:03:00.0 BAR 0
>> mmaps[0]". It is mmaped to QEMU and not registered in KVM. So when the
>> guest accesses this BAR, we trap into KVM, KVM passes it to QEMU and QEMU
>> read/writes this memory region.
>>
>> A simple hack like below fixes it - it basically removes mmap'd memory
>> region from the tree and MMIO starts being handled by the parent MR -
>> "0001:03:00.0 BAR 0" (which is "little endian" - vfio_region_ops).
>>
>>
>> I am wondering now what would be a correct approach here?
>>
>> Add/remove mmap'd MRs once we detect aligned/unaligned BARs?
>>
>> Keep things where they are in the VFIO department and just fix
>> ram_device_mem_ops::endianness?
> 
> VFIO tries to report similar scenarios in realize():
> 
>   vfio_bar_setup:
>     if (vfio_region_mmap(&bar->region)) {
>       error_report("Failed to mmap %s BAR %d. Performance may be slow",
>                    vdev->vbasedev.name, nr);
>     }
> 
> maybe we should at least be reporting it in this case as well? In our
> case we'll see it every reset/hotplug though, so maybe a trace makes
> more sense.


Tracepoint should be enough imho.


> Even with the patch for SLOF this would continue to be an
> issue for hotplug until BAR assignment is moved to QEMU for pseries,
> so would be good to have a simple way to check for it. Can send a patch
> if that makes sense.

Can BAR allocation be done by QEMU when hotplugging?


> 
>>
>> Thanks.
>>
>>
>>
>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>> index d7dbe0e3e0..0657a27623 100644
>> --- a/hw/vfio/pci.c
>> +++ b/hw/vfio/pci.c
>> @@ -1109,7 +1109,10 @@ static void
>> vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
>>      memory_region_transaction_begin();
>>
>>      memory_region_set_size(mr, size);
>> -    memory_region_set_size(mmap_mr, size);
>> +    if (bar_addr & qemu_real_host_page_mask)
>> +          memory_region_del_subregion(mr, mmap_mr);
>> +    else
>> +        memory_region_set_size(mmap_mr, size);
>>      if (size != region->size && memory_region_is_mapped(mr)) {
>>          memory_region_del_subregion(r->address_space, mr);
>>          memory_region_add_subregion_overlap(r->address_space,
>>
>>
>>
>> -- 
>> Alexey
>>
> 


-- 
Alexey

Re: [Qemu-devel] [PATCH] memory: make ram device read/write endian sensitive

Posted by Yongji Xie 142 weeks ago
on 2017/2/27 11:25, Alexey Kardashevskiy wrote:

> On 27/02/17 13:25, Michael Roth wrote:
>> Quoting Alexey Kardashevskiy (2017-02-22 22:20:25)
>>> On 21/02/17 17:46, Yongji Xie wrote:
>>>> At the moment ram device's memory regions are NATIVE_ENDIAN. This does
>>>> not work on PPC64 because VFIO PCI device is little endian but PPC64
>>>> always defines static macro TARGET_WORDS_BIGENDIAN.
>>>>
>>>> This fixes endianness for ram device the same way as it is done
>>>> for VFIO region in commit 6758008e2c4e79fb6bd04fe8e7a41665fa583965.
>>>>
>>>> Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
>>>> ---
>>>>   memory.c |   14 +++++++-------
>>>>   1 files changed, 7 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/memory.c b/memory.c
>>>> index 6c58373..1ccb99f 100644
>>>> --- a/memory.c
>>>> +++ b/memory.c
>>>> @@ -1139,13 +1139,13 @@ static uint64_t memory_region_ram_device_read(void *opaque,
>>>>           data = *(uint8_t *)(mr->ram_block->host + addr);
>>>>           break;
>>>>       case 2:
>>>> -        data = *(uint16_t *)(mr->ram_block->host + addr);
>>>> +        data = le16_to_cpu(*(uint16_t *)(mr->ram_block->host + addr));
>>>>           break;
>>>>       case 4:
>>>> -        data = *(uint32_t *)(mr->ram_block->host + addr);
>>>> +        data = le32_to_cpu(*(uint32_t *)(mr->ram_block->host + addr));
>>>>           break;
>>>>       case 8:
>>>> -        data = *(uint64_t *)(mr->ram_block->host + addr);
>>>> +        data = le64_to_cpu(*(uint64_t *)(mr->ram_block->host + addr));
>>>>           break;
>>>>       }
>>>>   
>>>> @@ -1166,13 +1166,13 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>>>>           *(uint8_t *)(mr->ram_block->host + addr) = (uint8_t)data;
>>>>           break;
>>>>       case 2:
>>>> -        *(uint16_t *)(mr->ram_block->host + addr) = (uint16_t)data;
>>>> +        *(uint16_t *)(mr->ram_block->host + addr) = cpu_to_le16((uint16_t)data);
>>>>           break;
>>>>       case 4:
>>>> -        *(uint32_t *)(mr->ram_block->host + addr) = (uint32_t)data;
>>>> +        *(uint32_t *)(mr->ram_block->host + addr) = cpu_to_le32((uint32_t)data);
>>>>           break;
>>>>       case 8:
>>>> -        *(uint64_t *)(mr->ram_block->host + addr) = data;
>>>> +        *(uint64_t *)(mr->ram_block->host + addr) = cpu_to_le64(data);
>>>>           break;
>>>>       }
>>>>   }
>>>> @@ -1180,7 +1180,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr,
>>>>   static const MemoryRegionOps ram_device_mem_ops = {
>>>>       .read = memory_region_ram_device_read,
>>>>       .write = memory_region_ram_device_write,
>>>> -    .endianness = DEVICE_NATIVE_ENDIAN,
>>>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>>>>       .valid = {
>>>>           .min_access_size = 1,
>>>>           .max_access_size = 8,
>>>>
>>>          
>>>
>>> I did some debugging today.
>>>
>>> First, Paolo is right and ram_device_mem_ops::endianness should be
>>> host-endian, which happens to be little in our test case (ppc64le), so the
>>> changes to .read/.write are actually a no-op (I believe so, but have not checked).
>>>
>>>
>>> But I was wondering why this gets executed at all.
>>>
>>> The test case is:
>>>
>>> qemu-system-ppc64 ...
>>> -device "vfio-pci,id=vfio0001_03_00_0,host=0001:03:00.0
>>> -drive id=DRIVE0,if=none,file=./test.qcow2,format=qcow2
>>> -device virtio-blk-pci,id=vblk0,drive=DRIVE0
>>>
>>> The host kernel is v4.10, ppc64le (little endian), 64K system page size,
>>> QEMU is v2.8.0.
>>>
>>> When this boots, lspci shows:
>>>
>>> 00:00.0 Ethernet controller: Chelsio Communications Inc T310 10GbE Single
>>> Port Adapter
>>>          Subsystem: IBM Device 038c
>>>          Flags: bus master, fast devsel, latency 0, IRQ 18
>>>          Memory at 210000004000 (64-bit, non-prefetchable) [size=4K]
>>>          Memory at 210000800000 (64-bit, non-prefetchable) [size=8M]
>>>          Memory at 210001000000 (64-bit, non-prefetchable) [size=4K]
>>>          Expansion ROM at 2000c0080000 [disabled] [size=512K]
>>>          Capabilities: [40] Power Management version 3
>>>          Capabilities: [48] MSI: Enable- Count=1/32 Maskable- 64bit+
>>>          Capabilities: [58] Express Endpoint, MSI 00
>>>          Capabilities: [94] Vital Product Data
>>>          Capabilities: [9c] MSI-X: Enable+ Count=32 Masked-
>>>          Kernel driver in use: cxgb3
>>>
>>> 00:01.0 SCSI storage controller: Red Hat, Inc Virtio block device
>>>          Subsystem: Red Hat, Inc Device 0002
>>>          Physical Slot: C16
>>>          Flags: bus master, fast devsel, latency 0, IRQ 17
>>>          I/O ports at 0040 [size=64]
>>>          Memory at 2000c0000000 (32-bit, non-prefetchable) [size=4K]
>>>          Memory at 210000000000 (64-bit, prefetchable) [size=16K]
>>>          Capabilities: [98] MSI-X: Enable+ Count=2 Masked-
>>>          Capabilities: [84] Vendor Specific Information: Len=14 <?>
>>>          Capabilities: [70] Vendor Specific Information: Len=14 <?>
>>>          Capabilities: [60] Vendor Specific Information: Len=10 <?>
>>>          Capabilities: [50] Vendor Specific Information: Len=10 <?>
>>>          Capabilities: [40] Vendor Specific Information: Len=10 <?>
>>>          Kernel driver in use: virtio-pci
>>>
>>>
>>> As we can see, BAR0 of Chelsio is 210000004000 - not aligned (it should
>>> have been aligned but it is not - this is another bug, in QEMU). Normally
>> I think SLOF is the culprit in this case. The patch below enforces
>> 64k alignment for mmio/mem regions at boot-time. I'm not sure if this
>> should be done for all devices, or limited specifically to VFIO though
>> (controlled perhaps via a per-device property? or at the machine-level
>> at least?):
>
> I was sure we have this in SLOF and now I see that we don't. Hm :)
>

I did a quick check. It seems the SLOF patch was only in the pkvm3.1.1
SLOF tree...

Thanks,
Yongji