[PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size

William Roche posted 4 patches 1 month ago
There is a newer version of this series
[PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by William Roche 1 month ago
From: William Roche <william.roche@oracle.com>

Add the page size information to the hwpoison_page_list elements.
As the kernel doesn't always report the actual poisoned page size,
we adjust this size from the backend real page size.
We take into account the recorded page size to adjust the size
and location of the memory hole.

Signed-off-by: William Roche <william.roche@oracle.com>
---
 accel/kvm/kvm-all.c       | 14 ++++++++++----
 include/exec/cpu-common.h |  1 +
 include/sysemu/kvm.h      |  3 ++-
 include/sysemu/kvm_int.h  |  3 ++-
 system/physmem.c          | 20 ++++++++++++++++++++
 target/arm/kvm.c          |  8 ++++++--
 target/i386/kvm/kvm.c     |  8 ++++++--
 7 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 2adc4d9c24..40117eefa7 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension)
  */
 typedef struct HWPoisonPage {
     ram_addr_t ram_addr;
+    size_t     page_size;
     QLIST_ENTRY(HWPoisonPage) list;
 } HWPoisonPage;
 
@@ -1278,15 +1279,18 @@ static void kvm_unpoison_all(void *param)
 
     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
         QLIST_REMOVE(page, list);
-        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
+        qemu_ram_remap(page->ram_addr, page->page_size);
         g_free(page);
     }
 }
 
-void kvm_hwpoison_page_add(ram_addr_t ram_addr)
+void kvm_hwpoison_page_add(ram_addr_t ram_addr, size_t sz)
 {
     HWPoisonPage *page;
 
+    if (sz > TARGET_PAGE_SIZE)
+        ram_addr = ROUND_DOWN(ram_addr, sz);
+
     QLIST_FOREACH(page, &hwpoison_page_list, list) {
         if (page->ram_addr == ram_addr) {
             return;
@@ -1294,6 +1298,7 @@ void kvm_hwpoison_page_add(ram_addr_t ram_addr)
     }
     page = g_new(HWPoisonPage, 1);
     page->ram_addr = ram_addr;
+    page->page_size = sz;
     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
 }
 
@@ -3140,7 +3145,8 @@ int kvm_cpu_exec(CPUState *cpu)
         if (unlikely(have_sigbus_pending)) {
             bql_lock();
             kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
-                                    pending_sigbus_addr);
+                                    pending_sigbus_addr,
+                                    pending_sigbus_addr_lsb);
             have_sigbus_pending = false;
             bql_unlock();
         }
@@ -3678,7 +3684,7 @@ int kvm_on_sigbus(int code, void *addr, short addr_lsb)
      * we can only get action optional here.
      */
     assert(code != BUS_MCEERR_AR);
-    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
+    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr, addr_lsb);
     return 0;
 #else
     return 1;
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 638dc806a5..b971b13306 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -108,6 +108,7 @@ bool qemu_ram_is_named_file(RAMBlock *rb);
 int qemu_ram_get_fd(RAMBlock *rb);
 
 size_t qemu_ram_pagesize(RAMBlock *block);
+size_t qemu_ram_pagesize_from_host(void *addr);
 size_t qemu_ram_pagesize_largest(void);
 
 /**
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 1bde598404..4106a7ec07 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -383,7 +383,8 @@ bool kvm_vcpu_id_is_valid(int vcpu_id);
 unsigned long kvm_arch_vcpu_id(CPUState *cpu);
 
 #ifdef KVM_HAVE_MCE_INJECTION
-void kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr);
+void kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr,
+                             short addr_lsb);
 #endif
 
 void kvm_arch_init_irq_routing(KVMState *s);
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index a1e72763da..d2160be0ae 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -178,10 +178,11 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size);
  *
  * Parameters:
  *  @ram_addr: the address in the RAM for the poisoned page
+ *  @sz: size of the poisoned page as reported by the kernel
  *
  * Add a poisoned page to the list
  *
  * Return: None.
  */
-void kvm_hwpoison_page_add(ram_addr_t ram_addr);
+void kvm_hwpoison_page_add(ram_addr_t ram_addr, size_t sz);
 #endif
diff --git a/system/physmem.c b/system/physmem.c
index dc1db3a384..3757428336 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -1665,6 +1665,26 @@ size_t qemu_ram_pagesize(RAMBlock *rb)
     return rb->page_size;
 }
 
+/* Returns backend real page size used for the given address */
+size_t qemu_ram_pagesize_from_host(void *addr)
+{
+    RAMBlock *rb;
+    ram_addr_t offset;
+
+    /*
+     * XXX kernel provided size is not reliable...
+     * As kvm_send_hwpoison_signal() uses a hard-coded PAGE_SHIFT
+     * signal value on hwpoison signal.
+     * So we must identify the actual size to consider from the
+     * mapping block pagesize.
+     */
+    rb =  qemu_ram_block_from_host(addr, false, &offset);
+    if (!rb) {
+        return TARGET_PAGE_SIZE;
+    }
+    return qemu_ram_pagesize(rb);
+}
+
 /* Returns the largest size of page in use */
 size_t qemu_ram_pagesize_largest(void)
 {
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index f1f1b5b375..11579e170b 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -2348,10 +2348,11 @@ int kvm_arch_get_registers(CPUState *cs, Error **errp)
     return ret;
 }
 
-void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
+void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr, short addr_lsb)
 {
     ram_addr_t ram_addr;
     hwaddr paddr;
+    size_t sz = (addr_lsb > 0) ? (1 << addr_lsb) : TARGET_PAGE_SIZE;
 
     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
 
@@ -2359,7 +2360,10 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-            kvm_hwpoison_page_add(ram_addr);
+            if (sz == TARGET_PAGE_SIZE) {
+                sz = qemu_ram_pagesize_from_host(addr);
+            }
+            kvm_hwpoison_page_add(ram_addr, sz);
             /*
              * If this is a BUS_MCEERR_AR, we know we have been called
              * synchronously from the vCPU thread, so we can easily
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index fd9f198892..71e674bca0 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -735,12 +735,13 @@ static void hardware_memory_error(void *host_addr)
     exit(1);
 }
 
-void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
+void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr, short addr_lsb)
 {
     X86CPU *cpu = X86_CPU(c);
     CPUX86State *env = &cpu->env;
     ram_addr_t ram_addr;
     hwaddr paddr;
+    size_t sz = (addr_lsb > 0) ? (1 << addr_lsb) : TARGET_PAGE_SIZE;
 
     /* If we get an action required MCE, it has been injected by KVM
      * while the VM was running.  An action optional MCE instead should
@@ -753,7 +754,10 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
         ram_addr = qemu_ram_addr_from_host(addr);
         if (ram_addr != RAM_ADDR_INVALID &&
             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
-            kvm_hwpoison_page_add(ram_addr);
+            if (sz == TARGET_PAGE_SIZE) {
+                sz = qemu_ram_pagesize_from_host(addr);
+            }
+            kvm_hwpoison_page_add(ram_addr, sz);
             kvm_mce_inject(cpu, paddr, code);
 
             /*
-- 
2.43.5
Re: [PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by David Hildenbrand 1 month ago
On 22.10.24 23:35, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> Add the page size information to the hwpoison_page_list elements.
> As the kernel doesn't always report the actual poisoned page size,
> we adjust this size from the backend real page size.
> We take into account the recorded page size to adjust the size
> and location of the memory hole.
> 
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
>   accel/kvm/kvm-all.c       | 14 ++++++++++----
>   include/exec/cpu-common.h |  1 +
>   include/sysemu/kvm.h      |  3 ++-
>   include/sysemu/kvm_int.h  |  3 ++-
>   system/physmem.c          | 20 ++++++++++++++++++++
>   target/arm/kvm.c          |  8 ++++++--
>   target/i386/kvm/kvm.c     |  8 ++++++--
>   7 files changed, 47 insertions(+), 10 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 2adc4d9c24..40117eefa7 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned int extension)
>    */
>   typedef struct HWPoisonPage {
>       ram_addr_t ram_addr;
> +    size_t     page_size;
>       QLIST_ENTRY(HWPoisonPage) list;
>   } HWPoisonPage;
>   
> @@ -1278,15 +1279,18 @@ static void kvm_unpoison_all(void *param)
>   
>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>           QLIST_REMOVE(page, list);
> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
> +        qemu_ram_remap(page->ram_addr, page->page_size);

Can't we just use the page size from the RAMBlock in qemu_ram_remap? 
There we lookup the RAMBlock, and all pages in a RAMBlock have the same 
size.

I'll note that qemu_ram_remap() is rather stupid and optimized only for 
private memory (not shmem etc).

mmap(MAP_FIXED|MAP_SHARED, fd) will give you the same poisoned page from 
the pagecache; you'd have to punch a hole instead.

It might be better to use ram_block_discard_range() in the long run. 
Memory preallocation + page pinning is tricky, but we could simply bail 
out in these cases (preallocation failing, ram discard being disabled).

qemu_ram_remap() might be problematic with page pinning (vfio) as is in 
any way :(

-- 
Cheers,

David / dhildenb


Re: [PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by William Roche 4 weeks ago
On 10/23/24 09:28, David Hildenbrand wrote:
> On 22.10.24 23:35, William Roche wrote:
>> From: William Roche <william.roche@oracle.com>
>>
>> Add the page size information to the hwpoison_page_list elements.
>> As the kernel doesn't always report the actual poisoned page size,
>> we adjust this size from the backend real page size.
>> We take into account the recorded page size to adjust the size
>> and location of the memory hole.
>>
>> Signed-off-by: William Roche <william.roche@oracle.com>
>> ---
>>   accel/kvm/kvm-all.c       | 14 ++++++++++----
>>   include/exec/cpu-common.h |  1 +
>>   include/sysemu/kvm.h      |  3 ++-
>>   include/sysemu/kvm_int.h  |  3 ++-
>>   system/physmem.c          | 20 ++++++++++++++++++++
>>   target/arm/kvm.c          |  8 ++++++--
>>   target/i386/kvm/kvm.c     |  8 ++++++--
>>   7 files changed, 47 insertions(+), 10 deletions(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index 2adc4d9c24..40117eefa7 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, unsigned 
>> int extension)
>>    */
>>   typedef struct HWPoisonPage {
>>       ram_addr_t ram_addr;
>> +    size_t     page_size;
>>       QLIST_ENTRY(HWPoisonPage) list;
>>   } HWPoisonPage;
>> @@ -1278,15 +1279,18 @@ static void kvm_unpoison_all(void *param)
>>       QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>>           QLIST_REMOVE(page, list);
>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>> +        qemu_ram_remap(page->ram_addr, page->page_size);
> 
> Can't we just use the page size from the RAMBlock in qemu_ram_remap? 
> There we lookup the RAMBlock, and all pages in a RAMBlock have the same 
> size.

Yes, we could use the page size from the RAMBlock in qemu_ram_remap() 
that is called when the VM is resetting. I think that knowing the 
information about the size of poisoned chunk of memory when the poison 
is created is useful to give a trace of what is going on, before seeing 
maybe other pages being reported as poisoned. That's the 4th patch goal 
to give an information as soon as we get it.
It also helps to filter the new errors reported and only create an entry 
in the hwpoison_page_list for new large pages.

Now we could delay the page size retrieval until we are resetting and 
present the information (post mortem). I do think that having the 
information earlier is better in this case.



> 
> I'll note that qemu_ram_remap() is rather stupid and optimized only for 
> private memory (not shmem etc).
> 
> mmap(MAP_FIXED|MAP_SHARED, fd) will give you the same poisoned page from 
> the pagecache; you'd have to punch a hole instead.
> 
> It might be better to use ram_block_discard_range() in the long run. 
> Memory preallocation + page pinning is tricky, but we could simply bail 
> out in these cases (preallocation failing, ram discard being disabled).

I see that ram_block_discard_range() adds more control before discarding 
the RAM region and can also call madvise() in addition to the fallocate 
punch hole for standard sized memory pages. Now as the range is supposed 
to be recreated, I'm not convinced that these madvise calls are necessary.

But we can also notice that this function will report the following 
warning in all cases of not shared file backends:
"ram_block_discard_range: Discarding RAM in private file mappings is 
possibly dangerous, because it will modify the underlying file and will 
affect other users of the file"
Which means that hugetlbfs configurations do see this new cryptic 
warning message on reboot if it is impacted by a memory poisoning.
So I would prefer to leave the fallocate call in the qemu_ram_remap() 
function. Or would you prefer to enhance ram_block_discard_range() code 
to avoid the message in a reset situation (when called from 
qemu_ram_remap) ?


> 
> qemu_ram_remap() might be problematic with page pinning (vfio) as is in 
> any way :(
> 

I agree. If qemu_ram_remap() fails, Qemu is ended either abort() or 
exit(1). Do you say that memory pinning could be detected by 
ram_block_discard_range() or maybe mmap call for the impacted region and 
make one of them fail ? This would be an additional reason to call 
ram_block_discard_range() from qemu_ram_remap(). Is it what you are 
suggesting ?

Re: [PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by William Roche 4 weeks ago
On 10/23/24 09:28, David Hildenbrand wrote:

> On 22.10.24 23:35, William Roche wrote:
>> From: William Roche <william.roche@oracle.com>
>>
>> Add the page size information to the hwpoison_page_list elements.
>> As the kernel doesn't always report the actual poisoned page size,
>> we adjust this size from the backend real page size.
>> We take into account the recorded page size to adjust the size
>> and location of the memory hole.
>>
>> Signed-off-by: William Roche <william.roche@oracle.com>
>> ---
>>   accel/kvm/kvm-all.c       | 14 ++++++++++----
>>   include/exec/cpu-common.h |  1 +
>>   include/sysemu/kvm.h      |  3 ++-
>>   include/sysemu/kvm_int.h  |  3 ++-
>>   system/physmem.c          | 20 ++++++++++++++++++++
>>   target/arm/kvm.c          |  8 ++++++--
>>   target/i386/kvm/kvm.c     |  8 ++++++--
>>   7 files changed, 47 insertions(+), 10 deletions(-)
>>
>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>> index 2adc4d9c24..40117eefa7 100644
>> --- a/accel/kvm/kvm-all.c
>> +++ b/accel/kvm/kvm-all.c
>> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, 
>> unsigned int extension)
>>    */
>>   typedef struct HWPoisonPage {
>>       ram_addr_t ram_addr;
>> +    size_t     page_size;
>>       QLIST_ENTRY(HWPoisonPage) list;
>>   } HWPoisonPage;
>>   @@ -1278,15 +1279,18 @@ static void kvm_unpoison_all(void *param)
>>         QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>>           QLIST_REMOVE(page, list);
>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>> +        qemu_ram_remap(page->ram_addr, page->page_size);
>
> Can't we just use the page size from the RAMBlock in qemu_ram_remap? 
> There we lookup the RAMBlock, and all pages in a RAMBlock have the 
> same size.


Yes, we could use the page size from the RAMBlock in qemu_ram_remap() 
that is called when the VM is resetting. I think that knowing the 
information about the size of poisoned chunk of memory when the poison 
is created is useful to give a trace of what is going on, before seeing 
maybe other pages being reported as poisoned. That's the 4th patch goal 
to give an information as soon as we get it.
It also helps to filter the new errors reported and only create an entry 
in the hwpoison_page_list for new large pages.
Now we could delay the page size retrieval until we are resetting and 
present the information (post mortem). I do think that having the 
information earlier is better in this case.


>
> I'll note that qemu_ram_remap() is rather stupid and optimized only 
> for private memory (not shmem etc).
>
> mmap(MAP_FIXED|MAP_SHARED, fd) will give you the same poisoned page 
> from the pagecache; you'd have to punch a hole instead.
>
> It might be better to use ram_block_discard_range() in the long run. 
> Memory preallocation + page pinning is tricky, but we could simply 
> bail out in these cases (preallocation failing, ram discard being 
> disabled).


I see that ram_block_discard_range() adds more control before discarding 
the RAM region and can also call madvise() in addition to the fallocate 
punch hole for standard sized memory pages. Now as the range is supposed 
to be recreated, I'm not convinced that these madvise calls are necessary.

But we can also notice that this function will report the following 
warning in all cases of not shared file backends:
"ram_block_discard_range: Discarding RAM in private file mappings is 
possibly dangerous, because it will modify the underlying file and will 
affect other users of the file"
Which means that hugetlbfs configurations do see this new cryptic 
warning message on reboot if it is impacted by a memory poisoning.
So I would prefer to leave the fallocate call in the qemu_ram_remap() 
function. Or would you prefer to enhance ram_block_discard_range() code 
to avoid the message in a reset situation (when called from qemu_ram_remap)?


>
> qemu_ram_remap() might be problematic with page pinning (vfio) as is 
> in any way :(

I agree. If qemu_ram_remap() fails, Qemu is ended either abort() or 
exit(1). Do you say that memory pinning could be detected by 
ram_block_discard_range() or maybe mmap call for the impacted region and 
make one of them fail ? This would be an additional reason to call 
ram_block_discard_range() from qemu_ram_remap(). Is it what you are 
suggesting ?
Re: [PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by David Hildenbrand 3 weeks, 5 days ago
On 26.10.24 01:27, William Roche wrote:
> On 10/23/24 09:28, David Hildenbrand wrote:
> 
>> On 22.10.24 23:35, William Roche wrote:
>>> From: William Roche <william.roche@oracle.com>
>>>
>>> Add the page size information to the hwpoison_page_list elements.
>>> As the kernel doesn't always report the actual poisoned page size,
>>> we adjust this size from the backend real page size.
>>> We take into account the recorded page size to adjust the size
>>> and location of the memory hole.
>>>
>>> Signed-off-by: William Roche <william.roche@oracle.com>
>>> ---
>>>   accel/kvm/kvm-all.c       | 14 ++++++++++----
>>>   include/exec/cpu-common.h |  1 +
>>>   include/sysemu/kvm.h      |  3 ++-
>>>   include/sysemu/kvm_int.h  |  3 ++-
>>>   system/physmem.c          | 20 ++++++++++++++++++++
>>>   target/arm/kvm.c          |  8 ++++++--
>>>   target/i386/kvm/kvm.c     |  8 ++++++--
>>>   7 files changed, 47 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>> index 2adc4d9c24..40117eefa7 100644
>>> --- a/accel/kvm/kvm-all.c
>>> +++ b/accel/kvm/kvm-all.c
>>> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, 
>>> unsigned int extension)
>>>    */
>>>   typedef struct HWPoisonPage {
>>>       ram_addr_t ram_addr;
>>> +    size_t     page_size;
>>>       QLIST_ENTRY(HWPoisonPage) list;
>>>   } HWPoisonPage;
>>>   @@ -1278,15 +1279,18 @@ static void kvm_unpoison_all(void *param)
>>>         QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
>>>           QLIST_REMOVE(page, list);
>>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>>> +        qemu_ram_remap(page->ram_addr, page->page_size);
>>
>> Can't we just use the page size from the RAMBlock in qemu_ram_remap? 
>> There we lookup the RAMBlock, and all pages in a RAMBlock have the 
>> same size.
> 
> 
> Yes, we could use the page size from the RAMBlock in qemu_ram_remap() 
> that is called when the VM is resetting. I think that knowing the 
> information about the size of poisoned chunk of memory when the poison 
> is created is useful to give a trace of what is going on, before seeing 
> maybe other pages being reported as poisoned. That's the 4th patch goal 
> to give an information as soon as we get it.
> It also helps to filter the new errors reported and only create an entry 
> in the hwpoison_page_list for new large pages.
> Now we could delay the page size retrieval until we are resetting and 
> present the information (post mortem). I do think that having the 
> information earlier is better in this case.

If it is not required for this patch, then please move the other stuff 
to patch #4.

Here, we really only have to discard a large page, which we can derive 
from the QEMU RAMBlock page size.

> 
> 
>>
>> I'll note that qemu_ram_remap() is rather stupid and optimized only 
>> for private memory (not shmem etc).
>>
>> mmap(MAP_FIXED|MAP_SHARED, fd) will give you the same poisoned page 
>> from the pagecache; you'd have to punch a hole instead.
>>
>> It might be better to use ram_block_discard_range() in the long run. 
>> Memory preallocation + page pinning is tricky, but we could simply 
>> bail out in these cases (preallocation failing, ram discard being 
>> disabled).
> 
> 
> I see that ram_block_discard_range() adds more control before discarding 
> the RAM region and can also call madvise() in addition to the fallocate 
> punch hole for standard sized memory pages. Now as the range is supposed 
> to be recreated, I'm not convinced that these madvise calls are necessary.

They are the proper replacement for the mmap(MAP_FIXED) + fallocate.

That function handles all cases of properly discarding guest RAM.

> 
> But we can also notice that this function will report the following 
> warning in all cases of not shared file backends:
> "ram_block_discard_range: Discarding RAM in private file mappings is 
> possibly dangerous, because it will modify the underlying file and will 
> affect other users of the file"

Yes, because it's a clear warning sign that something weird is 
happening. You might be throwing away data that some other process might 
be relying on.

How are you making QEMU consume hugetlbs?

We could suppress these warnings, but let's first see how you are able 
to trigger it.

> Which means that hugetlbfs configurations do see this new cryptic 
> warning message on reboot if it is impacted by a memory poisoning.
> So I would prefer to leave the fallocate call in the qemu_ram_remap() 
> function. Or would you prefer to enhance ram_block_discard_range() code 
> to avoid the message in a reset situation (when called from qemu_ram_remap)?

Please try reusing the mechanism to discard guest RAM instead of 
open-coding this. We still have to use mmap(MAP_FIXED) as a backup, but 
otherwise this function should mostly do+check what you need.

(-warnings we might want to report differently / suppress)

If you want, I can start a quick prototype of what it could look like 
when using ram_block_discard_range() + ram_block_discard_is_disabled() + 
fallback to existing mmap(MAP_FIXED).

> 
> 
>>
>> qemu_ram_remap() might be problematic with page pinning (vfio) as is 
>> in any way :(
> 
> I agree. If qemu_ram_remap() fails, Qemu is ended either abort() or 
> exit(1). Do you say that memory pinning could be detected by 
> ram_block_discard_range() or maybe mmap call for the impacted region and 
> make one of them fail ? This would be an additional reason to call 
> ram_block_discard_range() from qemu_ram_remap().   Is it what you are 
> suggesting ?

ram_block_discard_is_disabled() might be the right test. If discarding 
is disabled, then rebooting might create an inconsistency with 
e.g.,vfio, resulting in the issues we know from memory ballooning where 
the state vfio sees will be different from the state the guest kernel 
sees. It's tricky ... and we much rather quit the VM early instead of 
corrupting data later :/

-- 
Cheers,

David / dhildenb


Re: [PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by William Roche 3 weeks, 3 days ago
On 10/28/24 17:42, David Hildenbrand wrote:
> On 26.10.24 01:27, William Roche wrote:
>> On 10/23/24 09:28, David Hildenbrand wrote:
>>
>>> On 22.10.24 23:35, William Roche wrote:
>>>> From: William Roche <william.roche@oracle.com>
>>>>
>>>> Add the page size information to the hwpoison_page_list elements.
>>>> As the kernel doesn't always report the actual poisoned page size,
>>>> we adjust this size from the backend real page size.
>>>> We take into account the recorded page size to adjust the size
>>>> and location of the memory hole.
>>>>
>>>> Signed-off-by: William Roche <william.roche@oracle.com>
>>>> ---
>>>>   accel/kvm/kvm-all.c       | 14 ++++++++++----
>>>>   include/exec/cpu-common.h |  1 +
>>>>   include/sysemu/kvm.h      |  3 ++-
>>>>   include/sysemu/kvm_int.h  |  3 ++-
>>>>   system/physmem.c          | 20 ++++++++++++++++++++
>>>>   target/arm/kvm.c          |  8 ++++++--
>>>>   target/i386/kvm/kvm.c     |  8 ++++++--
>>>>   7 files changed, 47 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
>>>> index 2adc4d9c24..40117eefa7 100644
>>>> --- a/accel/kvm/kvm-all.c
>>>> +++ b/accel/kvm/kvm-all.c
>>>> @@ -1266,6 +1266,7 @@ int kvm_vm_check_extension(KVMState *s, 
>>>> unsigned int extension)
>>>>    */
>>>>   typedef struct HWPoisonPage {
>>>>       ram_addr_t ram_addr;
>>>> +    size_t     page_size;
>>>>       QLIST_ENTRY(HWPoisonPage) list;
>>>>   } HWPoisonPage;
>>>>   @@ -1278,15 +1279,18 @@ static void kvm_unpoison_all(void *param)
>>>>         QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, 
>>>> next_page) {
>>>>           QLIST_REMOVE(page, list);
>>>> -        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
>>>> +        qemu_ram_remap(page->ram_addr, page->page_size);
>>>
>>> Can't we just use the page size from the RAMBlock in qemu_ram_remap? 
>>> There we lookup the RAMBlock, and all pages in a RAMBlock have the 
>>> same size.
>>
>>
>> Yes, we could use the page size from the RAMBlock in qemu_ram_remap() 
>> that is called when the VM is resetting. I think that knowing the 
>> information about the size of poisoned chunk of memory when the poison 
>> is created is useful to give a trace of what is going on, before 
>> seeing maybe other pages being reported as poisoned. That's the 4th 
>> patch goal to give an information as soon as we get it.
>> It also helps to filter the new errors reported and only create an 
>> entry in the hwpoison_page_list for new large pages.
>> Now we could delay the page size retrieval until we are resetting and 
>> present the information (post mortem). I do think that having the 
>> information earlier is better in this case.
> 
> If it is not required for this patch, then please move the other stuff 
> to patch #4.
> 
> Here, we really only have to discard a large page, which we can derive 
> from the QEMU RAMBlock page size.


Ok, I can remove the first patch that is created to track the kernel 
provided page size and pass it to the kvm_hwpoison_page_add() function, 
but we could deal with the page size at the kvm_hwpoison_page_add() 
function level as we don't rely on the kernel provided info, but just 
the RAMBlock page size.

I'll send a new version with this modification.


>>
>>
>>>
>>> I'll note that qemu_ram_remap() is rather stupid and optimized only 
>>> for private memory (not shmem etc).
>>>
>>> mmap(MAP_FIXED|MAP_SHARED, fd) will give you the same poisoned page 
>>> from the pagecache; you'd have to punch a hole instead.
>>>
>>> It might be better to use ram_block_discard_range() in the long run. 
>>> Memory preallocation + page pinning is tricky, but we could simply 
>>> bail out in these cases (preallocation failing, ram discard being 
>>> disabled).
>>
>>
>> I see that ram_block_discard_range() adds more control before 
>> discarding the RAM region and can also call madvise() in addition to 
>> the fallocate punch hole for standard sized memory pages. Now as the 
>> range is supposed to be recreated, I'm not convinced that these 
>> madvise calls are necessary.
> 
> They are the proper replacement for the mmap(MAP_FIXED) + fallocate.
> 
> That function handles all cases of properly discarding guest RAM.

In the case of hugetlbfs pages, ram_block_discard_range() does the 
punch-hole fallocate call (and prints out the warning messages).
The madvise call is only done when (rb->page_size == 
qemu_real_host_page_size()) which isn't true for hugetlbfs.
So need_madvise is false and neither QEMU_MADV_REMOVE nor 
QEMU_MADV_DONTNEED madvise call is performed.


> 
>>
>> But we can also notice that this function will report the following 
>> warning in all cases of not shared file backends:
>> "ram_block_discard_range: Discarding RAM in private file mappings is 
>> possibly dangerous, because it will modify the underlying file and 
>> will affect other users of the file"
> 
> Yes, because it's a clear warning sign that something weird is 
> happening. You might be throwing away data that some other process might 
> be relying on.
> 
> How are you making QEMU consume hugetlbs?

A classical way to consume (not shared) hugetlbfs pages is done with the 
creation of a file that is opened, mmapped by the Qemu instance but we 
also delete the file system entry so that if the Qemu instance dies, the 
resources are released. This file is usually not shared.


> 
> We could suppress these warnings, but let's first see how you are able 
> to trigger it.

The warning is always displayed when such a hugetlbfs VM impacted by a 
memory error is rebooted.
I understand the reason why we have this message, but in the case of 
hugetlbfs classical use this (new) message on reboot is probably too 
worrying...  But losing memory is already very worrying ;)


> 
>> Which means that hugetlbfs configurations do see this new cryptic 
>> warning message on reboot if it is impacted by a memory poisoning.
>> So I would prefer to leave the fallocate call in the qemu_ram_remap() 
>> function. Or would you prefer to enhance ram_block_discard_range() code 
>> to avoid the message in a reset situation (when called from 
>> qemu_ram_remap)?
> 
> Please try reusing the mechanism to discard guest RAM instead of open- 
> coding this. We still have to use mmap(MAP_FIXED) as a backup, but 
> otherwise this function should mostly do+check what you need.
> 
> (-warnings we might want to report differently / suppress)
> 
> If you want, I can start a quick prototype of what it could look like 
> when using ram_block_discard_range() + ram_block_discard_is_disabled() + 
> fallback to existing mmap(MAP_FIXED).

I just want to notice that the reason why need_madvise was used was 
because "DONTNEED fails for hugepages but fallocate works on hugepages 
and shmem". In fact, MADV_REMOVE support on hugetlbfs only appeared in 
kernel v4.3 and MADV_DONTNEED support only appeared in kernel v5.18

Our Qemu code avoids calling these madvise for hugepages, as we need to 
have:
(rb->page_size == qemu_real_host_page_size())

That's a reason why we have to remap the "hole-punched" section of the 
file when using hugepages.

>>
>>
>>>
>>> qemu_ram_remap() might be problematic with page pinning (vfio) as is 
>>> in any way :(
>>
>> I agree. If qemu_ram_remap() fails, Qemu is ended either abort() or 
>> exit(1). Do you say that memory pinning could be detected by 
>> ram_block_discard_range() or maybe mmap call for the impacted region 
>> and make one of them fail ? This would be an additional reason to call 
>> ram_block_discard_range() from qemu_ram_remap(). Is that what you are 
>> suggesting?
> 
> ram_block_discard_is_disabled() might be the right test. If discarding 
> is disabled, then rebooting might create an inconsistency with 
> e.g.,vfio, resulting in the issues we know from memory ballooning where 
> the state vfio sees will be different from the state the guest kernel 
> sees. It's tricky ... and we much rather quit the VM early instead of 
> corrupting data later :/


Alright. We can verify if ram_block_discard_is_disabled() is true and we 
exit Qemu in this case with a message instead of trying to recreate the 
memory area (in the other case).



Re: [PATCH v1 2/4] accel/kvm: Keep track of the HWPoisonPage page_size
Posted by David Hildenbrand 2 weeks, 5 days ago
> 
> Ok, I can remove the first patch that is created to track the kernel
> provided page size and pass it to the kvm_hwpoison_page_add() function,
> but we could deal with the page size at the kvm_hwpoison_page_add()
> function level as we don't rely on the kernel provided info, but just
> the RAMBlock page size.

Great!

>>> I see that ram_block_discard_range() adds more control before
>>> discarding the RAM region and can also call madvise() in addition to
>>> the fallocate punch hole for standard sized memory pages. Now as the
>>> range is supposed to be recreated, I'm not convinced that these
>>> madvise calls are necessary.
>>
>> They are the proper replacement for the mmap(MAP_FIXED) + fallocate.
>>
>> That function handles all cases of properly discarding guest RAM.
> 
> In the case of hugetlbfs pages, ram_block_discard_range() does the
> punch-hole fallocate call (and prints out the warning messages).
> The madvise call is only done when (rb->page_size ==
> qemu_real_host_page_size()) which isn't true for hugetlbfs.
> So need_madvise is false and neither QEMU_MADV_REMOVE nor
> QEMU_MADV_DONTNEED madvise call is performed.

See my other mail regarding fallocte()+hugetlb oddities.

The warning is for MAP_PRIVATE mappings where we cannot be sure that we are
not doing harm to somebody else that is mapping the file :(

See

commit 1d44ff586f8a8e113379430750b5a0a2a3f64cf9
Author: David Hildenbrand <david@redhat.com>
Date:   Thu Jul 6 09:56:06 2023 +0200

     softmmu/physmem: Warn with ram_block_discard_range() on MAP_PRIVATE file mapping
     
     ram_block_discard_range() cannot possibly do the right thing in
     MAP_PRIVATE file mappings in the general case.
     
     To achieve the documented semantics, we also have to punch a hole into
     the file, possibly messing with other MAP_PRIVATE/MAP_SHARED mappings
     of such a file.
     
     For example, using VM templating -- see commit b17fbbe55cba ("migration:
     allow private destination ram with x-ignore-shared") -- in combination with
     any mechanism that relies on discarding of RAM is problematic. This
     includes:
     * Postcopy live migration
     * virtio-balloon inflation/deflation or free-page-reporting
     * virtio-mem
     
     So at least warn that there is something possibly dangerous is going on
     when using ram_block_discard_range() in these cases.

So the warning is the best we can do to say "this is possibly very
problematic, and it might be undesirable".

For hugetlb, users should switch to using memory-backend-memfd or
memory-backend-file,share=on.

> 
> 
>>
>>>
>>> But we can also notice that this function will report the following
>>> warning in all cases of not shared file backends:
>>> "ram_block_discard_range: Discarding RAM in private file mappings is
>>> possibly dangerous, because it will modify the underlying file and
>>> will affect other users of the file"
>>
>> Yes, because it's a clear warning sign that something weird is
>> happening. You might be throwing away data that some other process might
>> be relying on.
>>
>> How are you making QEMU consume hugetlbs?
> 
> A classical way to consume (not shared) hugetlbfs pages is done with the
> creation of a file that is opened, mmapped by the Qemu instance but we
> also delete the file system entry so that if the Qemu instance dies, the
> resources are released. This file is usually not shared.

Right, see above. We should be using memory-backend-file,share=on with that,
just like we would with shmem/tmpfs :(

The ugly bit is that the legacy "-mem-path" option translates to
"memory-backend-file,share=off", and we cannot easily change that.

That option really should not be used anymore.

> 
> 
>>
>> We could suppress these warnings, but let's first see how you are able
>> to trigger it.
> 
> The warning is always displayed when such a hugetlbfs VM impacted by a
> memory error is rebooted.
> I understand the reason why we have this message, but in the case of
> hugetlbfs classical use this (new) message on reboot is probably too
> worrying...  But losing memory is already very worrying ;)

See above; we cannot easily identify "we map this file MAP_PRIVATE
but we are guaranteed to be the single user", so punching a hole in that
file might just corrupt data for another user (e.g., VM templating) without
any warning.

Again, we could suppress the warning, but not using MAP_PRIVATE with
a hugetlb file would be even better.

(hugetlb contains other hacks that make sure that MAP_PRIVATE on a file
won't result in a double memory consumption -- with shmem/tmpfs it would
result in a double memory consumption!)

Are the users you are aware of using "-mem-path" or "-object memory-backend-file"?

We might be able to change the default for the latter with a new QEMU version,
maybe ...


-- 
Cheers,

David / dhildenb