[PATCH v4 2/7] system/physmem: poisoned memory discard on reboot

William Roche posted 7 patches 3 months, 3 weeks ago
There is a newer version of this series
[PATCH v4 2/7] system/physmem: poisoned memory discard on reboot
Posted by William Roche 3 months, 3 weeks ago
From: William Roche <william.roche@oracle.com>

Repair poisoned memory location(s), calling ram_block_discard_range():
punching a hole in the backend file when necessary and regenerating
a usable memory.
If the kernel doesn't support the madvise calls used by this function
and we are dealing with anonymous memory, fall back to remapping the
location(s).

Signed-off-by: William Roche <william.roche@oracle.com>
---
 system/physmem.c | 63 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 21 deletions(-)

diff --git a/system/physmem.c b/system/physmem.c
index 2c90cc2d78..b228a692f8 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -2180,13 +2180,37 @@ void qemu_ram_free(RAMBlock *block)
 }
 
 #ifndef _WIN32
+/* Try to simply remap the given location */
+static void qemu_ram_remap_mmap(RAMBlock *block, void* vaddr, size_t size,
+                                ram_addr_t offset)
+{
+    int flags, prot;
+    void *area;
+
+    flags = MAP_FIXED;
+    flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
+    flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
+    prot = PROT_READ;
+    prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
+    if (block->fd >= 0) {
+        area = mmap(vaddr, size, prot, flags, block->fd,
+                    offset + block->fd_offset);
+    } else {
+        flags |= MAP_ANONYMOUS;
+        area = mmap(vaddr, size, prot, flags, -1, 0);
+    }
+    if (area != vaddr) {
+        error_report("Could not remap addr: " RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
+                     size, block->offset + offset);
+        exit(1);
+    }
+}
+
 void qemu_ram_remap(ram_addr_t addr)
 {
     RAMBlock *block;
     ram_addr_t offset;
-    int flags;
-    void *area, *vaddr;
-    int prot;
+    void *vaddr;
     size_t page_size;
 
     RAMBLOCK_FOREACH(block) {
@@ -2202,24 +2226,21 @@ void qemu_ram_remap(ram_addr_t addr)
             } else if (xen_enabled()) {
                 abort();
             } else {
-                flags = MAP_FIXED;
-                flags |= block->flags & RAM_SHARED ?
-                         MAP_SHARED : MAP_PRIVATE;
-                flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
-                prot = PROT_READ;
-                prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
-                if (block->fd >= 0) {
-                    area = mmap(vaddr, page_size, prot, flags, block->fd,
-                                offset + block->fd_offset);
-                } else {
-                    flags |= MAP_ANONYMOUS;
-                    area = mmap(vaddr, page_size, prot, flags, -1, 0);
-                }
-                if (area != vaddr) {
-                    error_report("Could not remap addr: "
-                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
-                                 page_size, addr);
-                    exit(1);
+                if (ram_block_discard_range(block, offset + block->fd_offset,
+                                            page_size) != 0) {
+                    /*
+                     * Fold back to using mmap() only for anonymous mapping,
+                     * as if a backing file is associated we may not be able
+                     * to recover the memory in all cases.
+                     * So don't take the risk of using only mmap and fail now.
+                     */
+                    if (block->fd >= 0) {
+                        error_report("Memory poison recovery failure addr: "
+                                     RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
+                                     page_size, addr);
+                        exit(1);
+                    }
+                    qemu_ram_remap_mmap(block, vaddr, page_size, offset);
                 }
                 memory_try_enable_merging(vaddr, page_size);
                 qemu_ram_setup_dump(vaddr, page_size);
-- 
2.43.5
Re: [PATCH v4 2/7] system/physmem: poisoned memory discard on reboot
Posted by David Hildenbrand 3 months ago
On 14.12.24 14:45, William Roche wrote:
> From: William Roche <william.roche@oracle.com>
> 
> Repair poisoned memory location(s), calling ram_block_discard_range():
> punching a hole in the backend file when necessary and regenerating
> a usable memory.
> If the kernel doesn't support the madvise calls used by this function
> and we are dealing with anonymous memory, fall back to remapping the
> location(s).
> 
> Signed-off-by: William Roche <william.roche@oracle.com>
> ---
>   system/physmem.c | 63 ++++++++++++++++++++++++++++++++----------------
>   1 file changed, 42 insertions(+), 21 deletions(-)
> 
> diff --git a/system/physmem.c b/system/physmem.c
> index 2c90cc2d78..b228a692f8 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -2180,13 +2180,37 @@ void qemu_ram_free(RAMBlock *block)
>   }
>   
>   #ifndef _WIN32
> +/* Try to simply remap the given location */
> +static void qemu_ram_remap_mmap(RAMBlock *block, void* vaddr, size_t size,
> +                                ram_addr_t offset)

Can you make the parameters match the ones of ram_block_discard_range() 
so the invocation gets easier to read? You can calculate vaddr easily 
internally.

Something like

static void qemu_ram_remap_mmap(RAMBlock *rb, uint64_t start,
				size_t length)

> +{
> +    int flags, prot;
> +    void *area;
> +
> +    flags = MAP_FIXED;
> +    flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
> +    flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
> +    prot = PROT_READ;
> +    prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
> +    if (block->fd >= 0) {

Heh, that case can no longer happen!

assert(block->fd < 0);

?

> +        area = mmap(vaddr, size, prot, flags, block->fd,
> +                    offset + block->fd_offset);
> +    } else {
> +        flags |= MAP_ANONYMOUS;
> +        area = mmap(vaddr, size, prot, flags, -1, 0);
> +    }
> +    if (area != vaddr) {
> +        error_report("Could not remap addr: " RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
> +                     size, block->offset + offset);
> +        exit(1);
> +    }
> +}
> +
>   void qemu_ram_remap(ram_addr_t addr)
>   {
>       RAMBlock *block;
>       ram_addr_t offset;
> -    int flags;
> -    void *area, *vaddr;
> -    int prot;
> +    void *vaddr;
>       size_t page_size;
>   
>       RAMBLOCK_FOREACH(block) {
> @@ -2202,24 +2226,21 @@ void qemu_ram_remap(ram_addr_t addr)
>               } else if (xen_enabled()) {
>                   abort();
>               } else {
> -                flags = MAP_FIXED;
> -                flags |= block->flags & RAM_SHARED ?
> -                         MAP_SHARED : MAP_PRIVATE;
> -                flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
> -                prot = PROT_READ;
> -                prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
> -                if (block->fd >= 0) {
> -                    area = mmap(vaddr, page_size, prot, flags, block->fd,
> -                                offset + block->fd_offset);
> -                } else {
> -                    flags |= MAP_ANONYMOUS;
> -                    area = mmap(vaddr, page_size, prot, flags, -1, 0);
> -                }
> -                if (area != vaddr) {
> -                    error_report("Could not remap addr: "
> -                                 RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
> -                                 page_size, addr);
> -                    exit(1);
> +                if (ram_block_discard_range(block, offset + block->fd_offset,
> +                                            page_size) != 0) {
> +                    /*
> +                     * Fold back to using mmap() only for anonymous mapping,

s/Fold/Fall/

> +                     * as if a backing file is associated we may not be able
> +                     * to recover the memory in all cases.
> +                     * So don't take the risk of using only mmap and fail now.
> +                     */
> +                    if (block->fd >= 0) {
> +                        error_report("Memory poison recovery failure addr: "
> +                                     RAM_ADDR_FMT "@" RAM_ADDR_FMT "",
> +                                     page_size, addr);
> +                        exit(1);
> +                    }
> +                    qemu_ram_remap_mmap(block, vaddr, page_size, offset);
>                   }
>                   memory_try_enable_merging(vaddr, page_size);
>                   qemu_ram_setup_dump(vaddr, page_size);

These two can be moved into qemu_ram_remap_mmap(). They are not required 
if we didn't actually mess with mmap().

-- 
Cheers,

David / dhildenb


Re: [PATCH v4 2/7] system/physmem: poisoned memory discard on reboot
Posted by William Roche 2 months, 4 weeks ago
On 1/8/25 22:44, David Hildenbrand wrote:
> On 14.12.24 14:45, William Roche wrote:
>> +/* Try to simply remap the given location */
>> +static void qemu_ram_remap_mmap(RAMBlock *block, void* vaddr, size_t 
>> size,
>> +                                ram_addr_t offset)
> 
> Can you make the parameters match the ones of ram_block_discard_range() 
> so the invocation gets easier to read? You can calculate vaddr easily 
> internally.
> 
> Something like
> 
> static void qemu_ram_remap_mmap(RAMBlock *rb, uint64_t start,
>                  size_t length)

I used the same arguments as ram_block_discard_range() as you asked.

> 
>> +{
>> +    int flags, prot;
>> +    void *area;
>> +
>> +    flags = MAP_FIXED;
>> +    flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
>> +    flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
>> +    prot = PROT_READ;
>> +    prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
>> +    if (block->fd >= 0) {
> 
> Heh, that case can no longer happen!

I also removed the use case of remapping a file in the
qemu_ram_remap_mmap() function.

> 
> assert(block->fd < 0);

And added the assert() you suggested.


>> +                if (ram_block_discard_range(block, offset + block->fd_offset,
>> +                                            page_size) != 0) {


After studying the arguments used by ram_block_discard_range() and
the need to fallocate/punch the underlying file some more, I think that
I should simply provide the 'offset' here, and that block->fd_offset is
missing in the ram_block_discard_range() function where we have to punch
a hole in the file. Don't you agree?

If we can get the current set of fixes integrated, I'll submit another
fix proposal afterwards to take the fd_offset into account (not
enlarging the current set).

But here is what I'm thinking about, which we can discuss later if you want:

@@ -3730,11 +3724,12 @@ int ram_block_discard_range(RAMBlock *rb, 
uint64_t start, size_t length)
              }

              ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | 
FALLOC_FL_KEEP_SIZE,
-                            start, length);
+                            start + rb->fd_offset, length);
              if (ret) {
                  ret = -errno;
                  error_report("%s: Failed to fallocate %s:%" PRIx64 " 
+%zx (%d)",
-                             __func__, rb->idstr, start, length, ret);
+                             __func__, rb->idstr, start + rb->fd_offset,
+                            length, ret);
                  goto err;
              }


Or I can integrate that as an additional patch if you prefer.




>> +                    /*
>> +                     * Fold back to using mmap() only for anonymous mapping,
> 
> s/Fold/Fall/

typo fixed



>>                   }
>>                   memory_try_enable_merging(vaddr, page_size);
>>                   qemu_ram_setup_dump(vaddr, page_size);
> 
> These two can be moved into qemu_ram_remap_mmap(). They are not required 
> if we didn't actually mess with mmap().


These functions will be replaced by the ram_block_notify_remap() of
patch 7, which is called whether or not ram_block_discard_range()
succeeded.
So we should leave these 2 function calls here for now, as they mimic an
aspect of what the notifier code will do.


Re: [PATCH v4 2/7] system/physmem: poisoned memory discard on reboot
Posted by David Hildenbrand 2 months, 3 weeks ago
> If we can get the current set of fixes integrated, I'll submit another
> fix proposal to take the fd_offset into account in a second time. (Not
> enlarging the current set)
> 
> But here is what I'm thinking about. That we can discuss later if you want:
> 
> @@ -3730,11 +3724,12 @@ int ram_block_discard_range(RAMBlock *rb,
> uint64_t start, size_t length)
>                }
> 
>                ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE |
> FALLOC_FL_KEEP_SIZE,
> -                            start, length);
> +                            start + rb->fd_offset, length);
>                if (ret) {
>                    ret = -errno;
>                    error_report("%s: Failed to fallocate %s:%" PRIx64 "
> +%zx (%d)",
> -                             __func__, rb->idstr, start, length, ret);
> +                             __func__, rb->idstr, start + rb->fd_offset,
> +                            length, ret);
>                    goto err;
>                }
> 
> 
> Or I can integrate that as an addition patch if you prefer.

Very good point! We failed to take fd_offset into account here.

Can you send that out as a separate fix?

Fixes: 4b870dc4d0c0 ("hostmem-file: add offset option")

-- 
Cheers,

David / dhildenb
Re: [PATCH v4 2/7] system/physmem: poisoned memory discard on reboot
Posted by William Roche 2 months, 1 week ago
On 1/14/25 15:00, David Hildenbrand wrote:
>> If we can get the current set of fixes integrated, I'll submit another
>> fix proposal to take the fd_offset into account in a second time. (Not
>> enlarging the current set)
>>
>> But here is what I'm thinking about. That we can discuss later if you 
>> want:
>>
>> @@ -3730,11 +3724,12 @@ int ram_block_discard_range(RAMBlock *rb,
>> uint64_t start, size_t length)
>>                }
>>
>>                ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE |
>> FALLOC_FL_KEEP_SIZE,
>> -                            start, length);
>> +                            start + rb->fd_offset, length);
>>                if (ret) {
>>                    ret = -errno;
>>                    error_report("%s: Failed to fallocate %s:%" PRIx64 "
>> +%zx (%d)",
>> -                             __func__, rb->idstr, start, length, ret);
>> +                             __func__, rb->idstr, start + rb->fd_offset,
>> +                            length, ret);
>>                    goto err;
>>                }
>>
>>
>> Or I can integrate that as an addition patch if you prefer.
> 
> Very good point! We missed to take fd_offset into account here.
> 
> Can you send that out as a separate fix?
> 
> Fixes: 4b870dc4d0c0 ("hostmem-file: add offset option")

Thanks to Peter Xu and to you for your reviews of my proposal for this
separate fix, which should be on track to be integrated soon.