Implement resizable mmaps. For now, the actual resizing is not wired up.
Introduce qemu_ram_mmap_resizable() and qemu_ram_mmap_resize(). Make
qemu_ram_mmap() a wrapper of qemu_ram_mmap_resizable().
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Greg Kurz <groug@kaod.org>
Cc: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
include/qemu/mmap-alloc.h | 21 ++++++++++++-------
util/mmap-alloc.c | 44 ++++++++++++++++++++++++++++-----------
2 files changed, 45 insertions(+), 20 deletions(-)
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
index e786266b92..70bc8e9637 100644
--- a/include/qemu/mmap-alloc.h
+++ b/include/qemu/mmap-alloc.h
@@ -7,11 +7,13 @@ size_t qemu_fd_getpagesize(int fd);
size_t qemu_mempath_getpagesize(const char *mem_path);
/**
- * qemu_ram_mmap: mmap the specified file or device.
+ * qemu_ram_mmap_resizable: reserve a memory region of @max_size to mmap the
+ * specified file or device and mmap @size of it.
*
* Parameters:
* @fd: the file or the device to mmap
* @size: the number of bytes to be mmaped
+ * @max_size: the number of bytes to be reserved
* @align: if not zero, specify the alignment of the starting mapping address;
* otherwise, the alignment in use will be determined by QEMU.
* @shared: map has RAM_SHARED flag.
@@ -21,12 +23,15 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
* On success, return a pointer to the mapped area.
* On failure, return MAP_FAILED.
*/
-void *qemu_ram_mmap(int fd,
- size_t size,
- size_t align,
- bool shared,
- bool is_pmem);
-
-void qemu_ram_munmap(int fd, void *ptr, size_t size);
+void *qemu_ram_mmap_resizable(int fd, size_t size, size_t max_size,
+ size_t align, bool shared, bool is_pmem);
+void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
+ bool shared, bool is_pmem);
+static inline void *qemu_ram_mmap(int fd, size_t size, size_t align,
+ bool shared, bool is_pmem)
+{
+ return qemu_ram_mmap_resizable(fd, size, size, align, shared, is_pmem);
+}
+void qemu_ram_munmap(int fd, void *ptr, size_t max_size);
#endif
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 63ad6893b7..2d562145e9 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -172,11 +172,8 @@ static inline size_t mmap_pagesize(int fd)
#endif
}
-void *qemu_ram_mmap(int fd,
- size_t size,
- size_t align,
- bool shared,
- bool is_pmem)
+void *qemu_ram_mmap_resizable(int fd, size_t size, size_t max_size,
+ size_t align, bool shared, bool is_pmem)
{
const size_t pagesize = mmap_pagesize(fd);
size_t offset, total;
@@ -184,12 +181,14 @@ void *qemu_ram_mmap(int fd,
/* we can only map whole pages */
size = QEMU_ALIGN_UP(size, pagesize);
+ max_size = QEMU_ALIGN_UP(max_size, pagesize);
/*
* Note: this always allocates at least one extra page of virtual address
- * space, even if size is already aligned.
+ * space, even if the size is already aligned. We will reserve an area of
+ * at least max_size, but only populate the requested part of it.
*/
- total = size + align;
+ total = max_size + align;
guardptr = mmap_reserve(0, total, fd);
if (guardptr == MAP_FAILED) {
@@ -217,22 +216,43 @@ void *qemu_ram_mmap(int fd,
* a guard page guarding against potential buffer overflows.
*/
total -= offset;
- if (total > size + pagesize) {
- munmap(ptr + size + pagesize, total - size - pagesize);
+ if (total > max_size + pagesize) {
+ munmap(ptr + max_size + pagesize, total - max_size - pagesize);
}
return ptr;
}
-void qemu_ram_munmap(int fd, void *ptr, size_t size)
+void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
+ bool shared, bool is_pmem)
{
const size_t pagesize = mmap_pagesize(fd);
/* we can only map whole pages */
- size = QEMU_ALIGN_UP(size, pagesize);
+ old_size = QEMU_ALIGN_UP(old_size, pagesize);
+ new_size = QEMU_ALIGN_UP(new_size, pagesize);
+
+ /* we support actually resizable memory regions only on Linux */
+ if (old_size < new_size) {
+ /* populate the missing piece into the reserved area */
+ ptr = mmap_populate(ptr + old_size, new_size - old_size, fd, old_size,
+ shared, is_pmem);
+ } else if (old_size > new_size) {
+ /* discard this piece, keeping the area reserved (should never fail) */
+ ptr = mmap_reserve(ptr + new_size, old_size - new_size, fd);
+ }
+ return ptr;
+}
+
+void qemu_ram_munmap(int fd, void *ptr, size_t max_size)
+{
+ const size_t pagesize = mmap_pagesize(fd);
+
+ /* we can only map whole pages */
+ max_size = QEMU_ALIGN_UP(max_size, pagesize);
if (ptr) {
/* Unmap both the RAM block and the guard page */
- munmap(ptr, size + pagesize);
+ munmap(ptr, max_size + pagesize);
}
}
--
2.24.1
Hello, David.
On Monday, February 3, 2020 3:31:21 PM -03 David Hildenbrand wrote:
> Implement resizable mmaps. For now, the actual resizing is not wired up.
> Introduce qemu_ram_mmap_resizable() and qemu_ram_mmap_resize(). Make
> qemu_ram_mmap() a wrapper of qemu_ram_mmap_resizable().
>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Greg Kurz <groug@kaod.org>
> Cc: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
> Cc: Eduardo Habkost <ehabkost@redhat.com>
> Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
> include/qemu/mmap-alloc.h | 21 ++++++++++++-------
> util/mmap-alloc.c | 44 ++++++++++++++++++++++++++++-----------
> 2 files changed, 45 insertions(+), 20 deletions(-)
>
> diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
> index e786266b92..70bc8e9637 100644
> --- a/include/qemu/mmap-alloc.h
> +++ b/include/qemu/mmap-alloc.h
> @@ -7,11 +7,13 @@ size_t qemu_fd_getpagesize(int fd);
> size_t qemu_mempath_getpagesize(const char *mem_path);
>
> /**
> - * qemu_ram_mmap: mmap the specified file or device.
> + * qemu_ram_mmap_resizable: reserve a memory region of @max_size to mmap the
> + *                          specified file or device and mmap @size of it.
> *
> * Parameters:
> * @fd: the file or the device to mmap
> * @size: the number of bytes to be mmaped
> + * @max_size: the number of bytes to be reserved
> * @align: if not zero, specify the alignment of the starting mapping address;
> *         otherwise, the alignment in use will be determined by QEMU.
> * @shared: map has RAM_SHARED flag.
> @@ -21,12 +23,15 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
> * On success, return a pointer to the mapped area.
> * On failure, return MAP_FAILED.
> */
> -void *qemu_ram_mmap(int fd,
> - size_t size,
> - size_t align,
> - bool shared,
> - bool is_pmem);
> -
> -void qemu_ram_munmap(int fd, void *ptr, size_t size);
> +void *qemu_ram_mmap_resizable(int fd, size_t size, size_t max_size,
> + size_t align, bool shared, bool is_pmem);
> +void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
> +                           bool shared, bool is_pmem);
> +static inline void *qemu_ram_mmap(int fd, size_t size, size_t align,
> + bool shared, bool is_pmem)
> +{
> + return qemu_ram_mmap_resizable(fd, size, size, align, shared, is_pmem);
> +}
> +void qemu_ram_munmap(int fd, void *ptr, size_t max_size);
>
> #endif
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 63ad6893b7..2d562145e9 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -172,11 +172,8 @@ static inline size_t mmap_pagesize(int fd)
> #endif
> }
>
> -void *qemu_ram_mmap(int fd,
> - size_t size,
> - size_t align,
> - bool shared,
> - bool is_pmem)
> +void *qemu_ram_mmap_resizable(int fd, size_t size, size_t max_size,
> + size_t align, bool shared, bool is_pmem)
> {
> const size_t pagesize = mmap_pagesize(fd);
> size_t offset, total;
> @@ -184,12 +181,14 @@ void *qemu_ram_mmap(int fd,
>
> /* we can only map whole pages */
> size = QEMU_ALIGN_UP(size, pagesize);
> + max_size = QEMU_ALIGN_UP(max_size, pagesize);
>
> /*
> * Note: this always allocates at least one extra page of virtual address
> - * space, even if size is already aligned.
> + * space, even if the size is already aligned. We will reserve an area of
> + * at least max_size, but only populate the requested part of it.
> */
> - total = size + align;
> + total = max_size + align;
>
> guardptr = mmap_reserve(0, total, fd);
> if (guardptr == MAP_FAILED) {
> @@ -217,22 +216,43 @@ void *qemu_ram_mmap(int fd,
> * a guard page guarding against potential buffer overflows.
> */
> total -= offset;
> - if (total > size + pagesize) {
> - munmap(ptr + size + pagesize, total - size - pagesize);
> + if (total > max_size + pagesize) {
> + munmap(ptr + max_size + pagesize, total - max_size - pagesize);
> }
>
> return ptr;
> }
>
> -void qemu_ram_munmap(int fd, void *ptr, size_t size)
> +void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
> +                           bool shared, bool is_pmem)
> {
> const size_t pagesize = mmap_pagesize(fd);
>
> /* we can only map whole pages */
> - size = QEMU_ALIGN_UP(size, pagesize);
> + old_size = QEMU_ALIGN_UP(old_size, pagesize);
> + new_size = QEMU_ALIGN_UP(new_size, pagesize);
Shouldn't we just assert old_size and new_size are aligned with
pagesize?
> +
> + /* we support actually resizable memory regions only on Linux */
> + if (old_size < new_size) {
> + /* populate the missing piece into the reserved area */
> +        ptr = mmap_populate(ptr + old_size, new_size - old_size, fd, old_size,
> +                            shared, is_pmem);
> +    } else if (old_size > new_size) {
> +        /* discard this piece, keeping the area reserved (should never fail) */
> +        ptr = mmap_reserve(ptr + new_size, old_size - new_size, fd);
> +    }
I find the behaviour of this function somewhat confusing. Perhaps I'm
missing something and need your help to clarify. Please bear with me.
For the case where we want to grow in size, it returns a populated area
(PROT_READ | PROT_WRITE flags).
And for the case where we want to shrink in size, it returns a reserved
area (PROT_NONE flag), requiring the caller to call mmap_populate()
again to be able to use that memory.
I believe the behaviour should be consistent regardless if we want to
grow or shrink in size. Either return a reserved or an already
populated area. Not both.
Would "old_size == new_size" situation be possible? In this case, ptr
would be returned without changing protection flags of the mapping.
Shouldn't we also assert that new_size <= max_size?
> + return ptr;
> +}
> +
> +void qemu_ram_munmap(int fd, void *ptr, size_t max_size)
> +{
> + const size_t pagesize = mmap_pagesize(fd);
> +
> + /* we can only map whole pages */
> + max_size = QEMU_ALIGN_UP(max_size, pagesize);
Shouldn't we just assert this and leave the alignment to the caller?
>
> if (ptr) {
> /* Unmap both the RAM block and the guard page */
> - munmap(ptr, size + pagesize);
> + munmap(ptr, max_size + pagesize);
> }
> }
--
Murilo
>> /* we can only map whole pages */
>> - size = QEMU_ALIGN_UP(size, pagesize);
>> + old_size = QEMU_ALIGN_UP(old_size, pagesize);
>> + new_size = QEMU_ALIGN_UP(new_size, pagesize);
>
> Shouldn't we just assert old_size and new_size are aligned with
> pagesize?
>
Already reworked :)
>> +
>> + /* we support actually resizable memory regions only on Linux */
>> + if (old_size < new_size) {
>> + /* populate the missing piece into the reserved area */
>> +        ptr = mmap_populate(ptr + old_size, new_size - old_size, fd, old_size,
>> +                            shared, is_pmem);
>> +    } else if (old_size > new_size) {
>> +        /* discard this piece, keeping the area reserved (should never fail) */
>> +        ptr = mmap_reserve(ptr + new_size, old_size - new_size, fd);
>> +    }
>
> I find the behaviour of this function somewhat confusing. Perhaps I'm
> missing something and need your help to clarify. Please bear with me.
>
> For the case where we want to grow in size, it returns a populated area
> (PROT_READ | PROT_WRITE flags).
>
> And for the case where we want to shrink in size, it returns a reserved
> area (PROT_NONE flag), requiring the caller to call mmap_populate()
> again to be able to use that memory.
>
> I believe the behaviour should be consistent regardless if we want to
> grow or shrink in size. Either return a reserved or an already
> populated area. Not both.
The return value is only used to differentiate between success (!=
MAP_FAILED) and failure (== MAP_FAILED), nothing else.
For now, I switched to returning a bool instead (resized vs. not
resized), that avoids this inconsistency. See my reply to Richard's comment.
>
> Would "old_size == new_size" situation be possible? In this case, ptr
> would be returned without changing protection flags of the mapping.
It could, although in this patch set, it never would :) The result would
be a NOP, which is the right thing to do.
>
> Shouldn't we also assert that new_size <= max_size?
Then we would have to pass max_size, just for asserting. I'll leave that
to the callers for now.
>
>> + return ptr;
>> +}
>> +
>> +void qemu_ram_munmap(int fd, void *ptr, size_t max_size)
>> +{
>> + const size_t pagesize = mmap_pagesize(fd);
>> +
>> + /* we can only map whole pages */
>> + max_size = QEMU_ALIGN_UP(max_size, pagesize);
>
> Shouldn't we just assert this and leave the alignment to the caller?
Already reworked.
Thanks!
--
Thanks,
David / dhildenb
On 2/3/20 6:31 PM, David Hildenbrand wrote:
> +void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
> + bool shared, bool is_pmem)
> {
> const size_t pagesize = mmap_pagesize(fd);
>
> /* we can only map whole pages */
> - size = QEMU_ALIGN_UP(size, pagesize);
> + old_size = QEMU_ALIGN_UP(old_size, pagesize);
> + new_size = QEMU_ALIGN_UP(new_size, pagesize);
> +
> + /* we support actually resizable memory regions only on Linux */
> + if (old_size < new_size) {
> + /* populate the missing piece into the reserved area */
> + ptr = mmap_populate(ptr + old_size, new_size - old_size, fd, old_size,
> + shared, is_pmem);
> + } else if (old_size > new_size) {
> + /* discard this piece, keeping the area reserved (should never fail) */
> + ptr = mmap_reserve(ptr + new_size, old_size - new_size, fd);
> + }
> + return ptr;
> +}
What does the return value indicate?
Is it just for != MAP_FAILED?
Would we be better off with an assert? There's the comment re mmap_reserve,
but I can't see why mmap_populate should fail either.
Assuming an assert isn't viable, are we better off with a boolean return? With
an Error **ptr?
r~
On 06.02.20 13:08, Richard Henderson wrote:
> On 2/3/20 6:31 PM, David Hildenbrand wrote:
>> +void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
>> + bool shared, bool is_pmem)
>> {
>> const size_t pagesize = mmap_pagesize(fd);
>>
>> /* we can only map whole pages */
>> - size = QEMU_ALIGN_UP(size, pagesize);
>> + old_size = QEMU_ALIGN_UP(old_size, pagesize);
>> + new_size = QEMU_ALIGN_UP(new_size, pagesize);
>> +
>> + /* we support actually resizable memory regions only on Linux */
>> + if (old_size < new_size) {
>> + /* populate the missing piece into the reserved area */
>> + ptr = mmap_populate(ptr + old_size, new_size - old_size, fd, old_size,
>> + shared, is_pmem);
>> + } else if (old_size > new_size) {
>> + /* discard this piece, keeping the area reserved (should never fail) */
>> + ptr = mmap_reserve(ptr + new_size, old_size - new_size, fd);
>> + }
>> + return ptr;
>> +}
>
> What does the return value indicate?
> Is it just for != MAP_FAILED?
It indicates if resizing succeeded. In a previous version I returned an
int via
ptr == MAP_FAILED ? -errno : 0;
Populating will usually only fail because we're out of memory.
Populating and reserving *might* fail if we are out of VMAs in the
kernel. VMA merging will make sure that the number of VMAs will not
explode (usually 2-3 VMAs for one resizable region: populated VMA +
Reserved VMA + Guard page VMA). But once we would be close to the VMA
limit, it could happen - but it's highly unlikely.
> Assuming an assert isn't viable, are we better off with a boolean return? With
> an Error **ptr?
either that or an int. What do you prefer?
Thanks!
--
Thanks,
David / dhildenb
On 06.02.20 14:22, David Hildenbrand wrote:
> On 06.02.20 13:08, Richard Henderson wrote:
>> On 2/3/20 6:31 PM, David Hildenbrand wrote:
>>> +void *qemu_ram_mmap_resize(void *ptr, int fd, size_t old_size, size_t new_size,
>>> + bool shared, bool is_pmem)
>>> {
>>> const size_t pagesize = mmap_pagesize(fd);
>>>
>>> /* we can only map whole pages */
>>> - size = QEMU_ALIGN_UP(size, pagesize);
>>> + old_size = QEMU_ALIGN_UP(old_size, pagesize);
>>> + new_size = QEMU_ALIGN_UP(new_size, pagesize);
>>> +
>>> + /* we support actually resizable memory regions only on Linux */
>>> + if (old_size < new_size) {
>>> + /* populate the missing piece into the reserved area */
>>> + ptr = mmap_populate(ptr + old_size, new_size - old_size, fd, old_size,
>>> + shared, is_pmem);
>>> + } else if (old_size > new_size) {
>>> + /* discard this piece, keeping the area reserved (should never fail) */
>>> + ptr = mmap_reserve(ptr + new_size, old_size - new_size, fd);
>>> + }
>>> + return ptr;
>>> +}
>>
>> What does the return value indicate?
>> Is it just for != MAP_FAILED?
>
> It indicates if resizing succeeded. In a previous version I returned an
> int via
>
> ptr == MAP_FAILED ? -errno : 0;
>
>
> Populating will usually only fail because we're out of memory.
>
> Populating and reserving *might* fail if we are out of VMAs in the
> kernel. VMA merging will make sure that the number of VMAs will not
> explode (usually 2-3 VMAs for one resizable region: populated VMA +
> Reserved VMA + Guard page VMA). But once we would be close to the VMA
> limit, it could happen - but it's highly unlikely.
>
>> Assuming an assert isn't viable, are we better off with a boolean return? With
>> an Error **ptr?
>
> either that or an int. What do you prefer?
I'll go with a bool for now. "return ptr != MAP_FAILED;"
The actual error will usually be -ENOMEM, so there is no real benefit in
returning it.
--
Thanks,
David / dhildenb
© 2016 - 2025 Red Hat, Inc.