Let's sense support and use it for preallocation. MADV_POPULATE_WRITE
does not require a SIGBUS handler, doesn't actually touch page content,
and avoids context switches; it is, therefore, faster and easier to handle
than our current approach.
This resolves the TODO in do_touch_pages().
Signed-off-by: David Hildenbrand <david@redhat.com>
---
include/qemu/osdep.h | 7 ++++
util/oslib-posix.c | 84 +++++++++++++++++++++++++++++++++-----------
2 files changed, 71 insertions(+), 20 deletions(-)
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 60718fc342..d1660d67fa 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -471,6 +471,11 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#else
#define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED
#endif
+#ifdef MADV_POPULATE_WRITE
+#define QEMU_MADV_POPULATE_WRITE MADV_POPULATE_WRITE
+#else
+#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
+#endif
#elif defined(CONFIG_POSIX_MADVISE)
@@ -484,6 +489,7 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED
+#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#else /* no-op */
@@ -497,6 +503,7 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_REMOVE QEMU_MADV_INVALID
+#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e8bdb02e1d..679796ac1f 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg)
*
* 'volatile' to stop compiler optimizing this away
* to a no-op
- *
- * TODO: get a better solution from kernel so we
- * don't need to write at all so we don't cause
- * wear on the storage backing the region...
*/
*(volatile char *)addr = *addr;
addr += hpagesize;
@@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg)
return NULL;
}
+static void *do_madv_populate_write_pages(void *arg)
+{
+ MemsetThread *memset_args = (MemsetThread *)arg;
+ const size_t size = memset_args->numpages * memset_args->hpagesize;
+ char * const addr = memset_args->addr;
+ int ret;
+
+ /* See do_touch_pages(). */
+ qemu_mutex_lock(&page_mutex);
+ while (!threads_created_flag) {
+ qemu_cond_wait(&page_cond, &page_mutex);
+ }
+ qemu_mutex_unlock(&page_mutex);
+
+ ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE);
+ if (ret) {
+ memset_thread_failed = true;
+ }
+ return NULL;
+}
+
static inline int get_memset_num_threads(int smp_cpus)
{
long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
@@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus)
}
static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
- int smp_cpus)
+ int smp_cpus, bool use_madv_populate_write)
{
static gsize initialized = 0;
size_t numpages_per_thread, leftover;
+ void *(*touch_fn)(void *);
char *addr = area;
int i = 0;
@@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
g_once_init_leave(&initialized, 1);
}
+ if (use_madv_populate_write) {
+ touch_fn = do_madv_populate_write_pages;
+ } else {
+ touch_fn = do_touch_pages;
+ }
+
memset_thread_failed = false;
threads_created_flag = false;
memset_num_threads = get_memset_num_threads(smp_cpus);
@@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
memset_thread[i].numpages = numpages_per_thread + (i < leftover);
memset_thread[i].hpagesize = hpagesize;
qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
- do_touch_pages, &memset_thread[i],
+ touch_fn, &memset_thread[i],
QEMU_THREAD_JOINABLE);
addr += memset_thread[i].numpages * hpagesize;
}
@@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
return memset_thread_failed;
}
+static bool madv_populate_write_possible(char *area, size_t pagesize)
+{
+ return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
+ errno != EINVAL;
+}
+
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
Error **errp)
{
@@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
struct sigaction act, oldact;
size_t hpagesize = qemu_fd_getpagesize(fd);
size_t numpages = DIV_ROUND_UP(memory, hpagesize);
+ bool use_madv_populate_write;
+
+ /*
+ * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
+ * some special mappings, such as mapping /dev/mem.
+ */
+ if (madv_populate_write_possible(area, hpagesize)) {
+ use_madv_populate_write = true;
+ }
- memset(&act, 0, sizeof(act));
- act.sa_handler = &sigbus_handler;
- act.sa_flags = 0;
+ if (!use_madv_populate_write) {
+ memset(&act, 0, sizeof(act));
+ act.sa_handler = &sigbus_handler;
+ act.sa_flags = 0;
- ret = sigaction(SIGBUS, &act, &oldact);
- if (ret) {
- error_setg_errno(errp, errno,
- "os_mem_prealloc: failed to install signal handler");
- return;
+ ret = sigaction(SIGBUS, &act, &oldact);
+ if (ret) {
+ error_setg_errno(errp, errno,
+ "os_mem_prealloc: failed to install signal handler");
+ return;
+ }
}
/* touch pages simultaneously */
- if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
+ if (touch_all_pages(area, hpagesize, numpages, smp_cpus,
+ use_madv_populate_write)) {
error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
"pages available to allocate guest RAM");
}
- ret = sigaction(SIGBUS, &oldact, NULL);
- if (ret) {
- /* Terminate QEMU since it can't recover from error */
- perror("os_mem_prealloc: failed to reinstall signal handler");
- exit(1);
+ if (!use_madv_populate_write) {
+ ret = sigaction(SIGBUS, &oldact, NULL);
+ if (ret) {
+ /* Terminate QEMU since it can't recover from error */
+ perror("os_mem_prealloc: failed to reinstall signal handler");
+ exit(1);
+ }
}
}
--
2.31.1
On Wed, Jul 14, 2021 at 01:23:04PM +0200, David Hildenbrand wrote:
> Let's sense support and use it for preallocation. MADV_POPULATE_WRITE
> does not require a SIGBUS handler, doesn't actually touch page content,
> and avoids context switches; it is, therefore, faster and easier to handle
> than our current approach.
>
> This resolves the TODO in do_touch_pages().
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
> include/qemu/osdep.h | 7 ++++
> util/oslib-posix.c | 84 +++++++++++++++++++++++++++++++++-----------
> 2 files changed, 71 insertions(+), 20 deletions(-)
>
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index e8bdb02e1d..679796ac1f 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg)
> *
> * 'volatile' to stop compiler optimizing this away
> * to a no-op
> - *
> - * TODO: get a better solution from kernel so we
> - * don't need to write at all so we don't cause
> - * wear on the storage backing the region...
> */
> *(volatile char *)addr = *addr;
> addr += hpagesize;
> @@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg)
> return NULL;
> }
>
> +static void *do_madv_populate_write_pages(void *arg)
> +{
> + MemsetThread *memset_args = (MemsetThread *)arg;
> + const size_t size = memset_args->numpages * memset_args->hpagesize;
> + char * const addr = memset_args->addr;
> + int ret;
> +
> + /* See do_touch_pages(). */
> + qemu_mutex_lock(&page_mutex);
> + while (!threads_created_flag) {
> + qemu_cond_wait(&page_cond, &page_mutex);
> + }
> + qemu_mutex_unlock(&page_mutex);
> +
> + ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE);
> + if (ret) {
> + memset_thread_failed = true;
> + }
> + return NULL;
> +}
> +
> static inline int get_memset_num_threads(int smp_cpus)
> {
> long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
> @@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus)
> }
>
> static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> - int smp_cpus)
> + int smp_cpus, bool use_madv_populate_write)
> {
> static gsize initialized = 0;
> size_t numpages_per_thread, leftover;
> + void *(*touch_fn)(void *);
> char *addr = area;
> int i = 0;
>
> @@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> g_once_init_leave(&initialized, 1);
> }
>
> + if (use_madv_populate_write) {
> + touch_fn = do_madv_populate_write_pages;
> + } else {
> + touch_fn = do_touch_pages;
> + }
> +
> memset_thread_failed = false;
> threads_created_flag = false;
> memset_num_threads = get_memset_num_threads(smp_cpus);
> @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> memset_thread[i].numpages = numpages_per_thread + (i < leftover);
> memset_thread[i].hpagesize = hpagesize;
> qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
> - do_touch_pages, &memset_thread[i],
> + touch_fn, &memset_thread[i],
> QEMU_THREAD_JOINABLE);
> addr += memset_thread[i].numpages * hpagesize;
> }
Do you have an indication of what the speed differential is for the
old read/write dance vs the kernel madvise. We needed to use threads
previously because the read/write dance is pretty terribly slow.
Is that still a problem with the madvise approach? I would (perhaps
naively) expect that the kernel would be able to do this efficiently
for arbitrarily large memory regions, such that QEMU would not need
to play games with threads.
> @@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> return memset_thread_failed;
> }
>
> +static bool madv_populate_write_possible(char *area, size_t pagesize)
> +{
> + return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
> + errno != EINVAL;
> +}
> +
> void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
> Error **errp)
> {
> @@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
> struct sigaction act, oldact;
> size_t hpagesize = qemu_fd_getpagesize(fd);
> size_t numpages = DIV_ROUND_UP(memory, hpagesize);
> + bool use_madv_populate_write;
Initialized with random garbage from the stack
> +
> + /*
> + * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
> + * some special mappings, such as mapping /dev/mem.
> + */
> + if (madv_populate_write_possible(area, hpagesize)) {
> + use_madv_populate_write = true;
> + }
but this implicitly assumes it was initialized to false.
>
> - memset(&act, 0, sizeof(act));
> - act.sa_handler = &sigbus_handler;
> - act.sa_flags = 0;
> + if (!use_madv_populate_write) {
> + memset(&act, 0, sizeof(act));
> + act.sa_handler = &sigbus_handler;
> + act.sa_flags = 0;
>
> - ret = sigaction(SIGBUS, &act, &oldact);
> - if (ret) {
> - error_setg_errno(errp, errno,
> - "os_mem_prealloc: failed to install signal handler");
> - return;
> + ret = sigaction(SIGBUS, &act, &oldact);
> + if (ret) {
> + error_setg_errno(errp, errno,
> + "os_mem_prealloc: failed to install signal handler");
> + return;
> + }
> }
>
> /* touch pages simultaneously */
> - if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
> + if (touch_all_pages(area, hpagesize, numpages, smp_cpus,
> + use_madv_populate_write)) {
> error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
> "pages available to allocate guest RAM");
> }
>
> - ret = sigaction(SIGBUS, &oldact, NULL);
> - if (ret) {
> - /* Terminate QEMU since it can't recover from error */
> - perror("os_mem_prealloc: failed to reinstall signal handler");
> - exit(1);
> + if (!use_madv_populate_write) {
> + ret = sigaction(SIGBUS, &oldact, NULL);
> + if (ret) {
> + /* Terminate QEMU since it can't recover from error */
> + perror("os_mem_prealloc: failed to reinstall signal handler");
> + exit(1);
> + }
> }
> }
>
> --
> 2.31.1
>
>
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>> memset_thread_failed = false;
>> threads_created_flag = false;
>> memset_num_threads = get_memset_num_threads(smp_cpus);
>> @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>> memset_thread[i].numpages = numpages_per_thread + (i < leftover);
>> memset_thread[i].hpagesize = hpagesize;
>> qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
>> - do_touch_pages, &memset_thread[i],
>> + touch_fn, &memset_thread[i],
>> QEMU_THREAD_JOINABLE);
>> addr += memset_thread[i].numpages * hpagesize;
>> }
>
> Do you have an indication of what the speed differential is for the
> old read/write dance vs the kernel madvise. We needed to use threads
> previously because the read/write dance is pretty terribly slow.
The kernel patch has some performance numbers:
https://lkml.kernel.org/r/20210712083917.16361-1-david@redhat.com
For example (compressed),
**************************************************
4096 MiB MAP_PRIVATE:
**************************************************
Anon 4 KiB : Read/Write : 1054.041 ms
Anon 4 KiB : POPULATE_WRITE : 572.582 ms
Memfd 4 KiB : Read/Write : 1106.561 ms
Memfd 4 KiB : POPULATE_WRITE : 805.881 ms
Memfd 2 MiB : Read/Write : 357.606 ms
Memfd 2 MiB : POPULATE_WRITE : 356.937 ms
tmpfs : Read/Write : 1105.954 ms
tmpfs : POPULATE_WRITE : 822.826 ms
file : Read/Write : 1107.439 ms
file : POPULATE_WRITE : 857.622 ms
hugetlbfs : Read/Write : 356.127 ms
hugetlbfs : POPULATE_WRITE : 355.138 ms
**************************************************
4096 MiB MAP_SHARED:
**************************************************
Anon 4 KiB : Read/Write : 1060.350 ms
Anon 4 KiB : POPULATE_WRITE : 782.885 ms
Anon 2 MiB : Read/Write : 357.992 ms
Anon 2 MiB : POPULATE_WRITE : 357.808 ms
Memfd 4 KiB : Read/Write : 1100.391 ms
Memfd 4 KiB : POPULATE_WRITE : 804.394 ms
Memfd 2 MiB : Read/Write : 358.250 ms
Memfd 2 MiB : POPULATE_WRITE : 357.334 ms
tmpfs : Read/Write : 1107.567 ms
tmpfs : POPULATE_WRITE : 810.094 ms
file : Read/Write : 1289.509 ms
file : POPULATE_WRITE : 1106.816 ms
hugetlbfs : Read/Write : 357.120 ms
hugetlbfs : POPULATE_WRITE : 356.693 ms
For huge pages, it barely makes a difference with smallish VMs. In the
other cases it speeds things up, but not to the extent that it would
allow for dropping multi-threading.
The original MADV_POPULATE from 2016
https://lore.kernel.org/patchwork/patch/389581/ mentions that it
especially helps speed up multi-threaded pre-faulting, due to reduced
mmap_lock contention. I did not do any multi-threading benchmarks, though.
[...]
>
> Initialized with random garbage from the stack
>
>> +
>> + /*
>> + * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
>> + * some special mappings, such as mapping /dev/mem.
>> + */
>> + if (madv_populate_write_possible(area, hpagesize)) {
>> + use_madv_populate_write = true;
>> + }
>
> but this implicitly assumes it was initialized to false.
Indeed, thanks for catching that!
--
Thanks,
David / dhildenb
© 2016 - 2026 Red Hat, Inc.