Let's sense support and use it for preallocation. MADV_POPULATE_WRITE
does not require a SIGBUS handler, doesn't actually touch page content,
and avoids context switches; it is, therefore, faster and easier to handle
than our current approach.
This resolves the TODO in do_touch_pages().
Signed-off-by: David Hildenbrand <david@redhat.com>
---
include/qemu/osdep.h | 7 ++++
util/oslib-posix.c | 84 +++++++++++++++++++++++++++++++++-----------
2 files changed, 71 insertions(+), 20 deletions(-)
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 60718fc342..d1660d67fa 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -471,6 +471,11 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#else
#define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED
#endif
+#ifdef MADV_POPULATE_WRITE
+#define QEMU_MADV_POPULATE_WRITE MADV_POPULATE_WRITE
+#else
+#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
+#endif
#elif defined(CONFIG_POSIX_MADVISE)
@@ -484,6 +489,7 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED
+#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#else /* no-op */
@@ -497,6 +503,7 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_REMOVE QEMU_MADV_INVALID
+#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e8bdb02e1d..679796ac1f 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg)
*
* 'volatile' to stop compiler optimizing this away
* to a no-op
- *
- * TODO: get a better solution from kernel so we
- * don't need to write at all so we don't cause
- * wear on the storage backing the region...
*/
*(volatile char *)addr = *addr;
addr += hpagesize;
@@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg)
return NULL;
}
+static void *do_madv_populate_write_pages(void *arg)
+{
+ MemsetThread *memset_args = (MemsetThread *)arg;
+ const size_t size = memset_args->numpages * memset_args->hpagesize;
+ char * const addr = memset_args->addr;
+ int ret;
+
+ /* See do_touch_pages(). */
+ qemu_mutex_lock(&page_mutex);
+ while (!threads_created_flag) {
+ qemu_cond_wait(&page_cond, &page_mutex);
+ }
+ qemu_mutex_unlock(&page_mutex);
+
+ ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE);
+ if (ret) {
+ memset_thread_failed = true;
+ }
+ return NULL;
+}
+
static inline int get_memset_num_threads(int smp_cpus)
{
long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
@@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus)
}
static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
- int smp_cpus)
+ int smp_cpus, bool use_madv_populate_write)
{
static gsize initialized = 0;
size_t numpages_per_thread, leftover;
+ void *(*touch_fn)(void *);
char *addr = area;
int i = 0;
@@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
g_once_init_leave(&initialized, 1);
}
+ if (use_madv_populate_write) {
+ touch_fn = do_madv_populate_write_pages;
+ } else {
+ touch_fn = do_touch_pages;
+ }
+
memset_thread_failed = false;
threads_created_flag = false;
memset_num_threads = get_memset_num_threads(smp_cpus);
@@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
memset_thread[i].numpages = numpages_per_thread + (i < leftover);
memset_thread[i].hpagesize = hpagesize;
qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
- do_touch_pages, &memset_thread[i],
+ touch_fn, &memset_thread[i],
QEMU_THREAD_JOINABLE);
addr += memset_thread[i].numpages * hpagesize;
}
@@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
return memset_thread_failed;
}
+static bool madv_populate_write_possible(char *area, size_t pagesize)
+{
+ return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
+ errno != EINVAL;
+}
+
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
Error **errp)
{
@@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
struct sigaction act, oldact;
size_t hpagesize = qemu_fd_getpagesize(fd);
size_t numpages = DIV_ROUND_UP(memory, hpagesize);
+ bool use_madv_populate_write;
+
+ /*
+ * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
+ * some special mappings, such as mapping /dev/mem.
+ */
+ if (madv_populate_write_possible(area, hpagesize)) {
+ use_madv_populate_write = true;
+ }
- memset(&act, 0, sizeof(act));
- act.sa_handler = &sigbus_handler;
- act.sa_flags = 0;
+ if (!use_madv_populate_write) {
+ memset(&act, 0, sizeof(act));
+ act.sa_handler = &sigbus_handler;
+ act.sa_flags = 0;
- ret = sigaction(SIGBUS, &act, &oldact);
- if (ret) {
- error_setg_errno(errp, errno,
- "os_mem_prealloc: failed to install signal handler");
- return;
+ ret = sigaction(SIGBUS, &act, &oldact);
+ if (ret) {
+ error_setg_errno(errp, errno,
+ "os_mem_prealloc: failed to install signal handler");
+ return;
+ }
}
/* touch pages simultaneously */
- if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
+ if (touch_all_pages(area, hpagesize, numpages, smp_cpus,
+ use_madv_populate_write)) {
error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
"pages available to allocate guest RAM");
}
- ret = sigaction(SIGBUS, &oldact, NULL);
- if (ret) {
- /* Terminate QEMU since it can't recover from error */
- perror("os_mem_prealloc: failed to reinstall signal handler");
- exit(1);
+ if (!use_madv_populate_write) {
+ ret = sigaction(SIGBUS, &oldact, NULL);
+ if (ret) {
+ /* Terminate QEMU since it can't recover from error */
+ perror("os_mem_prealloc: failed to reinstall signal handler");
+ exit(1);
+ }
}
}
--
2.31.1
On Wed, Jul 14, 2021 at 01:23:04PM +0200, David Hildenbrand wrote:
> Let's sense support and use it for preallocation. MADV_POPULATE_WRITE
> does not require a SIGBUS handler, doesn't actually touch page content,
> and avoids context switches; it is, therefore, faster and easier to handle
> than our current approach.
>
> This resolves the TODO in do_touch_pages().
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
> include/qemu/osdep.h | 7 ++++
> util/oslib-posix.c | 84 +++++++++++++++++++++++++++++++++-----------
> 2 files changed, 71 insertions(+), 20 deletions(-)
>
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index e8bdb02e1d..679796ac1f 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg)
> *
> * 'volatile' to stop compiler optimizing this away
> * to a no-op
> - *
> - * TODO: get a better solution from kernel so we
> - * don't need to write at all so we don't cause
> - * wear on the storage backing the region...
> */
> *(volatile char *)addr = *addr;
> addr += hpagesize;
> @@ -497,6 +493,27 @@ static void *do_touch_pages(void *arg)
> return NULL;
> }
>
> +static void *do_madv_populate_write_pages(void *arg)
> +{
> + MemsetThread *memset_args = (MemsetThread *)arg;
> + const size_t size = memset_args->numpages * memset_args->hpagesize;
> + char * const addr = memset_args->addr;
> + int ret;
> +
> + /* See do_touch_pages(). */
> + qemu_mutex_lock(&page_mutex);
> + while (!threads_created_flag) {
> + qemu_cond_wait(&page_cond, &page_mutex);
> + }
> + qemu_mutex_unlock(&page_mutex);
> +
> + ret = qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE);
> + if (ret) {
> + memset_thread_failed = true;
> + }
> + return NULL;
> +}
> +
> static inline int get_memset_num_threads(int smp_cpus)
> {
> long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
> @@ -510,10 +527,11 @@ static inline int get_memset_num_threads(int smp_cpus)
> }
>
> static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> - int smp_cpus)
> + int smp_cpus, bool use_madv_populate_write)
> {
> static gsize initialized = 0;
> size_t numpages_per_thread, leftover;
> + void *(*touch_fn)(void *);
> char *addr = area;
> int i = 0;
>
> @@ -523,6 +541,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> g_once_init_leave(&initialized, 1);
> }
>
> + if (use_madv_populate_write) {
> + touch_fn = do_madv_populate_write_pages;
> + } else {
> + touch_fn = do_touch_pages;
> + }
> +
> memset_thread_failed = false;
> threads_created_flag = false;
> memset_num_threads = get_memset_num_threads(smp_cpus);
> @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> memset_thread[i].numpages = numpages_per_thread + (i < leftover);
> memset_thread[i].hpagesize = hpagesize;
> qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
> - do_touch_pages, &memset_thread[i],
> + touch_fn, &memset_thread[i],
> QEMU_THREAD_JOINABLE);
> addr += memset_thread[i].numpages * hpagesize;
> }
Do you have an indication of what the speed differential is for the
old read/write dance vs the kernel madvise. We needed to use threads
previously because the read/write dance is pretty terribly slow.
Is that still a problem with the madvise approach? I would (perhaps
naively) expect that the kernel would be able to do this efficiently
for arbitrarily large memory regions, such that QEMU would not need
to play games with threads.
> @@ -553,6 +577,12 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
> return memset_thread_failed;
> }
>
> +static bool madv_populate_write_possible(char *area, size_t pagesize)
> +{
> + return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
> + errno != EINVAL;
> +}
> +
> void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
> Error **errp)
> {
> @@ -560,29 +590,43 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
> struct sigaction act, oldact;
> size_t hpagesize = qemu_fd_getpagesize(fd);
> size_t numpages = DIV_ROUND_UP(memory, hpagesize);
> + bool use_madv_populate_write;
Initialized with random garbage from the stack
> +
> + /*
> + * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
> + * some special mappings, such as mapping /dev/mem.
> + */
> + if (madv_populate_write_possible(area, hpagesize)) {
> + use_madv_populate_write = true;
> + }
but this implicitly assumes it was initialized to false.
>
> - memset(&act, 0, sizeof(act));
> - act.sa_handler = &sigbus_handler;
> - act.sa_flags = 0;
> + if (!use_madv_populate_write) {
> + memset(&act, 0, sizeof(act));
> + act.sa_handler = &sigbus_handler;
> + act.sa_flags = 0;
>
> - ret = sigaction(SIGBUS, &act, &oldact);
> - if (ret) {
> - error_setg_errno(errp, errno,
> - "os_mem_prealloc: failed to install signal handler");
> - return;
> + ret = sigaction(SIGBUS, &act, &oldact);
> + if (ret) {
> + error_setg_errno(errp, errno,
> + "os_mem_prealloc: failed to install signal handler");
> + return;
> + }
> }
>
> /* touch pages simultaneously */
> - if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
> + if (touch_all_pages(area, hpagesize, numpages, smp_cpus,
> + use_madv_populate_write)) {
> error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
> "pages available to allocate guest RAM");
> }
>
> - ret = sigaction(SIGBUS, &oldact, NULL);
> - if (ret) {
> - /* Terminate QEMU since it can't recover from error */
> - perror("os_mem_prealloc: failed to reinstall signal handler");
> - exit(1);
> + if (!use_madv_populate_write) {
> + ret = sigaction(SIGBUS, &oldact, NULL);
> + if (ret) {
> + /* Terminate QEMU since it can't recover from error */
> + perror("os_mem_prealloc: failed to reinstall signal handler");
> + exit(1);
> + }
> }
> }
>
> --
> 2.31.1
>
>
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>> memset_thread_failed = false;
>> threads_created_flag = false;
>> memset_num_threads = get_memset_num_threads(smp_cpus);
>> @@ -534,7 +558,7 @@ static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
>> memset_thread[i].numpages = numpages_per_thread + (i < leftover);
>> memset_thread[i].hpagesize = hpagesize;
>> qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
>> - do_touch_pages, &memset_thread[i],
>> + touch_fn, &memset_thread[i],
>> QEMU_THREAD_JOINABLE);
>> addr += memset_thread[i].numpages * hpagesize;
>> }
>
> Do you have an indication of what the speed differential is for the
> old read/write dance vs the kernel madvise. We needed to use threads
> previously because the read/write dance is pretty terribly slow.
The kernel patch has some performance numbers:
https://lkml.kernel.org/r/20210712083917.16361-1-david@redhat.com
For example (compressed),
**************************************************
4096 MiB MAP_PRIVATE:
**************************************************
Anon 4 KiB : Read/Write : 1054.041 ms
Anon 4 KiB : POPULATE_WRITE : 572.582 ms
Memfd 4 KiB : Read/Write : 1106.561 ms
Memfd 4 KiB : POPULATE_WRITE : 805.881 ms
Memfd 2 MiB : Read/Write : 357.606 ms
Memfd 2 MiB : POPULATE_WRITE : 356.937 ms
tmpfs : Read/Write : 1105.954 ms
tmpfs : POPULATE_WRITE : 822.826 ms
file : Read/Write : 1107.439 ms
file : POPULATE_WRITE : 857.622 ms
hugetlbfs : Read/Write : 356.127 ms
hugetlbfs : POPULATE_WRITE : 355.138 ms
**************************************************
4096 MiB MAP_SHARED:
**************************************************
Anon 4 KiB : Read/Write : 1060.350 ms
Anon 4 KiB : POPULATE_WRITE : 782.885 ms
Anon 2 MiB : Read/Write : 357.992 ms
Anon 2 MiB : POPULATE_WRITE : 357.808 ms
Memfd 4 KiB : Read/Write : 1100.391 ms
Memfd 4 KiB : POPULATE_WRITE : 804.394 ms
Memfd 2 MiB : Read/Write : 358.250 ms
Memfd 2 MiB : POPULATE_WRITE : 357.334 ms
tmpfs : Read/Write : 1107.567 ms
tmpfs : POPULATE_WRITE : 810.094 ms
file : Read/Write : 1289.509 ms
file : POPULATE_WRITE : 1106.816 ms
hugetlbfs : Read/Write : 357.120 ms
hugetlbfs : POPULATE_WRITE : 356.693 ms
For huge pages, it barely makes a difference with smallish VMs. In the
other cases it speeds things up, but not to the extent that it would
allow for dropping multi-threading.
The original MADV_POPULATE from 2016
https://lore.kernel.org/patchwork/patch/389581/ mentions that it
especially helps speed up multi-threaded pre-faulting, due to reduced
mmap_lock contention. I did not do any multi-threading benchmarks, though.
[...]
>
> Initialized with random garbage from the stack
>
>> +
>> + /*
>> + * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
>> + * some special mappings, such as mapping /dev/mem.
>> + */
>> + if (madv_populate_write_possible(area, hpagesize)) {
>> + use_madv_populate_write = true;
>> + }
>
> but this implicitly assumes it was initialized to false.
Indeed, thanks for catching that!
--
Thanks,
David / dhildenb
© 2016 - 2026 Red Hat, Inc.