Let's support RAM_NORESERVE via MAP_NORESERVE. At least on Linux,
the flag has no effect on shared mappings - except for anonymous memory
and hugetlbfs.
Linux man page:
"MAP_NORESERVE: Do not reserve swap space for this mapping. When swap
space is reserved, one has the guarantee that it is possible to modify
the mapping. When swap space is not reserved one might get SIGSEGV
upon a write if no physical memory is available. See also the discussion
of the file /proc/sys/vm/overcommit_memory in proc(5). In kernels before
2.6, this flag had effect only for private writable mappings."
Note that the "guarantee" part is wrong with memory overcommit in Linux.
Also, in Linux hugetlbfs is treated differently - we configure reservation
of huge pages from the pool, not reservation of swap space (huge pages
cannot be swapped).
The rough behavior is [1]:
a) !Hugetlbfs:
1) Without MAP_NORESERVE *or* with memory overcommit under Linux
disabled ("/proc/sys/vm/overcommit_memory == 2"), the following
accounting/reservation happens:
For a file backed map
SHARED or READ-only - 0 cost (the file is the map not swap)
PRIVATE WRITABLE - size of mapping per instance
For an anonymous or /dev/zero map
SHARED - size of mapping
PRIVATE READ-only - 0 cost (but of little use)
PRIVATE WRITABLE - size of mapping per instance
2) With MAP_NORESERVE, no accounting/reservation happens.
b) Hugetlbfs:
1) Without MAP_NORESERVE, huge pages are reserved.
2) With MAP_NORESERVE, no huge pages are reserved.
Note: With "/proc/sys/vm/overcommit_memory == 0", we were already able
to configure it for !hugetlbfs globally; this toggle now allows
configuring it more fine-grained, not for the whole system.
The target use case is virtio-mem, which dynamically exposes memory
inside a large, sparse memory area to the VM.
[1] https://www.kernel.org/doc/Documentation/vm/overcommit-accounting
Signed-off-by: David Hildenbrand <david@redhat.com>
---
softmmu/physmem.c | 1 +
util/mmap-alloc.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 9820d845c0..f45a85add1 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -2248,6 +2248,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
abort();
} else {
flags = MAP_FIXED;
+ flags |= (block->flags & RAM_NORESERVE) ? MAP_NORESERVE : 0;
if (block->fd >= 0) {
flags |= (block->flags & RAM_SHARED ?
MAP_SHARED : MAP_PRIVATE);
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index bb99843106..f77d9ca574 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -20,6 +20,7 @@
#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"
+#include "qemu/cutils.h"
#include "qemu/error-report.h"
#define HUGETLBFS_MAGIC 0x958458f6
@@ -120,7 +121,7 @@ static void *mmap_reserve(size_t size, int fd)
* it accessible.
*/
static void *mmap_activate(void *ptr, size_t size, int fd, bool readonly,
- bool shared, bool is_pmem)
+ bool shared, bool is_pmem, bool noreserve)
{
const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
int map_sync_flags = 0;
@@ -129,6 +130,7 @@ static void *mmap_activate(void *ptr, size_t size, int fd, bool readonly,
flags |= fd == -1 ? MAP_ANONYMOUS : 0;
flags |= shared ? MAP_SHARED : MAP_PRIVATE;
+ flags |= noreserve ? MAP_NORESERVE : 0;
if (shared && is_pmem) {
map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
}
@@ -170,6 +172,43 @@ static inline size_t mmap_guard_pagesize(int fd)
#endif
}
+#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
+static bool map_noreserve_effective(int fd, bool shared)
+{
+#if defined(__linux__)
+ gchar *content = NULL;
+ const char *endptr;
+ unsigned int tmp;
+
+ /* hugetlbfs behaves differently */
+ if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
+ return true;
+ }
+
+ /* only private shared mappings are accounted (ignoring /dev/zero) */
+ if (fd != -1 && shared) {
+ return true;
+ }
+
+ if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
+ !qemu_strtoui(content, &endptr, 0, &tmp) &&
+ (!endptr || *endptr == '\n')) {
+ if (tmp == 2) {
+ error_report("Skipping reservation of swap space is not supported: "
+ " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
+ return false;
+ }
+ return true;
+ }
+ /* this interface has been around since Linux 2.6 */
+ error_report("Skipping reservation of swap space is not supported: "
+ " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
+ return false;
+#else
+ return true;
+#endif
+}
+
void *qemu_ram_mmap(int fd,
size_t size,
size_t align,
@@ -182,8 +221,7 @@ void *qemu_ram_mmap(int fd,
size_t offset, total;
void *ptr, *guardptr;
- if (noreserve) {
- error_report("Skipping reservation of swap space is not supported");
+ if (noreserve && !map_noreserve_effective(fd, shared)) {
return MAP_FAILED;
}
@@ -204,7 +242,8 @@ void *qemu_ram_mmap(int fd,
offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
- ptr = mmap_activate(guardptr + offset, size, fd, readonly, shared, is_pmem);
+ ptr = mmap_activate(guardptr + offset, size, fd, readonly, shared, is_pmem,
+ noreserve);
if (ptr == MAP_FAILED) {
munmap(guardptr, total);
return MAP_FAILED;
--
2.29.2
On Tue, Feb 09, 2021 at 02:49:38PM +0100, David Hildenbrand wrote:
> +#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
> +static bool map_noreserve_effective(int fd, bool shared)
> +{
> +#if defined(__linux__)
> + gchar *content = NULL;
> + const char *endptr;
> + unsigned int tmp;
> +
> + /* hugetlbfs behaves differently */
> + if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
> + return true;
> + }
> +
> + /* only private shared mappings are accounted (ignoring /dev/zero) */
> + if (fd != -1 && shared) {
> + return true;
> + }
> +
> + if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
> + !qemu_strtoui(content, &endptr, 0, &tmp) &&
> + (!endptr || *endptr == '\n')) {
> + if (tmp == 2) {
> + error_report("Skipping reservation of swap space is not supported: "
> + " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
> + return false;
> + }
> + return true;
> + }
> + /* this interface has been around since Linux 2.6 */
> + error_report("Skipping reservation of swap space is not supported: "
> + " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
> + return false;
> +#else
> + return true;
> +#endif
> +}
I feel like this helper wants to fail gracefully for some conditions. Could
you elaborate one example and attach to the commit log?
I'm also wondering whether it would worth to check the global value. Even if
overcommit is globally disabled, do we (as an application process) need to care
about it? I think the MAP_NORESERVE would simply be silently ignored by the
kernel and that seems to be design of it, otherwise would all apps who uses
MAP_NORESERVE would need to do similar things too?
Thanks,
--
Peter Xu
On 02.03.21 18:51, Peter Xu wrote:
> On Tue, Feb 09, 2021 at 02:49:38PM +0100, David Hildenbrand wrote:
>> +#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
>> +static bool map_noreserve_effective(int fd, bool shared)
>> +{
>> +#if defined(__linux__)
>> + gchar *content = NULL;
>> + const char *endptr;
>> + unsigned int tmp;
>> +
>> + /* hugetlbfs behaves differently */
>> + if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
>> + return true;
>> + }
>> +
>> + /* only private shared mappings are accounted (ignoring /dev/zero) */
>> + if (fd != -1 && shared) {
>> + return true;
>> + }
>> +
>> + if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
>> + !qemu_strtoui(content, &endptr, 0, &tmp) &&
>> + (!endptr || *endptr == '\n')) {
>> + if (tmp == 2) {
>> + error_report("Skipping reservation of swap space is not supported: "
>> + " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
>> + return false;
>> + }
>> + return true;
>> + }
>> + /* this interface has been around since Linux 2.6 */
>> + error_report("Skipping reservation of swap space is not supported: "
>> + " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
>> + return false;
>> +#else
>> + return true;
>> +#endif
>> +}
>
> I feel like this helper wants to fail gracefully for some conditions. Could
> you elaborate one example and attach to the commit log?
Sure. The case is "/proc/sys/vm/overcommit_memory == 2" (never overcommit)
MAP_NORESERVE is without effect and sparse memory regions are somewhat
impossible.
>
> I'm also wondering whether it would worth to check the global value. Even if
> overcommit is globally disabled, do we (as an application process) need to care
> about it? I think the MAP_NORESERVE would simply be silently ignored by the
> kernel and that seems to be design of it, otherwise would all apps who uses > MAP_NORESERVE would need to do similar things too?
Right, I want to catch the "gets silently ignored" part, because someone
requested "reserved=off" (!default) but does not actually get what he
asked for.
As one example, glibc manages heaps via:
a) Creating a new heap: mmap(PROT_NONE, MAP_NORESERVE) the maximum size,
then mprotect(PROT_READ|PROT_WRITE) the initial heap size. Even if
MAP_NORESERVE is ignored, only !PROT_NONE memory ever gets committed
("reserve swap space") in Linux.
b) Growing the heap via mprotect(PROT_READ|PROT_WRITE) within the
existing mmap. This will commit memory in case MAP_NORESERVE got ignored.
c) Shrinking the heap ("discard memory") via MADV_DONTNEED *unless*
"/proc/sys/vm/overcommit_memory == 2" - the only way to undo
mprotect(PROT_READ|PROT_WRITE) and to un-commit memory is by doing a
mmap(PROT_NONE, MAP_FIXED) over the problematic region.
If you're interested, you can take a look at:
malloc/arena.c
sysdeps/unix/sysv/linux/malloc-sysdep.h:check_may_shrink_heap()
Thanks!
--
Thanks,
David / dhildenb
On Tue, Mar 02, 2021 at 08:01:11PM +0100, David Hildenbrand wrote:
> On 02.03.21 18:51, Peter Xu wrote:
> > On Tue, Feb 09, 2021 at 02:49:38PM +0100, David Hildenbrand wrote:
> > > +#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
> > > +static bool map_noreserve_effective(int fd, bool shared)
> > > +{
> > > +#if defined(__linux__)
> > > + gchar *content = NULL;
> > > + const char *endptr;
> > > + unsigned int tmp;
> > > +
> > > + /* hugetlbfs behaves differently */
> > > + if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
> > > + return true;
> > > + }
> > > +
> > > + /* only private shared mappings are accounted (ignoring /dev/zero) */
> > > + if (fd != -1 && shared) {
> > > + return true;
> > > + }
[1]
> > > +
> > > + if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
> > > + !qemu_strtoui(content, &endptr, 0, &tmp) &&
> > > + (!endptr || *endptr == '\n')) {
> > > + if (tmp == 2) {
> > > + error_report("Skipping reservation of swap space is not supported: "
> > > + " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
> > > + return false;
> > > + }
> > > + return true;
> > > + }
> > > + /* this interface has been around since Linux 2.6 */
> > > + error_report("Skipping reservation of swap space is not supported: "
> > > + " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
> > > + return false;
> > > +#else
> > > + return true;
> > > +#endif
> > > +}
> >
> > I feel like this helper wants to fail gracefully for some conditions. Could
> > you elaborate one example and attach to the commit log?
>
> Sure. The case is "/proc/sys/vm/overcommit_memory == 2" (never overcommit)
>
> MAP_NORESERVE is without effect and sparse memory regions are somewhat
> impossible.
>
> >
> > I'm also wondering whether it would worth to check the global value. Even if
> > overcommit is globally disabled, do we (as an application process) need to care
> > about it? I think the MAP_NORESERVE would simply be silently ignored by the
> > kernel and that seems to be design of it, otherwise would all apps who uses > MAP_NORESERVE would need to do similar things too?
>
> Right, I want to catch the "gets silently ignored" part, because someone
> requested "reserved=off" (!default) but does not actually get what he asked
> for.
>
> As one example, glibc manages heaps via:
>
> a) Creating a new heap: mmap(PROT_NONE, MAP_NORESERVE) the maximum size,
> then mprotect(PROT_READ|PROT_WRITE) the initial heap size. Even if
> MAP_NORESERVE is ignored, only !PROT_NONE memory ever gets committed
> ("reserve swap space") in Linux.
>
> b) Growing the heap via mprotect(PROT_READ|PROT_WRITE) within the existing
> mmap. This will commit memory in case MAP_NORESERVE got ignored.
>
> c) Shrinking the heap ("discard memory") via MADV_DONTNEED *unless*
> "/proc/sys/vm/overcommit_memory == 2" - the only way to undo
> mprotect(PROT_READ|PROT_WRITE) and to un-commit memory is by doing a
> mmap(PROT_NONE, MAP_FIXED) over the problematic region.
>
> If you're interested, you can take a look at:
>
> malloc/arena.c
> sysdeps/unix/sysv/linux/malloc-sysdep.h:check_may_shrink_heap()
Thanks for the context. It's interesting to know libc has such special heap
operations.
Glibc shrinks heap to save memory for the no-over-commit case, however in our
case currently we'd like to fail some users using global_overcommit=2 but
reserve=off - it means even if we don't fail the user, mmap() could also fail
if it's overcommitted. Even if this mmap() didn't fail, it'll fail very easily
later on iiuc, right?
I think it's fine to have that early failure, it just seems less helpful than
what glibc was doing which shrinks active memory for real, meanwhile there
seems to encode some very detailed OS information into this helper, so just
less charming.
Btw above [1] "fd != -1 && shared" looks weird to me.
Firstly it'll bypass overcommit_memory==2 check and return true directly, is
that right? I thought the global will be meaningful for all memories except
hugetlbfs (in do_mmap() of Linux).
Meanwhile, I don't see why file-backed share memories is so special too.. From
your commit message, I'm not sure whether you wanted to return false instead,
however that's still not the case IIUC, since e.g. /dev/shmem still does
accounting iiuc, while MAP_NORESERVE will skip it.
--
Peter Xu
On 02.03.21 22:44, Peter Xu wrote:
> On Tue, Mar 02, 2021 at 08:01:11PM +0100, David Hildenbrand wrote:
>> On 02.03.21 18:51, Peter Xu wrote:
>>> On Tue, Feb 09, 2021 at 02:49:38PM +0100, David Hildenbrand wrote:
>>>> +#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
>>>> +static bool map_noreserve_effective(int fd, bool shared)
>>>> +{
>>>> +#if defined(__linux__)
>>>> + gchar *content = NULL;
>>>> + const char *endptr;
>>>> + unsigned int tmp;
>>>> +
>>>> + /* hugetlbfs behaves differently */
>>>> + if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
>>>> + return true;
>>>> + }
>>>> +
>>>> + /* only private shared mappings are accounted (ignoring /dev/zero) */
>>>> + if (fd != -1 && shared) {
>>>> + return true;
>>>> + }
>
> [1]
>
>>>> +
>>>> + if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
>>>> + !qemu_strtoui(content, &endptr, 0, &tmp) &&
>>>> + (!endptr || *endptr == '\n')) {
>>>> + if (tmp == 2) {
>>>> + error_report("Skipping reservation of swap space is not supported: "
>>>> + " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
>>>> + return false;
>>>> + }
>>>> + return true;
>>>> + }
>>>> + /* this interface has been around since Linux 2.6 */
>>>> + error_report("Skipping reservation of swap space is not supported: "
>>>> + " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
>>>> + return false;
>>>> +#else
>>>> + return true;
>>>> +#endif
>>>> +}
>>>
>>> I feel like this helper wants to fail gracefully for some conditions. Could
>>> you elaborate one example and attach to the commit log?
>>
>> Sure. The case is "/proc/sys/vm/overcommit_memory == 2" (never overcommit)
>>
>> MAP_NORESERVE is without effect and sparse memory regions are somewhat
>> impossible.
>>
>>>
>>> I'm also wondering whether it would worth to check the global value. Even if
>>> overcommit is globally disabled, do we (as an application process) need to care
>>> about it? I think the MAP_NORESERVE would simply be silently ignored by the
>>> kernel and that seems to be design of it, otherwise would all apps who uses > MAP_NORESERVE would need to do similar things too?
>>
>> Right, I want to catch the "gets silently ignored" part, because someone
>> requested "reserved=off" (!default) but does not actually get what he asked
>> for.
>>
>> As one example, glibc manages heaps via:
>>
>> a) Creating a new heap: mmap(PROT_NONE, MAP_NORESERVE) the maximum size,
>> then mprotect(PROT_READ|PROT_WRITE) the initial heap size. Even if
>> MAP_NORESERVE is ignored, only !PROT_NONE memory ever gets committed
>> ("reserve swap space") in Linux.
>>
>> b) Growing the heap via mprotect(PROT_READ|PROT_WRITE) within the existing
>> mmap. This will commit memory in case MAP_NORESERVE got ignored.
>>
>> c) Shrinking the heap ("discard memory") via MADV_DONTNEED *unless*
>> "/proc/sys/vm/overcommit_memory == 2" - the only way to undo
>> mprotect(PROT_READ|PROT_WRITE) and to un-commit memory is by doing a
>> mmap(PROT_NONE, MAP_FIXED) over the problematic region.
>>
>> If you're interested, you can take a look at:
>>
>> malloc/arena.c
>> sysdeps/unix/sysv/linux/malloc-sysdep.h:check_may_shrink_heap()
>
> Thanks for the context. It's interesting to know libc has such special heap
> operations.
>
> Glibc shrinks heap to save memory for the no-over-commit case, however in our
> case currently we'd like to fail some users using global_overcommit=2 but
> reserve=off - it means even if we don't fail the user, mmap() could also fail
> if it's overcommitted. Even if this mmap() didn't fail, it'll fail very easily
> later on iiuc, right?
Here is the issue I want to catch.
Assume you create a VM with a virtio-mem device that can grow big in the
future. You specify reserved=off. mmap() succeeds and you assume the
very sparse memory region does not actually commit memory. But it
commits all memory in the sparse mmap.
Assume you want to create another VM. It just fails although you still
have plenty of free memory - because the previous MAP_NORESERVE got
ignored.
I want to warn the user right away that the configuration is messed up
and that "reserved=off" is not effective.
For anonymous memory, "reserved=off" will start really being useful when
having a way to dynamically reserve swap space.
>
> I think it's fine to have that early failure, it just seems less helpful than
> what glibc was doing which shrinks active memory for real, meanwhile there
> seems to encode some very detailed OS information into this helper, so just
> less charming.
It's not nice, but the messed-up Linux implementation is to blame. Just
read the Linux man page of the mmap where there is an explicit link to
"/proc/sys/vm/overcommit_memory" for this very reason.
>
> Btw above [1] "fd != -1 && shared" looks weird to me.
As we never have shared anonymous memory (and that's good, because it is
super weird) in QEMU, we can simplify to
"if (shared) { return true; }"
>
> Firstly it'll bypass overcommit_memory==2 check and return true directly, is
> that right? I thought the global will be meaningful for all memories except
> hugetlbfs (in do_mmap() of Linux).
See the description in the patch. On basically all (except anonymous
shared) shared memory we don't ever reserve swap space.
See mmap_region():
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
whereby
accountable_mapping():
if (file && is_file_hugepages(file))
return 0;
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
in do_mmap(), we only affect if VM_NORESERVE is set or not.
And this makes perfect sense: Most* shared mappings don't need swap
space because they can just be written back to the original backing
storage (file). This is different to private mappings, which cannot be
written back to the file - so we need swap space.
>
> Meanwhile, I don't see why file-backed share memories is so special too.. From
Just think about 100000 processes mapping the same file shared. Would
you want to reserve 100000 the same swap space for the same file?
Although you never* ever really need swap space because you can just
writeback to the file instead of swapping.
> your commit message, I'm not sure whether you wanted to return false instead,
> however that's still not the case IIUC, since e.g. /dev/shmem still does
> accounting iiuc, while MAP_NORESERVE will skip it.
*except /dev/shmem. It actually needs swap space because the actual file
resides in tmpfs->RAM but Linux will never ever commit that memory.
Main reason is: because you don't know when/how much to commit.
a) When creating/truncating the file we don't know if it's going to be
sparse. So how much should we commit?
b) When mapping the file, the same comment as above applies: you don't
want to reserve per-mapper but instead per-file.
You can create and map gigantic memfd/shmem files even with
"/proc/sys/vm/overcommit_memory == 2", because we will never ever commit
any of that memory. Yes, shmem is dangerous.
Shared mappings behave like always having MAP_NORESERVE, thus the
"return true;"
It's interesting that you also raise this point: I also want to propose
dynamic reservation of swap space for shmem in the future. Instead of
being per process, this would have to be per file and similar allow to
coordinate with the kernel how much memory we are actually intending to
use (->commit) such that the kernel can properly account and reject if
it wouldn't be possible. If only a single day would have more than 24
hours :)
--
Thanks,
David / dhildenb
On Wed, Mar 03, 2021 at 11:14:10AM +0100, David Hildenbrand wrote:
> On 02.03.21 22:44, Peter Xu wrote:
> > On Tue, Mar 02, 2021 at 08:01:11PM +0100, David Hildenbrand wrote:
> > > On 02.03.21 18:51, Peter Xu wrote:
> > > > On Tue, Feb 09, 2021 at 02:49:38PM +0100, David Hildenbrand wrote:
> > > > > +#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
> > > > > +static bool map_noreserve_effective(int fd, bool shared)
> > > > > +{
> > > > > +#if defined(__linux__)
> > > > > + gchar *content = NULL;
> > > > > + const char *endptr;
> > > > > + unsigned int tmp;
> > > > > +
> > > > > + /* hugetlbfs behaves differently */
> > > > > + if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
> > > > > + return true;
> > > > > + }
> > > > > +
> > > > > + /* only private shared mappings are accounted (ignoring /dev/zero) */
> > > > > + if (fd != -1 && shared) {
> > > > > + return true;
> > > > > + }
> >
> > [1]
> >
> > > > > +
> > > > > + if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
> > > > > + !qemu_strtoui(content, &endptr, 0, &tmp) &&
> > > > > + (!endptr || *endptr == '\n')) {
> > > > > + if (tmp == 2) {
> > > > > + error_report("Skipping reservation of swap space is not supported: "
> > > > > + " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
> > > > > + return false;
> > > > > + }
> > > > > + return true;
> > > > > + }
> > > > > + /* this interface has been around since Linux 2.6 */
> > > > > + error_report("Skipping reservation of swap space is not supported: "
> > > > > + " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
> > > > > + return false;
> > > > > +#else
> > > > > + return true;
> > > > > +#endif
> > > > > +}
> > > >
> > > > I feel like this helper wants to fail gracefully for some conditions. Could
> > > > you elaborate one example and attach to the commit log?
> > >
> > > Sure. The case is "/proc/sys/vm/overcommit_memory == 2" (never overcommit)
> > >
> > > MAP_NORESERVE is without effect and sparse memory regions are somewhat
> > > impossible.
> > >
> > > >
> > > > I'm also wondering whether it would worth to check the global value. Even if
> > > > overcommit is globally disabled, do we (as an application process) need to care
> > > > about it? I think the MAP_NORESERVE would simply be silently ignored by the
> > > > kernel and that seems to be design of it, otherwise would all apps who uses > MAP_NORESERVE would need to do similar things too?
> > >
> > > Right, I want to catch the "gets silently ignored" part, because someone
> > > requested "reserved=off" (!default) but does not actually get what he asked
> > > for.
> > >
> > > As one example, glibc manages heaps via:
> > >
> > > a) Creating a new heap: mmap(PROT_NONE, MAP_NORESERVE) the maximum size,
> > > then mprotect(PROT_READ|PROT_WRITE) the initial heap size. Even if
> > > MAP_NORESERVE is ignored, only !PROT_NONE memory ever gets committed
> > > ("reserve swap space") in Linux.
> > >
> > > b) Growing the heap via mprotect(PROT_READ|PROT_WRITE) within the existing
> > > mmap. This will commit memory in case MAP_NORESERVE got ignored.
> > >
> > > c) Shrinking the heap ("discard memory") via MADV_DONTNEED *unless*
> > > "/proc/sys/vm/overcommit_memory == 2" - the only way to undo
> > > mprotect(PROT_READ|PROT_WRITE) and to un-commit memory is by doing a
> > > mmap(PROT_NONE, MAP_FIXED) over the problematic region.
> > >
> > > If you're interested, you can take a look at:
> > >
> > > malloc/arena.c
> > > sysdeps/unix/sysv/linux/malloc-sysdep.h:check_may_shrink_heap()
> >
> > Thanks for the context. It's interesting to know libc has such special heap
> > operations.
> >
> > Glibc shrinks heap to save memory for the no-over-commit case, however in our
> > case currently we'd like to fail some users using global_overcommit=2 but
> > reserve=off - it means even if we don't fail the user, mmap() could also fail
> > if it's overcommitted. Even if this mmap() didn't fail, it'll fail very easily
> > later on iiuc, right?
>
> Here is the issue I want to catch.
>
> Assume you create a VM with a virtio-mem device that can grow big in the
> future. You specify reserved=off. mmap() succeeds and you assume the very
> sparse memory region does not actually commit memory. But it commits all
> memory in the sparse mmap.
>
> Assume you want to create another VM. It just fails although you still have
> plenty of free memory - because the previous MAP_NORESERVE got ignored.
Right. So I think that's moving the failure earlier. Again I am not sure how
much it would help but it should be slightly better than a latter failure.
>
> I want to warn the user right away that the configuration is messed up and
> that "reserved=off" is not effective.
>
> For anonymous memory, "reserved=off" will start really being useful when
> having a way to dynamically reserve swap space.
>
> >
> > I think it's fine to have that early failure, it just seems less helpful than
> > what glibc was doing which shrinks active memory for real, meanwhile there
> > seems to encode some very detailed OS information into this helper, so just
> > less charming.
>
> It's not nice, but the messed-up Linux implementation is to blame. Just read
> the Linux man page of the mmap where there is an explicit link to
> "/proc/sys/vm/overcommit_memory" for this very reason.
>
> >
> > Btw above [1] "fd != -1 && shared" looks weird to me.
>
> As we never have shared anonymous memory (and that's good, because it is
> super weird) in QEMU, we can simplify to
>
> "if (shared) { return true; }"
Yes this looks easier to read. Maybe you can assert the fd in the block too if
you are pretty sure of it.
>
> >
> > Firstly it'll bypass overcommit_memory==2 check and return true directly, is
> > that right? I thought the global will be meaningful for all memories except
> > hugetlbfs (in do_mmap() of Linux).
>
> See the description in the patch. On basically all (except anonymous shared)
> shared memory we don't ever reserve swap space.
>
> See mmap_region():
>
> if (accountable_mapping(file, vm_flags)) {
> charged = len >> PAGE_SHIFT;
> if (security_vm_enough_memory_mm(mm, charged))
> return -ENOMEM;
> vm_flags |= VM_ACCOUNT;
> }
>
> whereby
>
> accountable_mapping():
>
> if (file && is_file_hugepages(file))
> return 0;
> return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
>
>
> in do_mmap(), we only affect if VM_NORESERVE is set or not.
>
>
> And this makes perfect sense: Most* shared mappings don't need swap space
> because they can just be written back to the original backing storage
> (file). This is different to private mappings, which cannot be written back
> to the file - so we need swap space.
>
> >
> > Meanwhile, I don't see why file-backed share memories is so special too.. From
>
> Just think about 100000 processes mapping the same file shared. Would you
> want to reserve 100000 the same swap space for the same file?
>
> Although you never* ever really need swap space because you can just
> writeback to the file instead of swapping.
>
> > your commit message, I'm not sure whether you wanted to return false instead,
> > however that's still not the case IIUC, since e.g. /dev/shmem still does
> > accounting iiuc, while MAP_NORESERVE will skip it.
>
> *except /dev/shmem. It actually needs swap space because the actual file
> resides in tmpfs->RAM but Linux will never ever commit that memory.
>
> Main reason is: because you don't know when/how much to commit.
>
> a) When creating/truncating the file we don't know if it's going to be
> sparse. So how much should we commit?
> b) When mapping the file, the same comment as above applies: you don't
> want to reserve per-mapper but instead per-file.
>
> You can create and map gigantic memfd/shmem files even with
> "/proc/sys/vm/overcommit_memory == 2", because we will never ever commit any
> of that memory. Yes, shmem is dangerous.
>
> Shared mappings behave like always having MAP_NORESERVE, thus the "return
> true;"
>
> It's interesting that you also raise this point: I also want to propose
> dynamic reservation of swap space for shmem in the future. Instead of being
> per process, this would have to be per file and similar allow to coordinate
> with the kernel how much memory we are actually intending to use (->commit)
> such that the kernel can properly account and reject if it wouldn't be
> possible. If only a single day would have more than 24 hours :)
Thanks, all you said looks sane.
I can't say I fully understand what you'd like to propose. To me, a sane user
of /dev/shm should really fallocate() on that before using, but I could also be
wrong.
Another trivial suggestion is you can also add these function pointers into the
comment of the helper, e.g., accountable_mapping() as you referenced. So it
would be easier for people to understand where these code came from.
--
Peter Xu
>> I want to warn the user right away that the configuration is messed up and
>> that "reserved=off" is not effective.
>>
>> For anonymous memory, "reserved=off" will start really being useful when
>> having a way to dynamically reserve swap space.
>>
>>>
>>> I think it's fine to have that early failure, it just seems less helpful than
>>> what glibc was doing which shrinks active memory for real, meanwhile there
>>> seems to encode some very detailed OS information into this helper, so just
>>> less charming.
>>
>> It's not nice, but the messed-up Linux implementation is to blame. Just read
>> the Linux man page of the mmap where there is an explicit link to
>> "/proc/sys/vm/overcommit_memory" for this very reason.
>>
>>>
>>> Btw above [1] "fd != -1 && shared" looks weird to me.
>>
>> As we never have shared anonymous memory (and that's good, because it is
>> super weird) in QEMU, we can simplify to
>>
>> "if (shared) { return true; }"
>
> Yes this looks easier to read. Maybe you can assert the fd in the block too if
> you are pretty sure of it.
Yes, and I'll also add more comments to clarify.
[...]
>> It's interesting that you also raise this point: I also want to propose
>> dynamic reservation of swap space for shmem in the future. Instead of being
>> per process, this would have to be per file and similar allow to coordinate
>> with the kernel how much memory we are actually intending to use (->commit)
>> such that the kernel can properly account and reject if it wouldn't be
>> possible. If only a single day would have more than 24 hours :)
>
> Thanks, all you said looks sane.
>
> I can't say I fully understand what you'd like to propose. To me, a sane user
> of /dev/shm should really fallocate() on that before using, but I could also be
> wrong.
Even then, no swap space gets reserved / memory gets accounted; as
accounting is a bit expensive we don't want to do that on fault/discard
time. It's still a bit of a puzzle.
Also, fallocate() gets pretty useless once you bring memory overcommit /
virtio-balloon into the picture :(
>
> Another trivial suggestion is you can also add these function pointers into the
> comment of the helper, e.g., accountable_mapping() as you referenced. So it
> would be easier for people to understand where these code came from.
Yes, I'll try adding more comments.
--
Thanks,
David / dhildenb
© 2016 - 2026 Red Hat, Inc.