1 | Currently, passing mem-lock=on to QEMU causes memory usage to grow by | 1 | Currently, passing mem-lock=on to QEMU causes memory usage to grow by |
---|---|---|---|
2 | huge amounts: | 2 | huge amounts: |
3 | 3 | ||
4 | no memlock: | 4 | no memlock: |
5 | $ qemu-system-x86_64 -overcommit mem-lock=off | 5 | $ ./qemu-system-x86_64 -overcommit mem-lock=off |
6 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= | 6 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= |
7 | 45652 | 7 | 45652 |
8 | 8 | ||
9 | $ ./qemu-system-x86_64 -overcommit mem-lock=off -enable-kvm | 9 | $ ./qemu-system-x86_64 -overcommit mem-lock=off -enable-kvm |
10 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= | 10 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= |
11 | 39756 | 11 | 39756 |
12 | 12 | ||
13 | memlock: | 13 | memlock: |
14 | $ qemu-system-x86_64 -overcommit mem-lock=on | 14 | $ ./qemu-system-x86_64 -overcommit mem-lock=on |
15 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= | 15 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= |
16 | 1309876 | 16 | 1309876 |
17 | 17 | ||
18 | $ ./qemu-system-x86_64 -overcommit mem-lock=on -enable-kvm | 18 | $ ./qemu-system-x86_64 -overcommit mem-lock=on -enable-kvm |
19 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= | 19 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= |
... | ... | ||
30 | active. | 30 | active. |
31 | 31 | ||
32 | mem-lock=on helps against this (given compact_unevictable_allowed is 0), | 32 | mem-lock=on helps against this (given compact_unevictable_allowed is 0), |
33 | but the memory overhead it introduces is an undesirable side effect, | 33 | but the memory overhead it introduces is an undesirable side effect, |
34 | which we can completely avoid by passing MCL_ONFAULT to mlockall, which | 34 | which we can completely avoid by passing MCL_ONFAULT to mlockall, which |
35 | is what this series makes possible with a new command line option called | 35 | is what this series makes possible with a new option for mem-lock called |
36 | mem-lock-onfault. | 36 | on-fault. |
37 | 37 | ||
38 | memlock-onfault: | 38 | memlock=on-fault: |
39 | $ qemu-system-x86_64 -overcommit mem-lock-onfault=on | 39 | $ ./qemu-system-x86_64 -overcommit mem-lock=on-fault |
40 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= | 40 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= |
41 | 54004 | 41 | 54004 |
42 | 42 | ||
43 | $ ./qemu-system-x86_64 -overcommit mem-lock-onfault=on -enable-kvm | 43 | $ ./qemu-system-x86_64 -overcommit mem-lock=on-fault -enable-kvm |
44 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= | 44 | $ ps -p $(pidof ./qemu-system-x86_64) -o rss= |
45 | 47772 | 45 | 47772 |
46 | 46 | ||
47 | You may notice the memory usage is still slightly higher, in this case | 47 | You may notice the memory usage is still slightly higher, in this case |
48 | by a few megabytes over the mem-lock=off case. I was able to trace this | 48 | by a few megabytes over the mem-lock=off case. I was able to trace this |
49 | down to a bug in the Linux kernel with MCL_ONFAULT not being honored for | 49 | down to a bug in the Linux kernel with MCL_ONFAULT not being honored for |
50 | the early process heap (with brk(2) etc.) so it is still write-faulted in | 50 | the early process heap (with brk(2) etc.) so it is still write-faulted in |
51 | this case, but it's still way less than it was with plain mem-lock=on. | 51 | this case, but it's still way less than it was with plain mem-lock=on. |
52 | 52 | ||
53 | Changes since v1: | ||
54 | - Don't make a separate mem-lock-onfault, add an on-fault option to mem-lock instead | ||
55 | |||
53 | Daniil Tatianin (2): | 56 | Daniil Tatianin (2): |
54 | os: add an ability to lock memory on_fault | 57 | os: add an ability to lock memory on_fault |
55 | overcommit: introduce mem-lock-onfault | 58 | overcommit: introduce mem-lock=on-fault |
56 | 59 | ||
57 | include/sysemu/os-posix.h | 2 +- | 60 | include/sysemu/os-posix.h | 2 +- |
58 | include/sysemu/os-win32.h | 3 ++- | 61 | include/sysemu/os-win32.h | 3 ++- |
59 | include/sysemu/sysemu.h | 1 + | 62 | include/sysemu/sysemu.h | 1 + |
60 | migration/postcopy-ram.c | 4 ++-- | 63 | migration/postcopy-ram.c | 4 ++-- |
61 | os-posix.c | 10 ++++++++-- | 64 | os-posix.c | 10 +++++++-- |
62 | qemu-options.hx | 13 ++++++++++--- | 65 | qemu-options.hx | 14 +++++++----- |
63 | system/globals.c | 1 + | 66 | system/globals.c | 1 + |
64 | system/vl.c | 18 ++++++++++++++++-- | 67 | system/vl.c | 46 +++++++++++++++++++++++++++++++-------- |
65 | 8 files changed, 41 insertions(+), 11 deletions(-) | 68 | 8 files changed, 61 insertions(+), 20 deletions(-) |
66 | 69 | ||
67 | -- | 70 | -- |
68 | 2.34.1 | 71 | 2.34.1 | diff view generated by jsdifflib |
1 | This will be used in the following commits to make it possible to only | 1 | This will be used in the following commits to make it possible to only |
---|---|---|---|
2 | lock memory on fault instead of right away. | 2 | lock memory on fault instead of right away. |
3 | 3 | ||
4 | Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru> | 4 | Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru> |
5 | --- | 5 | --- |
6 | include/sysemu/os-posix.h | 2 +- | 6 | include/sysemu/os-posix.h | 2 +- |
7 | include/sysemu/os-win32.h | 3 ++- | 7 | include/sysemu/os-win32.h | 3 ++- |
8 | migration/postcopy-ram.c | 2 +- | 8 | migration/postcopy-ram.c | 2 +- |
9 | os-posix.c | 10 ++++++++-- | 9 | os-posix.c | 10 ++++++++-- |
10 | system/vl.c | 2 +- | 10 | system/vl.c | 2 +- |
11 | 5 files changed, 13 insertions(+), 6 deletions(-) | 11 | 5 files changed, 13 insertions(+), 6 deletions(-) |
12 | 12 | ||
13 | diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h | 13 | diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h |
14 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/include/sysemu/os-posix.h | 15 | --- a/include/sysemu/os-posix.h |
16 | +++ b/include/sysemu/os-posix.h | 16 | +++ b/include/sysemu/os-posix.h |
17 | @@ -XXX,XX +XXX,XX @@ bool os_set_runas(const char *user_id); | 17 | @@ -XXX,XX +XXX,XX @@ bool os_set_runas(const char *user_id); |
18 | void os_set_chroot(const char *path); | 18 | void os_set_chroot(const char *path); |
19 | void os_setup_limits(void); | 19 | void os_setup_limits(void); |
20 | void os_setup_post(void); | 20 | void os_setup_post(void); |
21 | -int os_mlock(void); | 21 | -int os_mlock(void); |
22 | +int os_mlock(bool on_fault); | 22 | +int os_mlock(bool on_fault); |
23 | 23 | ||
24 | /** | 24 | /** |
25 | * qemu_alloc_stack: | 25 | * qemu_alloc_stack: |
26 | diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h | 26 | diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h |
27 | index XXXXXXX..XXXXXXX 100644 | 27 | index XXXXXXX..XXXXXXX 100644 |
28 | --- a/include/sysemu/os-win32.h | 28 | --- a/include/sysemu/os-win32.h |
29 | +++ b/include/sysemu/os-win32.h | 29 | +++ b/include/sysemu/os-win32.h |
30 | @@ -XXX,XX +XXX,XX @@ static inline bool is_daemonized(void) | 30 | @@ -XXX,XX +XXX,XX @@ static inline bool is_daemonized(void) |
31 | return false; | 31 | return false; |
32 | } | 32 | } |
33 | 33 | ||
34 | -static inline int os_mlock(void) | 34 | -static inline int os_mlock(void) |
35 | +static inline int os_mlock(bool on_fault) | 35 | +static inline int os_mlock(bool on_fault) |
36 | { | 36 | { |
37 | + (void)on_fault; | 37 | + (void)on_fault; |
38 | return -ENOSYS; | 38 | return -ENOSYS; |
39 | } | 39 | } |
40 | 40 | ||
41 | diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c | 41 | diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c |
42 | index XXXXXXX..XXXXXXX 100644 | 42 | index XXXXXXX..XXXXXXX 100644 |
43 | --- a/migration/postcopy-ram.c | 43 | --- a/migration/postcopy-ram.c |
44 | +++ b/migration/postcopy-ram.c | 44 | +++ b/migration/postcopy-ram.c |
45 | @@ -XXX,XX +XXX,XX @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) | 45 | @@ -XXX,XX +XXX,XX @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) |
46 | } | 46 | } |
47 | 47 | ||
48 | if (enable_mlock) { | 48 | if (enable_mlock) { |
49 | - if (os_mlock() < 0) { | 49 | - if (os_mlock() < 0) { |
50 | + if (os_mlock(false) < 0) { | 50 | + if (os_mlock(false) < 0) { |
51 | error_report("mlock: %s", strerror(errno)); | 51 | error_report("mlock: %s", strerror(errno)); |
52 | /* | 52 | /* |
53 | * It doesn't feel right to fail at this point, we have a valid | 53 | * It doesn't feel right to fail at this point, we have a valid |
54 | diff --git a/os-posix.c b/os-posix.c | 54 | diff --git a/os-posix.c b/os-posix.c |
55 | index XXXXXXX..XXXXXXX 100644 | 55 | index XXXXXXX..XXXXXXX 100644 |
56 | --- a/os-posix.c | 56 | --- a/os-posix.c |
57 | +++ b/os-posix.c | 57 | +++ b/os-posix.c |
58 | @@ -XXX,XX +XXX,XX @@ void os_set_line_buffering(void) | 58 | @@ -XXX,XX +XXX,XX @@ void os_set_line_buffering(void) |
59 | setvbuf(stdout, NULL, _IOLBF, 0); | 59 | setvbuf(stdout, NULL, _IOLBF, 0); |
60 | } | 60 | } |
61 | 61 | ||
62 | -int os_mlock(void) | 62 | -int os_mlock(void) |
63 | +int os_mlock(bool on_fault) | 63 | +int os_mlock(bool on_fault) |
64 | { | 64 | { |
65 | #ifdef HAVE_MLOCKALL | 65 | #ifdef HAVE_MLOCKALL |
66 | int ret = 0; | 66 | int ret = 0; |
67 | + int flags = MCL_CURRENT | MCL_FUTURE; | 67 | + int flags = MCL_CURRENT | MCL_FUTURE; |
68 | 68 | ||
69 | - ret = mlockall(MCL_CURRENT | MCL_FUTURE); | 69 | - ret = mlockall(MCL_CURRENT | MCL_FUTURE); |
70 | + if (on_fault) { | 70 | + if (on_fault) { |
71 | + flags |= MCL_ONFAULT; | 71 | + flags |= MCL_ONFAULT; |
72 | + } | 72 | + } |
73 | + | 73 | + |
74 | + ret = mlockall(flags); | 74 | + ret = mlockall(flags); |
75 | if (ret < 0) { | 75 | if (ret < 0) { |
76 | error_report("mlockall: %s", strerror(errno)); | 76 | error_report("mlockall: %s", strerror(errno)); |
77 | } | 77 | } |
78 | 78 | ||
79 | return ret; | 79 | return ret; |
80 | #else | 80 | #else |
81 | + (void)on_fault; | 81 | + (void)on_fault; |
82 | return -ENOSYS; | 82 | return -ENOSYS; |
83 | #endif | 83 | #endif |
84 | } | 84 | } |
85 | diff --git a/system/vl.c b/system/vl.c | 85 | diff --git a/system/vl.c b/system/vl.c |
86 | index XXXXXXX..XXXXXXX 100644 | 86 | index XXXXXXX..XXXXXXX 100644 |
87 | --- a/system/vl.c | 87 | --- a/system/vl.c |
88 | +++ b/system/vl.c | 88 | +++ b/system/vl.c |
89 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList qemu_run_with_opts = { | 89 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList qemu_run_with_opts = { |
90 | static void realtime_init(void) | 90 | static void realtime_init(void) |
91 | { | 91 | { |
92 | if (enable_mlock) { | 92 | if (enable_mlock) { |
93 | - if (os_mlock() < 0) { | 93 | - if (os_mlock() < 0) { |
94 | + if (os_mlock(false) < 0) { | 94 | + if (os_mlock(false) < 0) { |
95 | error_report("locking memory failed"); | 95 | error_report("locking memory failed"); |
96 | exit(1); | 96 | exit(1); |
97 | } | 97 | } |
98 | -- | 98 | -- |
99 | 2.34.1 | 99 | 2.34.1 | diff view generated by jsdifflib |
... | ... | ||
---|---|---|---|
7 | 7 | ||
8 | Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru> | 8 | Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru> |
9 | --- | 9 | --- |
10 | include/sysemu/sysemu.h | 1 + | 10 | include/sysemu/sysemu.h | 1 + |
11 | migration/postcopy-ram.c | 4 ++-- | 11 | migration/postcopy-ram.c | 4 ++-- |
12 | qemu-options.hx | 13 ++++++++++--- | 12 | qemu-options.hx | 14 +++++++----- |
13 | system/globals.c | 1 + | 13 | system/globals.c | 1 + |
14 | system/vl.c | 18 ++++++++++++++++-- | 14 | system/vl.c | 46 ++++++++++++++++++++++++++++++++-------- |
15 | 5 files changed, 30 insertions(+), 7 deletions(-) | 15 | 5 files changed, 50 insertions(+), 16 deletions(-) |
16 | 16 | ||
17 | diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h | 17 | diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h |
18 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/include/sysemu/sysemu.h | 19 | --- a/include/sysemu/sysemu.h |
20 | +++ b/include/sysemu/sysemu.h | 20 | +++ b/include/sysemu/sysemu.h |
... | ... | ||
48 | @@ -XXX,XX +XXX,XX @@ SRST | 48 | @@ -XXX,XX +XXX,XX @@ SRST |
49 | ERST | 49 | ERST |
50 | 50 | ||
51 | DEF("overcommit", HAS_ARG, QEMU_OPTION_overcommit, | 51 | DEF("overcommit", HAS_ARG, QEMU_OPTION_overcommit, |
52 | - "-overcommit [mem-lock=on|off][cpu-pm=on|off]\n" | 52 | - "-overcommit [mem-lock=on|off][cpu-pm=on|off]\n" |
53 | + "-overcommit [mem-lock=on|off][mem-lock-onfault=on|off][cpu-pm=on|off]\n" | 53 | + "-overcommit [mem-lock=on|off|on-fault][cpu-pm=on|off]\n" |
54 | " run qemu with overcommit hints\n" | 54 | " run qemu with overcommit hints\n" |
55 | " mem-lock=on|off controls memory lock support (default: off)\n" | 55 | - " mem-lock=on|off controls memory lock support (default: off)\n" |
56 | + " mem-lock-onfault=on|off controls memory lock on fault support (default: off)\n" | 56 | + " mem-lock=on|off|on-fault controls memory lock support (default: off)\n" |
57 | " cpu-pm=on|off controls cpu power management (default: off)\n", | 57 | " cpu-pm=on|off controls cpu power management (default: off)\n", |
58 | QEMU_ARCH_ALL) | 58 | QEMU_ARCH_ALL) |
59 | SRST | 59 | SRST |
60 | ``-overcommit mem-lock=on|off`` | 60 | -``-overcommit mem-lock=on|off`` |
61 | +``-overcommit mem-lock=on|off|on-fault`` | ||
61 | \ | 62 | \ |
62 | +``-overcommit mem-lock-onfault=on|off`` | ||
63 | + \ | ||
64 | ``-overcommit cpu-pm=on|off`` | 63 | ``-overcommit cpu-pm=on|off`` |
65 | Run qemu with hints about host resource overcommit. The default is | 64 | Run qemu with hints about host resource overcommit. The default is |
66 | to assume that host overcommits all resources. | 65 | to assume that host overcommits all resources. |
67 | 66 | ||
68 | Locking qemu and guest memory can be enabled via ``mem-lock=on`` | 67 | Locking qemu and guest memory can be enabled via ``mem-lock=on`` |
69 | - (disabled by default). This works when host memory is not | 68 | - (disabled by default). This works when host memory is not |
70 | - overcommitted and reduces the worst-case latency for guest. | 69 | - overcommitted and reduces the worst-case latency for guest. |
71 | + or ``mem-lock-onfault=on`` (disabled by default). This works when | 70 | + or ``mem-lock=on-fault`` (disabled by default). This works when |
72 | + host memory is not overcommitted and reduces the worst-case latency for | 71 | + host memory is not overcommitted and reduces the worst-case latency for |
73 | + guest. The on-fault option is better for reducing the memory footprint | 72 | + guest. The on-fault option is better for reducing the memory footprint |
74 | + since it makes allocations lazy, but the pages still get locked in place | 73 | + since it makes allocations lazy, but the pages still get locked in place |
75 | + once faulted by the guest or QEMU. Note that the two options are mutually | 74 | + once faulted by the guest or QEMU. Note that the two options are mutually |
76 | + exclusive. | 75 | + exclusive. |
... | ... | ||
92 | diff --git a/system/vl.c b/system/vl.c | 91 | diff --git a/system/vl.c b/system/vl.c |
93 | index XXXXXXX..XXXXXXX 100644 | 92 | index XXXXXXX..XXXXXXX 100644 |
94 | --- a/system/vl.c | 93 | --- a/system/vl.c |
95 | +++ b/system/vl.c | 94 | +++ b/system/vl.c |
96 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList qemu_overcommit_opts = { | 95 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList qemu_overcommit_opts = { |
96 | .desc = { | ||
97 | { | ||
97 | .name = "mem-lock", | 98 | .name = "mem-lock", |
98 | .type = QEMU_OPT_BOOL, | 99 | - .type = QEMU_OPT_BOOL, |
100 | + .type = QEMU_OPT_STRING, | ||
99 | }, | 101 | }, |
100 | + { | ||
101 | + .name = "mem-lock-onfault", | ||
102 | + .type = QEMU_OPT_BOOL, | ||
103 | + }, | ||
104 | { | 102 | { |
105 | .name = "cpu-pm", | 103 | .name = "cpu-pm", |
106 | .type = QEMU_OPT_BOOL, | ||
107 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList qemu_run_with_opts = { | 104 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList qemu_run_with_opts = { |
108 | 105 | ||
109 | static void realtime_init(void) | 106 | static void realtime_init(void) |
110 | { | 107 | { |
111 | - if (enable_mlock) { | 108 | - if (enable_mlock) { |
... | ... | ||
114 | + if (os_mlock(enable_mlock_onfault) < 0) { | 111 | + if (os_mlock(enable_mlock_onfault) < 0) { |
115 | error_report("locking memory failed"); | 112 | error_report("locking memory failed"); |
116 | exit(1); | 113 | exit(1); |
117 | } | 114 | } |
118 | @@ -XXX,XX +XXX,XX @@ void qemu_init(int argc, char **argv) | 115 | @@ -XXX,XX +XXX,XX @@ void qemu_init(int argc, char **argv) |
119 | if (!opts) { | 116 | object_option_parse(optarg); |
117 | break; | ||
118 | case QEMU_OPTION_overcommit: | ||
119 | - opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"), | ||
120 | - optarg, false); | ||
121 | - if (!opts) { | ||
122 | + { | ||
123 | + const char *mem_lock_opt; | ||
124 | + | ||
125 | + opts = qemu_opts_parse_noisily(qemu_find_opts("overcommit"), | ||
126 | + optarg, false); | ||
127 | + if (!opts) { | ||
128 | + exit(1); | ||
129 | + } | ||
130 | + | ||
131 | + enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", enable_cpu_pm); | ||
132 | + | ||
133 | + mem_lock_opt = qemu_opt_get(opts, "mem-lock"); | ||
134 | + if (!mem_lock_opt) { | ||
135 | + break; | ||
136 | + } | ||
137 | + | ||
138 | + if (strcmp(mem_lock_opt, "on") == 0) { | ||
139 | + enable_mlock = true; | ||
140 | + break; | ||
141 | + } | ||
142 | + | ||
143 | + if (strcmp(mem_lock_opt, "off") == 0) { | ||
144 | + enable_mlock = false; | ||
145 | + enable_mlock_onfault = false; | ||
146 | + break; | ||
147 | + } | ||
148 | + | ||
149 | + if (strcmp(mem_lock_opt, "on-fault") == 0) { | ||
150 | + enable_mlock_onfault = true; | ||
151 | + break; | ||
152 | + } | ||
153 | + | ||
154 | + error_report("parameter 'mem-lock' expects one of " | ||
155 | + "'on', 'off', 'on-fault'"); | ||
120 | exit(1); | 156 | exit(1); |
121 | } | 157 | } |
122 | + | 158 | - enable_mlock = qemu_opt_get_bool(opts, "mem-lock", enable_mlock); |
123 | enable_mlock = qemu_opt_get_bool(opts, "mem-lock", enable_mlock); | 159 | - enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", enable_cpu_pm); |
124 | + enable_mlock_onfault = qemu_opt_get_bool(opts, | 160 | - break; |
125 | + "mem-lock-onfault", | ||
126 | + enable_mlock_onfault); | ||
127 | + if (enable_mlock && enable_mlock_onfault) { | ||
128 | + error_report("mem-lock and mem-lock-onfault are mutually" | ||
129 | + "exclusive"); | ||
130 | + exit(1); | ||
131 | + } | ||
132 | + | ||
133 | enable_cpu_pm = qemu_opt_get_bool(opts, "cpu-pm", enable_cpu_pm); | ||
134 | break; | ||
135 | case QEMU_OPTION_compat: | 161 | case QEMU_OPTION_compat: |
162 | { | ||
163 | CompatPolicy *opts_policy; | ||
136 | -- | 164 | -- |
137 | 2.34.1 | 165 | 2.34.1 | diff view generated by jsdifflib |