[PATCH v2] linux-user: Add partial support for MADV_DONTNEED

Ilya Leoshkevich posted 1 patch 1 year, 10 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20220621144205.158452-1-iii@linux.ibm.com
Maintainers: Laurent Vivier <laurent@vivier.eu>
linux-user/mmap.c           | 64 +++++++++++++++++++++++++++++++++++++
linux-user/syscall.c        |  8 ++---
linux-user/user-internals.h |  1 +
linux-user/user-mmap.h      |  1 +
4 files changed, 68 insertions(+), 6 deletions(-)
[PATCH v2] linux-user: Add partial support for MADV_DONTNEED
Posted by Ilya Leoshkevich 1 year, 10 months ago
Currently QEMU ignores madvise(MADV_DONTNEED), which break apps that
rely on this for zeroing out memory [1]. Improve the situation by doing
a passthrough when the range in question is a host-page-aligned
anonymous mapping.

This is based on the patches from Simon Hausmann [2] and Chris Fallin
[3]. The structure is taken from Simon's patch. The PAGE_MAP_ANONYMOUS
bits are superseded by commit 26bab757d41b ("linux-user: Introduce
PAGE_ANON"). In the end the patch acts like the one from Chris: we
either pass-through the entire syscall, or do nothing, since doing this
only partially would not help the affected applications much. Finally,
add some extra checks to match the behavior of the Linux kernel [4].

[1] https://gitlab.com/qemu-project/qemu/-/issues/326
[2] https://patchew.org/QEMU/20180827084037.25316-1-simon.hausmann@qt.io/
[3] https://github.com/bytecodealliance/wasmtime/blob/v0.37.0/ci/qemu-madvise.patch
[4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/madvise.c?h=v5.19-rc3#n1368

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Laurent Vivier <laurent@vivier.eu>
---

v1: https://lists.gnu.org/archive/html/qemu-devel/2022-06/msg03572.html
v1 -> v2:
* Make get_errno() extern.
* Simplify errno handling (Laurent).

 linux-user/mmap.c           | 64 +++++++++++++++++++++++++++++++++++++
 linux-user/syscall.c        |  8 ++---
 linux-user/user-internals.h |  1 +
 linux-user/user-mmap.h      |  1 +
 4 files changed, 68 insertions(+), 6 deletions(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index 48e1373796..4e7a6be6ee 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -835,3 +835,67 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
     mmap_unlock();
     return new_addr;
 }
+
+static bool can_passthrough_madv_dontneed(abi_ulong start, abi_ulong end)
+{
+    ulong addr;
+
+    if ((start | end) & ~qemu_host_page_mask) {
+        return false;
+    }
+
+    for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
+        if (!(page_get_flags(addr) & PAGE_ANON)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice)
+{
+    abi_ulong len, end;
+    int ret = 0;
+
+    if (start & ~TARGET_PAGE_MASK) {
+        return -TARGET_EINVAL;
+    }
+    len = TARGET_PAGE_ALIGN(len_in);
+
+    if (len_in && !len) {
+        return -TARGET_EINVAL;
+    }
+
+    end = start + len;
+    if (end < start) {
+        return -TARGET_EINVAL;
+    }
+
+    if (end == start) {
+        return 0;
+    }
+
+    if (!guest_range_valid_untagged(start, len)) {
+        return -TARGET_EINVAL;
+    }
+
+    /*
+     * A straight passthrough may not be safe because qemu sometimes turns
+     * private file-backed mappings into anonymous mappings.
+     *
+     * This is a hint, so ignoring and returning success is ok.
+     *
+     * This breaks MADV_DONTNEED, completely implementing which is quite
+     * complicated. However, there is one low-hanging fruit: host-page-aligned
+     * anonymous mappings. In this case passthrough is safe, so do it.
+     */
+    mmap_lock();
+    if ((advice & MADV_DONTNEED) &&
+        can_passthrough_madv_dontneed(start, end)) {
+        ret = get_errno(madvise(g2h_untagged(start), len, MADV_DONTNEED));
+    }
+    mmap_unlock();
+
+    return ret;
+}
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index f55cdebee5..8f68f255c0 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -538,7 +538,7 @@ static inline int target_to_host_errno(int target_errno)
     }
 }
 
-static inline abi_long get_errno(abi_long ret)
+abi_long get_errno(abi_long ret)
 {
     if (ret == -1)
         return -host_to_target_errno(errno);
@@ -11807,11 +11807,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
 
 #ifdef TARGET_NR_madvise
     case TARGET_NR_madvise:
-        /* A straight passthrough may not be safe because qemu sometimes
-           turns private file-backed mappings into anonymous mappings.
-           This will break MADV_DONTNEED.
-           This is a hint, so ignoring and returning success is ok.  */
-        return 0;
+        return target_madvise(arg1, arg2, arg3);
 #endif
 #ifdef TARGET_NR_fcntl64
     case TARGET_NR_fcntl64:
diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h
index 6175ce53db..0280e76add 100644
--- a/linux-user/user-internals.h
+++ b/linux-user/user-internals.h
@@ -65,6 +65,7 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
                     abi_long arg8);
 extern __thread CPUState *thread_cpu;
 G_NORETURN void cpu_loop(CPUArchState *env);
+abi_long get_errno(abi_long ret);
 const char *target_strerror(int err);
 int get_osversion(void);
 void init_qemu_uname_release(void);
diff --git a/linux-user/user-mmap.h b/linux-user/user-mmap.h
index d1dec99c02..480ce1c114 100644
--- a/linux-user/user-mmap.h
+++ b/linux-user/user-mmap.h
@@ -25,6 +25,7 @@ int target_munmap(abi_ulong start, abi_ulong len);
 abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                        abi_ulong new_size, unsigned long flags,
                        abi_ulong new_addr);
+abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice);
 extern unsigned long last_brk;
 extern abi_ulong mmap_next_start;
 abi_ulong mmap_find_vma(abi_ulong, abi_ulong, abi_ulong);
-- 
2.35.3
Re: [PATCH v2] linux-user: Add partial support for MADV_DONTNEED
Posted by Laurent Vivier 1 year, 10 months ago
Le 21/06/2022 à 16:42, Ilya Leoshkevich a écrit :
> Currently QEMU ignores madvise(MADV_DONTNEED), which break apps that
> rely on this for zeroing out memory [1]. Improve the situation by doing
> a passthrough when the range in question is a host-page-aligned
> anonymous mapping.
> 
> This is based on the patches from Simon Hausmann [2] and Chris Fallin
> [3]. The structure is taken from Simon's patch. The PAGE_MAP_ANONYMOUS
> bits are superseded by commit 26bab757d41b ("linux-user: Introduce
> PAGE_ANON"). In the end the patch acts like the one from Chris: we
> either pass-through the entire syscall, or do nothing, since doing this
> only partially would not help the affected applications much. Finally,
> add some extra checks to match the behavior of the Linux kernel [4].
> 
> [1] https://gitlab.com/qemu-project/qemu/-/issues/326
> [2] https://patchew.org/QEMU/20180827084037.25316-1-simon.hausmann@qt.io/
> [3] https://github.com/bytecodealliance/wasmtime/blob/v0.37.0/ci/qemu-madvise.patch
> [4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/madvise.c?h=v5.19-rc3#n1368
> 
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> Reviewed-by: Laurent Vivier <laurent@vivier.eu>
> ---
> 
> v1: https://lists.gnu.org/archive/html/qemu-devel/2022-06/msg03572.html
> v1 -> v2:
> * Make get_errno() extern.
> * Simplify errno handling (Laurent).
> 
>   linux-user/mmap.c           | 64 +++++++++++++++++++++++++++++++++++++
>   linux-user/syscall.c        |  8 ++---
>   linux-user/user-internals.h |  1 +
>   linux-user/user-mmap.h      |  1 +
>   4 files changed, 68 insertions(+), 6 deletions(-)
> 
> diff --git a/linux-user/mmap.c b/linux-user/mmap.c
> index 48e1373796..4e7a6be6ee 100644
> --- a/linux-user/mmap.c
> +++ b/linux-user/mmap.c
> @@ -835,3 +835,67 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
>       mmap_unlock();
>       return new_addr;
>   }
> +
> +static bool can_passthrough_madv_dontneed(abi_ulong start, abi_ulong end)
> +{
> +    ulong addr;
> +
> +    if ((start | end) & ~qemu_host_page_mask) {
> +        return false;
> +    }
> +
> +    for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
> +        if (!(page_get_flags(addr) & PAGE_ANON)) {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice)
> +{
> +    abi_ulong len, end;
> +    int ret = 0;
> +
> +    if (start & ~TARGET_PAGE_MASK) {
> +        return -TARGET_EINVAL;
> +    }
> +    len = TARGET_PAGE_ALIGN(len_in);
> +
> +    if (len_in && !len) {
> +        return -TARGET_EINVAL;
> +    }
> +
> +    end = start + len;
> +    if (end < start) {
> +        return -TARGET_EINVAL;
> +    }
> +
> +    if (end == start) {
> +        return 0;
> +    }
> +
> +    if (!guest_range_valid_untagged(start, len)) {
> +        return -TARGET_EINVAL;
> +    }
> +
> +    /*
> +     * A straight passthrough may not be safe because qemu sometimes turns
> +     * private file-backed mappings into anonymous mappings.
> +     *
> +     * This is a hint, so ignoring and returning success is ok.
> +     *
> +     * This breaks MADV_DONTNEED, completely implementing which is quite
> +     * complicated. However, there is one low-hanging fruit: host-page-aligned
> +     * anonymous mappings. In this case passthrough is safe, so do it.
> +     */
> +    mmap_lock();
> +    if ((advice & MADV_DONTNEED) &&
> +        can_passthrough_madv_dontneed(start, end)) {
> +        ret = get_errno(madvise(g2h_untagged(start), len, MADV_DONTNEED));
> +    }
> +    mmap_unlock();
> +
> +    return ret;
> +}
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index f55cdebee5..8f68f255c0 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -538,7 +538,7 @@ static inline int target_to_host_errno(int target_errno)
>       }
>   }
>   
> -static inline abi_long get_errno(abi_long ret)
> +abi_long get_errno(abi_long ret)
>   {
>       if (ret == -1)
>           return -host_to_target_errno(errno);
> @@ -11807,11 +11807,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
>   
>   #ifdef TARGET_NR_madvise
>       case TARGET_NR_madvise:
> -        /* A straight passthrough may not be safe because qemu sometimes
> -           turns private file-backed mappings into anonymous mappings.
> -           This will break MADV_DONTNEED.
> -           This is a hint, so ignoring and returning success is ok.  */
> -        return 0;
> +        return target_madvise(arg1, arg2, arg3);
>   #endif
>   #ifdef TARGET_NR_fcntl64
>       case TARGET_NR_fcntl64:
> diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h
> index 6175ce53db..0280e76add 100644
> --- a/linux-user/user-internals.h
> +++ b/linux-user/user-internals.h
> @@ -65,6 +65,7 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
>                       abi_long arg8);
>   extern __thread CPUState *thread_cpu;
>   G_NORETURN void cpu_loop(CPUArchState *env);
> +abi_long get_errno(abi_long ret);
>   const char *target_strerror(int err);
>   int get_osversion(void);
>   void init_qemu_uname_release(void);
> diff --git a/linux-user/user-mmap.h b/linux-user/user-mmap.h
> index d1dec99c02..480ce1c114 100644
> --- a/linux-user/user-mmap.h
> +++ b/linux-user/user-mmap.h
> @@ -25,6 +25,7 @@ int target_munmap(abi_ulong start, abi_ulong len);
>   abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
>                          abi_ulong new_size, unsigned long flags,
>                          abi_ulong new_addr);
> +abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice);
>   extern unsigned long last_brk;
>   extern abi_ulong mmap_next_start;
>   abi_ulong mmap_find_vma(abi_ulong, abi_ulong, abi_ulong);

Applied to my linux-user-for-7.1 branch.

Thanks,
Laurent