If a user wishes to enable KSM mergeability for an entire process and all
fork/exec'd processes that come after it, they use the prctl()
PR_SET_MEMORY_MERGE operation.
This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
(in order to indicate they are KSM mergeable), as well as setting this flag
for all existing VMAs.
However it also entirely and completely breaks VMA merging for the process
and all forked (and fork/exec'd) processes.
This is because when a new mapping is proposed, the flags specified will
never have VM_MERGEABLE set. However all adjacent VMAs will already have
VM_MERGEABLE set, rendering VMAs unmergeable by default.
To work around this, we try to set the VM_MERGEABLE flag prior to
attempting a merge. In the case of brk() this can always be done.
However on mmap() things are more complicated - while KSM is not supported
for file-backed mappings in general, it is supported for MAP_PRIVATE
file-backed mappings.
And these mappings may have deprecated .mmap() callbacks specified which
could, in theory, adjust flags and thus KSM merge eligibility.
So we check to determine whether this is at all possible. If not, we set
VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
previous behaviour.
When .mmap_prepare() is more widely used, we can remove this precaution.
While this doesn't quite cover all cases, it covers a great many (all
anonymous memory, for instance), meaning we should already see a
significant improvement in VMA mergeability.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
include/linux/ksm.h | 4 ++--
mm/ksm.c | 20 ++++++++++++------
mm/vma.c | 49 +++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 63 insertions(+), 10 deletions(-)
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index d73095b5cd96..ba5664daca6e 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -17,8 +17,8 @@
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
-
-void ksm_add_vma(struct vm_area_struct *vma);
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+ vm_flags_t vm_flags);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
int ksm_disable(struct mm_struct *mm);
diff --git a/mm/ksm.c b/mm/ksm.c
index d0c763abd499..022af14a95ea 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2731,16 +2731,24 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
}
/**
- * ksm_add_vma - Mark vma as mergeable if compatible
+ * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
*
- * @vma: Pointer to vma
+ * @mm: Proposed VMA's mm_struct
+ * @file: Proposed VMA's file-backed mapping, if any.
+ * @vm_flags: Proposed VMA's flags.
+ *
+ * Returns: @vm_flags possibly updated to mark mergeable.
*/
-void ksm_add_vma(struct vm_area_struct *vma)
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+ vm_flags_t vm_flags)
{
- struct mm_struct *mm = vma->vm_mm;
+ vm_flags_t ret = vm_flags;
- if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
- __ksm_add_vma(vma);
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+ __ksm_should_add_vma(file, vm_flags))
+ ret |= VM_MERGEABLE;
+
+ return ret;
}
static void ksm_add_vmas(struct mm_struct *mm)
diff --git a/mm/vma.c b/mm/vma.c
index 3ff6cfbe3338..5bebe55ea737 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2482,7 +2482,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
*/
if (!vma_is_anonymous(vma))
khugepaged_enter_vma(vma, map->flags);
- ksm_add_vma(vma);
*vmap = vma;
return 0;
@@ -2585,6 +2584,45 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma,
vma->vm_private_data = map->vm_private_data;
}
+static void update_ksm_flags(struct mmap_state *map)
+{
+ map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
+}
+
+/*
+ * Are we guaranteed no driver can change state such as to preclude KSM merging?
+ * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
+ *
+ * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via
+ * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set.
+ *
+ * If this is not the case, then we set the flag after considering mergeability,
+ * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new
+ * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will,
+ * preventing any merge.
+ */
+static bool can_set_ksm_flags_early(struct mmap_state *map)
+{
+ struct file *file = map->file;
+
+ /* Anonymous mappings have no driver which can change them. */
+ if (!file)
+ return true;
+
+ /* shmem is safe. */
+ if (shmem_file(file))
+ return true;
+
+ /*
+ * If .mmap_prepare() is specified, then the driver will have already
+ * manipulated state prior to updating KSM flags.
+ */
+ if (file->f_op->mmap_prepare)
+ return true;
+
+ return false;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -2595,6 +2633,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ bool check_ksm_early = can_set_ksm_flags_early(&map);
error = __mmap_prepare(&map, uf);
if (!error && have_mmap_prepare)
@@ -2602,6 +2641,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
if (error)
goto abort_munmap;
+ if (check_ksm_early)
+ update_ksm_flags(&map);
+
/* Attempt to merge with adjacent VMAs... */
if (map.prev || map.next) {
VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
@@ -2611,6 +2653,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
/* ...but if we can't, allocate a new VMA. */
if (!vma) {
+ if (!check_ksm_early)
+ update_ksm_flags(&map);
+
error = __mmap_new_vma(&map, &vma);
if (error)
goto unacct_error;
@@ -2713,6 +2758,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Note: This happens *after* clearing old mappings in some code paths.
*/
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ flags = ksm_vma_flags(mm, NULL, flags);
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -2756,7 +2802,6 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
mm->map_count++;
validate_mm(mm);
- ksm_add_vma(vma);
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
--
2.49.0
On 2025/5/19 16:51, Lorenzo Stoakes wrote:
> If a user wishes to enable KSM mergeability for an entire process and all
> fork/exec'd processes that come after it, they use the prctl()
> PR_SET_MEMORY_MERGE operation.
>
> This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
> (in order to indicate they are KSM mergeable), as well as setting this flag
> for all existing VMAs.
>
> However it also entirely and completely breaks VMA merging for the process
> and all forked (and fork/exec'd) processes.
>
> This is because when a new mapping is proposed, the flags specified will
> never have VM_MERGEABLE set. However all adjacent VMAs will already have
> VM_MERGEABLE set, rendering VMAs unmergeable by default.
>
> To work around this, we try to set the VM_MERGEABLE flag prior to
> attempting a merge. In the case of brk() this can always be done.
>
> However on mmap() things are more complicated - while KSM is not supported
> for file-backed mappings, it is supported for MAP_PRIVATE file-backed
> mappings.
>
> And these mappings may have deprecated .mmap() callbacks specified which
> could, in theory, adjust flags and thus KSM merge eligiblity.
>
> So we check to determine whether this at all possible. If not, we set
> VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
> previous behaviour.
>
> When .mmap_prepare() is more widely used, we can remove this precaution.
>
> While this doesn't quite cover all cases, it covers a great many (all
> anonymous memory, for instance), meaning we should already see a
> significant improvement in VMA mergeability.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Looks good to me with the build fix. And it seems that ksm_add_vma()
is not used anymore..
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Thanks!
> ---
> include/linux/ksm.h | 4 ++--
> mm/ksm.c | 20 ++++++++++++------
> mm/vma.c | 49 +++++++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 63 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> index d73095b5cd96..ba5664daca6e 100644
> --- a/include/linux/ksm.h
> +++ b/include/linux/ksm.h
> @@ -17,8 +17,8 @@
> #ifdef CONFIG_KSM
> int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
> unsigned long end, int advice, unsigned long *vm_flags);
> -
> -void ksm_add_vma(struct vm_area_struct *vma);
> +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
> + vm_flags_t vm_flags);
> int ksm_enable_merge_any(struct mm_struct *mm);
> int ksm_disable_merge_any(struct mm_struct *mm);
> int ksm_disable(struct mm_struct *mm);
> diff --git a/mm/ksm.c b/mm/ksm.c
> index d0c763abd499..022af14a95ea 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2731,16 +2731,24 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
> return 0;
> }
> /**
> - * ksm_add_vma - Mark vma as mergeable if compatible
> + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
> *
> - * @vma: Pointer to vma
> + * @mm: Proposed VMA's mm_struct
> + * @file: Proposed VMA's file-backed mapping, if any.
> + * @vm_flags: Proposed VMA"s flags.
> + *
> + * Returns: @vm_flags possibly updated to mark mergeable.
> */
> -void ksm_add_vma(struct vm_area_struct *vma)
> +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
> + vm_flags_t vm_flags)
> {
> - struct mm_struct *mm = vma->vm_mm;
> + vm_flags_t ret = vm_flags;
>
> - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
> - __ksm_add_vma(vma);
> + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
> + __ksm_should_add_vma(file, vm_flags))
> + ret |= VM_MERGEABLE;
> +
> + return ret;
> }
>
> static void ksm_add_vmas(struct mm_struct *mm)
> diff --git a/mm/vma.c b/mm/vma.c
> index 3ff6cfbe3338..5bebe55ea737 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -2482,7 +2482,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
> */
> if (!vma_is_anonymous(vma))
> khugepaged_enter_vma(vma, map->flags);
> - ksm_add_vma(vma);
> *vmap = vma;
> return 0;
>
> @@ -2585,6 +2584,45 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma,
> vma->vm_private_data = map->vm_private_data;
> }
>
> +static void update_ksm_flags(struct mmap_state *map)
> +{
> + map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
> +}
> +
> +/*
> + * Are we guaranteed no driver can change state such as to preclude KSM merging?
> + * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
> + *
> + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via
> + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set.
> + *
> + * If this is not the case, then we set the flag after considering mergeability,
> + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new
> + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will,
> + * preventing any merge.
> + */
> +static bool can_set_ksm_flags_early(struct mmap_state *map)
> +{
> + struct file *file = map->file;
> +
> + /* Anonymous mappings have no driver which can change them. */
> + if (!file)
> + return true;
> +
> + /* shmem is safe. */
> + if (shmem_file(file))
> + return true;
> +
> + /*
> + * If .mmap_prepare() is specified, then the driver will have already
> + * manipulated state prior to updating KSM flags.
> + */
> + if (file->f_op->mmap_prepare)
> + return true;
> +
> + return false;
> +}
> +
> static unsigned long __mmap_region(struct file *file, unsigned long addr,
> unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> struct list_head *uf)
> @@ -2595,6 +2633,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> bool have_mmap_prepare = file && file->f_op->mmap_prepare;
> VMA_ITERATOR(vmi, mm, addr);
> MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
> + bool check_ksm_early = can_set_ksm_flags_early(&map);
>
> error = __mmap_prepare(&map, uf);
> if (!error && have_mmap_prepare)
> @@ -2602,6 +2641,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> if (error)
> goto abort_munmap;
>
> + if (check_ksm_early)
> + update_ksm_flags(&map);
> +
> /* Attempt to merge with adjacent VMAs... */
> if (map.prev || map.next) {
> VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
> @@ -2611,6 +2653,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
>
> /* ...but if we can't, allocate a new VMA. */
> if (!vma) {
> + if (!check_ksm_early)
> + update_ksm_flags(&map);
> +
> error = __mmap_new_vma(&map, &vma);
> if (error)
> goto unacct_error;
> @@ -2713,6 +2758,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> * Note: This happens *after* clearing old mappings in some code paths.
> */
> flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
> + flags = ksm_vma_flags(mm, NULL, flags);
> if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
> return -ENOMEM;
>
> @@ -2756,7 +2802,6 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
>
> mm->map_count++;
> validate_mm(mm);
> - ksm_add_vma(vma);
> out:
> perf_event_mmap(vma);
> mm->total_vm += len >> PAGE_SHIFT;
On Tue, May 20, 2025 at 11:55:20AM +0800, Chengming Zhou wrote:
> On 2025/5/19 16:51, Lorenzo Stoakes wrote:
> > If a user wishes to enable KSM mergeability for an entire process and all
> > fork/exec'd processes that come after it, they use the prctl()
> > PR_SET_MEMORY_MERGE operation.
> >
> > This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
> > (in order to indicate they are KSM mergeable), as well as setting this flag
> > for all existing VMAs.
> >
> > However it also entirely and completely breaks VMA merging for the process
> > and all forked (and fork/exec'd) processes.
> >
> > This is because when a new mapping is proposed, the flags specified will
> > never have VM_MERGEABLE set. However all adjacent VMAs will already have
> > VM_MERGEABLE set, rendering VMAs unmergeable by default.
> >
> > To work around this, we try to set the VM_MERGEABLE flag prior to
> > attempting a merge. In the case of brk() this can always be done.
> >
> > However on mmap() things are more complicated - while KSM is not supported
> > for file-backed mappings, it is supported for MAP_PRIVATE file-backed
> > mappings.
> >
> > And these mappings may have deprecated .mmap() callbacks specified which
> > could, in theory, adjust flags and thus KSM merge eligiblity.
> >
> > So we check to determine whether this at all possible. If not, we set
> > VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
> > previous behaviour.
> >
> > When .mmap_prepare() is more widely used, we can remove this precaution.
> >
> > While this doesn't quite cover all cases, it covers a great many (all
> > anonymous memory, for instance), meaning we should already see a
> > significant improvement in VMA mergeability.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Looks good to me with the build fix. And it seems that ksm_add_vma()
> is not used anymore..
>
> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
>
> Thanks!
Thanks! Yeah in the fix I also drop that, will obviously send a v2 to clear
things up and address comments :)
>
> > ---
> > include/linux/ksm.h | 4 ++--
> > mm/ksm.c | 20 ++++++++++++------
> > mm/vma.c | 49 +++++++++++++++++++++++++++++++++++++++++++--
> > 3 files changed, 63 insertions(+), 10 deletions(-)
> >
> > diff --git a/include/linux/ksm.h b/include/linux/ksm.h
> > index d73095b5cd96..ba5664daca6e 100644
> > --- a/include/linux/ksm.h
> > +++ b/include/linux/ksm.h
> > @@ -17,8 +17,8 @@
> > #ifdef CONFIG_KSM
> > int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
> > unsigned long end, int advice, unsigned long *vm_flags);
> > -
> > -void ksm_add_vma(struct vm_area_struct *vma);
> > +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
> > + vm_flags_t vm_flags);
> > int ksm_enable_merge_any(struct mm_struct *mm);
> > int ksm_disable_merge_any(struct mm_struct *mm);
> > int ksm_disable(struct mm_struct *mm);
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index d0c763abd499..022af14a95ea 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -2731,16 +2731,24 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
> > return 0;
> > }
> > /**
> > - * ksm_add_vma - Mark vma as mergeable if compatible
> > + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
> > *
> > - * @vma: Pointer to vma
> > + * @mm: Proposed VMA's mm_struct
> > + * @file: Proposed VMA's file-backed mapping, if any.
> > + * @vm_flags: Proposed VMA"s flags.
> > + *
> > + * Returns: @vm_flags possibly updated to mark mergeable.
> > */
> > -void ksm_add_vma(struct vm_area_struct *vma)
> > +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
> > + vm_flags_t vm_flags)
> > {
> > - struct mm_struct *mm = vma->vm_mm;
> > + vm_flags_t ret = vm_flags;
> > - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
> > - __ksm_add_vma(vma);
> > + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
> > + __ksm_should_add_vma(file, vm_flags))
> > + ret |= VM_MERGEABLE;
> > +
> > + return ret;
> > }
> > static void ksm_add_vmas(struct mm_struct *mm)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 3ff6cfbe3338..5bebe55ea737 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -2482,7 +2482,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
> > */
> > if (!vma_is_anonymous(vma))
> > khugepaged_enter_vma(vma, map->flags);
> > - ksm_add_vma(vma);
> > *vmap = vma;
> > return 0;
> > @@ -2585,6 +2584,45 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma,
> > vma->vm_private_data = map->vm_private_data;
> > }
> > +static void update_ksm_flags(struct mmap_state *map)
> > +{
> > + map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
> > +}
> > +
> > +/*
> > + * Are we guaranteed no driver can change state such as to preclude KSM merging?
> > + * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
> > + *
> > + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via
> > + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set.
> > + *
> > + * If this is not the case, then we set the flag after considering mergeability,
> > + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new
> > + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will,
> > + * preventing any merge.
> > + */
> > +static bool can_set_ksm_flags_early(struct mmap_state *map)
> > +{
> > + struct file *file = map->file;
> > +
> > + /* Anonymous mappings have no driver which can change them. */
> > + if (!file)
> > + return true;
> > +
> > + /* shmem is safe. */
> > + if (shmem_file(file))
> > + return true;
> > +
> > + /*
> > + * If .mmap_prepare() is specified, then the driver will have already
> > + * manipulated state prior to updating KSM flags.
> > + */
> > + if (file->f_op->mmap_prepare)
> > + return true;
> > +
> > + return false;
> > +}
> > +
> > static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> > struct list_head *uf)
> > @@ -2595,6 +2633,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > bool have_mmap_prepare = file && file->f_op->mmap_prepare;
> > VMA_ITERATOR(vmi, mm, addr);
> > MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
> > + bool check_ksm_early = can_set_ksm_flags_early(&map);
> > error = __mmap_prepare(&map, uf);
> > if (!error && have_mmap_prepare)
> > @@ -2602,6 +2641,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > if (error)
> > goto abort_munmap;
> > + if (check_ksm_early)
> > + update_ksm_flags(&map);
> > +
> > /* Attempt to merge with adjacent VMAs... */
> > if (map.prev || map.next) {
> > VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
> > @@ -2611,6 +2653,9 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > /* ...but if we can't, allocate a new VMA. */
> > if (!vma) {
> > + if (!check_ksm_early)
> > + update_ksm_flags(&map);
> > +
> > error = __mmap_new_vma(&map, &vma);
> > if (error)
> > goto unacct_error;
> > @@ -2713,6 +2758,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > * Note: This happens *after* clearing old mappings in some code paths.
> > */
> > flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
> > + flags = ksm_vma_flags(mm, NULL, flags);
> > if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
> > return -ENOMEM;
> > @@ -2756,7 +2802,6 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > mm->map_count++;
> > validate_mm(mm);
> > - ksm_add_vma(vma);
> > out:
> > perf_event_mmap(vma);
> > mm->total_vm += len >> PAGE_SHIFT;
On 19.05.25 10:51, Lorenzo Stoakes wrote:
> If a user wishes to enable KSM mergeability for an entire process and all
> fork/exec'd processes that come after it, they use the prctl()
> PR_SET_MEMORY_MERGE operation.
>
> This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
> (in order to indicate they are KSM mergeable), as well as setting this flag
> for all existing VMAs.
>
> However it also entirely and completely breaks VMA merging for the process
> and all forked (and fork/exec'd) processes.
>
> This is because when a new mapping is proposed, the flags specified will
> never have VM_MERGEABLE set. However all adjacent VMAs will already have
> VM_MERGEABLE set, rendering VMAs unmergeable by default.
>
> To work around this, we try to set the VM_MERGEABLE flag prior to
> attempting a merge. In the case of brk() this can always be done.
>
> However on mmap() things are more complicated - while KSM is not supported
> for file-backed mappings, it is supported for MAP_PRIVATE file-backed
> mappings.
>
> And these mappings may have deprecated .mmap() callbacks specified which
> could, in theory, adjust flags and thus KSM merge eligiblity.
>
> So we check to determine whether this at all possible. If not, we set
> VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
> previous behaviour.
>
> When .mmap_prepare() is more widely used, we can remove this precaution.
>
> While this doesn't quite cover all cases, it covers a great many (all
> anonymous memory, for instance), meaning we should already see a
> significant improvement in VMA mergeability.
We should add a Fixes: tag.
CCing stable is likely not a good idea at this point (and might be
rather hairy).
[...]
> /**
> - * ksm_add_vma - Mark vma as mergeable if compatible
> + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
> *
> - * @vma: Pointer to vma
> + * @mm: Proposed VMA's mm_struct
> + * @file: Proposed VMA's file-backed mapping, if any.
> + * @vm_flags: Proposed VMA"s flags.
> + *
> + * Returns: @vm_flags possibly updated to mark mergeable.
> */
> -void ksm_add_vma(struct vm_area_struct *vma)
> +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
> + vm_flags_t vm_flags)
> {
> - struct mm_struct *mm = vma->vm_mm;
> + vm_flags_t ret = vm_flags;
>
> - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
> - __ksm_add_vma(vma);
> + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
> + __ksm_should_add_vma(file, vm_flags))
> + ret |= VM_MERGEABLE;
> +
> + return ret;
> }
No need for ret without harming readability.
if ()
vm_flags |= VM_MERGEABLE
return vm_flags;
[...]
> +/*
> + * Are we guaranteed no driver can change state such as to preclude KSM merging?
> + * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
> + *
> + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via
> + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set.
> + *
> + * If this is not the case, then we set the flag after considering mergeability,
> + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new
> + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will,
> + * preventing any merge.
Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get
VM_MERGEABLE set but not be able to merge?
Probably these are not often expected to be merged ...
Preventing merging should really only happen because of VMA flags that
are getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO.
I am not 100% sure why we bail out on special mappings: all we have to
do is reliably identify anon pages, and we should be able to do that.
GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP,
which might need a tweak then (maybe the solution could be to ... not
use GUP but a folio_walk).
So, assuming we could remove the VM_PFNMAP | VM_IO | VM_DONTEXPAND |
VM_MIXEDMAP constraint from vma_ksm_compatible(), could we simplify?
That is: the other ones must not really be updated during mmap(), right?
(in particular: VM_SHARED | VM_MAYSHARE | VM_HUGETLB | VM_DROPPABLE)
Have to double-check VM_SAO and VM_SPARC_ADI.
> + */
> +static bool can_set_ksm_flags_early(struct mmap_state *map)
> +{
> + struct file *file = map->file;
> +
> + /* Anonymous mappings have no driver which can change them. */
> + if (!file)
> + return true;
> +
> + /* shmem is safe. */
> + if (shmem_file(file))
> + return true;
> +
> + /*
> + * If .mmap_prepare() is specified, then the driver will have already
> + * manipulated state prior to updating KSM flags.
> + */
> + if (file->f_op->mmap_prepare)
> + return true;
> +
> + return false;
> +}
So, long-term (mmap_prepare) this function will essentially go away?
Nothing jumped at me, this definitely improves the situation.
--
Cheers,
David / dhildenb
On Mon, May 19, 2025 at 08:00:29PM +0200, David Hildenbrand wrote:
> On 19.05.25 10:51, Lorenzo Stoakes wrote:
> > If a user wishes to enable KSM mergeability for an entire process and all
> > fork/exec'd processes that come after it, they use the prctl()
> > PR_SET_MEMORY_MERGE operation.
> >
> > This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
> > (in order to indicate they are KSM mergeable), as well as setting this flag
> > for all existing VMAs.
> >
> > However it also entirely and completely breaks VMA merging for the process
> > and all forked (and fork/exec'd) processes.
> >
> > This is because when a new mapping is proposed, the flags specified will
> > never have VM_MERGEABLE set. However all adjacent VMAs will already have
> > VM_MERGEABLE set, rendering VMAs unmergeable by default.
> >
> > To work around this, we try to set the VM_MERGEABLE flag prior to
> > attempting a merge. In the case of brk() this can always be done.
> >
> > However on mmap() things are more complicated - while KSM is not supported
> > for file-backed mappings, it is supported for MAP_PRIVATE file-backed
> > mappings.
> >
> > And these mappings may have deprecated .mmap() callbacks specified which
> > could, in theory, adjust flags and thus KSM merge eligiblity.
> >
> > So we check to determine whether this at all possible. If not, we set
> > VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
> > previous behaviour.
> >
> > When .mmap_prepare() is more widely used, we can remove this precaution.
> >
> > While this doesn't quite cover all cases, it covers a great many (all
> > anonymous memory, for instance), meaning we should already see a
> > significant improvement in VMA mergeability.
>
> We should add a Fixes: tag.
>
> CCing stable is likely not a good idea at this point (and might be rather
> hairy).
We should probably underline to Andrew not to add one :>) but sure can add.
A backport would be a total pain yes.
>
> [...]
>
> > /**
> > - * ksm_add_vma - Mark vma as mergeable if compatible
> > + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
> > *
> > - * @vma: Pointer to vma
> > + * @mm: Proposed VMA's mm_struct
> > + * @file: Proposed VMA's file-backed mapping, if any.
> > + * @vm_flags: Proposed VMA"s flags.
> > + *
> > + * Returns: @vm_flags possibly updated to mark mergeable.
> > */
> > -void ksm_add_vma(struct vm_area_struct *vma)
> > +vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
> > + vm_flags_t vm_flags)
> > {
> > - struct mm_struct *mm = vma->vm_mm;
> > + vm_flags_t ret = vm_flags;
> > - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
> > - __ksm_add_vma(vma);
> > + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
> > + __ksm_should_add_vma(file, vm_flags))
> > + ret |= VM_MERGEABLE;
> > +
> > + return ret;
> > }
>
>
> No need for ret without harming readability.
>
> if ()
> vm_flags |= VM_MERGEABLE
> return vm_flags;
Ack this was just me following the 'don't mutate arguments' rule-of-thumb
that obviously we violate constantly in the kernel anyway and probably
never really matters... :>)
But yeah ret is kind of gross here, will change.
>
> [...]
>
> > +/*
> > + * Are we guaranteed no driver can change state such as to preclude KSM merging?
> > + * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
> > + *
> > + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via
> > + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set.
> > + *
> > + * If this is not the case, then we set the flag after considering mergeability,
> > + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new
> > + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will,
> > + * preventing any merge.
>
> Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get
> VM_MERGEABLE set but not be able to merge?
>
> Probably these are not often expected to be merged ...
>
> Preventing merging should really only happen because of VMA flags that are
> getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO.
>
>
> I am not 100% sure why we bail out on special mappings: all we have to do is
> reliably identify anon pages, and we should be able to do that.
But they map e.g. kernel memory (at least for VM_PFNMAP, purely and by
implication really VM_IO), it wouldn't make sense for KSM to be asked to
try to merge these right?
And of course no underlying struct page to pin, no reference counting
either, so I think you'd end up in trouble potentially here wouldn't you?
And how would the CoW work?
>
> GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP, which
> might need a tweak then (maybe the solution could be to ... not use GUP but
> a folio_walk).
How could GUP work when there's no struct page to grab?
>
> So, assuming we could remove the VM_PFNMAP | VM_IO | VM_DONTEXPAND |
> VM_MIXEDMAP constraint from vma_ksm_compatible(), could we simplify?
Well I question removing this constraint for above reasons.
At any rate, even if we _could_ this feels like a bigger change that we
should come later.
But hmm this has made me think :)
So if something is backed by struct file *filp, and the driver says 'make
this PFN mapped' then of course it won't erroneously merge anyway.
If adjacent VMAs are not file-backed, then the merge will fail anyway.
So actually we're probably perfectly safe from a _merging_ point of view.
Buuut we are not safe from a setting VM_MERGEABLE point of view :)
So I think things have to stay the way they are, sensibly.
The fact that .mmap_prepare() will fix this in the future makes it more
reasonable, I think.
>
> That is: the other ones must not really be updated during mmap(), right?
> (in particular: VM_SHARED | VM_MAYSHARE | VM_HUGETLB | VM_DROPPABLE)
>
> Have to double-check VM_SAO and VM_SPARC_ADI.
_Probably_ :)
It really is mostly VM_SPECIAL.
>
> > + */
> > +static bool can_set_ksm_flags_early(struct mmap_state *map)
> > +{
> > + struct file *file = map->file;
> > +
> > + /* Anonymous mappings have no driver which can change them. */
> > + if (!file)
> > + return true;
> > +
> > + /* shmem is safe. */
> > + if (shmem_file(file))
> > + return true;
> > +
> > + /*
> > + * If .mmap_prepare() is specified, then the driver will have already
> > + * manipulated state prior to updating KSM flags.
> > + */
> > + if (file->f_op->mmap_prepare)
> > + return true;
> > +
> > + return false;
> > +}
>
> So, long-term (mmap_prepare) this function will essentially go away?
Yes, .mmap_prepare() solves this totally. Again it's super useful to have
the ability to get the driver to tell us 'we want flags X, Y, Z' ahead of
time. .mmap() must die :)
>
> Nothing jumped at me, this definitely improves the situation.
Thanks!
>
> --
> Cheers,
>
> David / dhildenb
>
On Mon, 19 May 2025 19:52:00 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > CCing stable is likely not a good idea at this point (and might be rather > > hairy). > > We should probably underline to Andrew not to add one :>) but sure can add. Thank deity for that. I'll await v2, thanks. It might be helpful to cc Stefan Roesch on that?
On Mon, May 19, 2025 at 02:57:07PM -0700, Andrew Morton wrote: > On Mon, 19 May 2025 19:52:00 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > > > CCing stable is likely not a good idea at this point (and might be rather > > > hairy). > > > > We should probably underline to Andrew not to add one :>) but sure can add. > > Thank deity for that. Yes this one would be a bit... grim :) > > I'll await v2, thanks. It might be helpful to cc Stefan Roesch on that? Ack and ack. > >
>>
>> I am not 100% sure why we bail out on special mappings: all we have to do is
>> reliably identify anon pages, and we should be able to do that.
>
> But they map e.g. kernel memory (at least for VM_PFNMAP, purely and by
> implication really VM_IO), it wouldn't make sense for KSM to be asked to
> try to merge these right?
>
> And of course no underlying struct page to pin, no reference counting
> either, so I think you'd end up in trouble potentially here wouldn't you?
> And how would the CoW work?
KSM only operates on anonymous pages. It cannot de-duplicate anything
else. (therefore, only MAP_PRIVATE applies)
Anything else (no struct page, not a CoW anon folio in such a mapping)
is skipped.
Take a look at scan_get_next_rmap_item() where we do
folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
if (folio) {
if (!folio_is_zone_device(folio) &&
folio_test_anon(folio)) {
folio_get(folio);
tmp_page = fw.page;
}
folio_walk_end(&fw, vma)
}
Before I changed that code, we were using GUP. And GUP just always
refuses VM_IO|VM_PFNMAP because it cannot handle it properly.
>>
>> So, assuming we could remove the VM_PFNMAP | VM_IO | VM_DONTEXPAND |
>> VM_MIXEDMAP constraint from vma_ksm_compatible(), could we simplify?
>
> Well I question removing this constraint for above reasons.
>
> At any rate, even if we _could_ this feels like a bigger change that we
> should come later.
"bigger" -- it might just be removing these 4 flags from the check ;)
I'll dig a bit more.
--
Cheers,
David / dhildenb
On Mon, May 19, 2025 at 08:59:32PM +0200, David Hildenbrand wrote:
> > >
> > > I am not 100% sure why we bail out on special mappings: all we have to do is
> > > reliably identify anon pages, and we should be able to do that.
> >
> > But they map e.g. kernel memory (at least for VM_PFNMAP, purely and by
> > implication really VM_IO), it wouldn't make sense for KSM to be asked to
> > try to merge these right?
> >
> > And of course no underlying struct page to pin, no reference counting
> > either, so I think you'd end up in trouble potentially here wouldn't you?
> > And how would the CoW work?
>
> KSM only operates on anonymous pages. It cannot de-duplicate anything else.
> (therefore, only MAP_PRIVATE applies)
Yes I had this realisation see my reply to your reply :)
I mean is MAP_PRIVATE of a VM_PFNMAP really that common?...
>
> Anything else (no struct page, not a CoW anon folio in such a mapping) is
> skipped.
>
> Take a look at scan_get_next_rmap_item() where we do
>
> folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> if (folio) {
> if (!folio_is_zone_device(folio) &&
> folio_test_anon(folio)) {
> folio_get(folio);
> tmp_page = fw.page;
> }
> folio_walk_end(&fw, vma)
> }
>
>
> Before I changed that code, we were using GUP. And GUP just always refuses
> VM_IO|VM_PFNMAP because it cannot handle it properly.
OK so it boils down to doing KSM _on the already CoW'd_ MAP_PRIVATE mapping?
But hang on, how do we discover this? vm_normal_page() will screw this up right?
As VM_SPECIAL will be set...
OK now I'm not sure I understand how MAP_PRIVATE-mapped VM_PFNMAP mappings work
:)))
>
> > >
> > > So, assuming we could remove the VM_PFNMAP | VM_IO | VM_DONTEXPAND |
> > > VM_MIXEDMAP constraint from vma_ksm_compatible(), could we simplify?
> >
> > Well I question removing this constraint for above reasons.
> >
> > At any rate, even if we _could_ this feels like a bigger change that we
> > should come later.
>
> "bigger" -- it might just be removing these 4 flags from the check ;)
>
> I'll dig a bit more.
Right, but doing so would be out of scope here don't you think?
I'd rather not delay fixing this bug on this basis ideally, esp. as it's easy
to adjust later.
I suggest we put this in as-is (or close to it anyway) and then if we remove the
flags we can change this...
As I said in other reply, .mmap() means the driver can do literally anything
they want (which is _hateful_), so we'd really want to have some confidence they
didn't do something so crazy, unless we were happy to just let that possibly
explode.
>
> --
> Cheers,
>
> David / dhildenb
>
>>>> >>>> So, assuming we could remove the VM_PFNMAP | VM_IO | VM_DONTEXPAND | >>>> VM_MIXEDMAP constraint from vma_ksm_compatible(), could we simplify? >>> >>> Well I question removing this constraint for above reasons. >>> >>> At any rate, even if we _could_ this feels like a bigger change that we >>> should come later. >> >> "bigger" -- it might just be removing these 4 flags from the check ;) >> >> I'll dig a bit more. > > Right, but doing so would be out of scope here don't you think? I'm fine with moving forward with this here. Just thinking how we can make more VMA merging "easily" possible and avoid the KSM magic in the mmap handling code. (that early ksm check handling is rather ugly) Your patch promises "prevent KSM from completely breaking VMA merging", and I guess that's true: after this patch merging with at least anon and MAP_PRIVATE of shmem it's not broken anymore. :) -- Cheers, David / dhildenb
On Mon, May 19, 2025 at 08:14:17PM +0100, Lorenzo Stoakes wrote:
> On Mon, May 19, 2025 at 08:59:32PM +0200, David Hildenbrand wrote:
> > > >
> > > > I am not 100% sure why we bail out on special mappings: all we have to do is
> > > > reliably identify anon pages, and we should be able to do that.
> > >
> > > But they map e.g. kernel memory (at least for VM_PFNMAP, purely and by
> > > implication really VM_IO), it wouldn't make sense for KSM to be asked to
> > > try to merge these right?
> > >
> > > And of course no underlying struct page to pin, no reference counting
> > > either, so I think you'd end up in trouble potentially here wouldn't you?
> > > And how would the CoW work?
> >
> > KSM only operates on anonymous pages. It cannot de-duplicate anything else.
> > (therefore, only MAP_PRIVATE applies)
>
> Yes I had this realisation see my reply to your reply :)
>
> I mean is MAP_PRIVATE of a VM_PFNMAP really that common?...
>
> >
> > Anything else (no struct page, not a CoW anon folio in such a mapping) is
> > skipped.
> >
> > Take a look at scan_get_next_rmap_item() where we do
> >
> > folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> > if (folio) {
> > if (!folio_is_zone_device(folio) &&
> > folio_test_anon(folio)) {
> > folio_get(folio);
> > tmp_page = fw.page;
> > }
> > folio_walk_end(&fw, vma)
> > }
> >
> >
> > Before I changed that code, we were using GUP. And GUP just always refuses
> > VM_IO|VM_PFNMAP because it cannot handle it properly.
>
> OK so it boils down to doing KSM _on the already CoW'd_ MAP_PRIVATE mapping?
>
> But hang on, how do we discover this? vm_normal_page() will screw this up right?
> As VM_SPECIAL will be set...
>
> OK now I'm not sure I understand how MAP_PRIVATE-mapped VM_PFNMAP mappings work
> :)))
Oh wait hang on...
So here, MAP_PRIVATE, CoW'd would clear pte_special() presumably:
if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
if (likely(!pte_special(pte)))
goto check_pfn;
...
And in non-PTE special flag case:
...
if (!is_cow_mapping(vma->vm_flags))
return NULL;
...
And of course, this will be a CoW mapping...
So actually we should be fine. Got it.
>
> >
> > > >
> > > > So, assuming we could remove the VM_PFNMAP | VM_IO | VM_DONTEXPAND |
> > > > VM_MIXEDMAP constraint from vma_ksm_compatible(), could we simplify?
> > >
> > > Well I question removing this constraint for above reasons.
> > >
> > > At any rate, even if we _could_ this feels like a bigger change that we
> > > should come later.
> >
> > "bigger" -- it might just be removing these 4 flags from the check ;)
> >
> > I'll dig a bit more.
>
> Right, but doing so would be out of scope here don't you think?
>
> I'd rather not delay fixing this bug on this basis ideally, esp. as easy to
> adjust later.
>
> I suggest we put this in as-is (or close to it anyway) and then if we remove the
> flags we can change this...
>
> As I said in other reply, .mmap() means the driver can do literally anything
> they want (which is _hateful_), so we'd really want to have some confidence they
> didn't do something so crazy, unless we were happy to just let that possibly
> explode.
>
> >
> > --
> > Cheers,
> >
> > David / dhildenb
> >
>> +/* >> + * Are we guaranteed no driver can change state such as to preclude KSM merging? >> + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. >> + * >> + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via >> + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set. >> + * >> + * If this is not the case, then we set the flag after considering mergeability, >> + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new >> + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will, >> + * preventing any merge. > > Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get > VM_MERGEABLE set but not be able to merge? > > Probably these are not often expected to be merged ... > > Preventing merging should really only happen because of VMA flags that > are getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO. > > > I am not 100% sure why we bail out on special mappings: all we have to > do is reliably identify anon pages, and we should be able to do that. > > GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP, > which might need a tweak then (maybe the solution could be to ... not > use GUP but a folio_walk). Oh, someone called "David" already did that. Nice :) So we *should* be able to drop * VM_PFNMAP: we correctly identify CoWed pages * VM_MIXEDMAP: we correctly identify CoWed pages * VM_IO: should not affect CoWed pages * VM_DONTEXPAND: no idea why that should even matter here -- Cheers, David / dhildenb
On Mon, May 19, 2025 at 08:04:22PM +0200, David Hildenbrand wrote: > > > > +/* > > > + * Are we guaranteed no driver can change state such as to preclude KSM merging? > > > + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. > > > + * > > > + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via > > > + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set. > > > + * > > > + * If this is not the case, then we set the flag after considering mergeability, > > > + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new > > > + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will, > > > + * preventing any merge. > > > > Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get > > VM_MERGEABLE set but not be able to merge? > > > > Probably these are not often expected to be merged ... > > > > Preventing merging should really only happen because of VMA flags that > > are getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO. > > > > > > I am not 100% sure why we bail out on special mappings: all we have to > > do is reliably identify anon pages, and we should be able to do that. > > > > GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP, > > which might need a tweak then (maybe the solution could be to ... not > > use GUP but a folio_walk). > > Oh, someone called "David" already did that. Nice :) > > So we *should* be able to drop > > * VM_PFNMAP: we correctly identify CoWed pages > * VM_MIXEDMAP: we correctly identify CoWed pages > * VM_IO: should not affect CoWed pages > * VM_DONTEXPAND: no idea why that should even matter here I objected in the other thread but now realise I forgot we're talking about MAP_PRIVATE... So we can do the CoW etc. Right. Then we just need to be able to copy the thing on CoW... but what about write-through etc. cache settings? I suppose we don't care once CoW'd... 
But is this common enough of a use case to be worth the hassle of checking this is all ok? I don't know KSM well enough to comment on VM_DONTEXPAND but obviously meaningful for purposes of _VMA merging_. We refuse to merge all of these. Anyway this all feels like something for the future :) It'd make life easier here yes, but we'd have to be _really sure_ there isn't _some .mmap() hook somewhere_ that's doing something crazy. Which is another reason why I really really hate .mmap() as-is and why I'm changing it. So it may still be more conservative to leave things as they are even if this change was made... Guess it depends how much we care about 'crazy' drivers. > > -- > Cheers, > > David / dhildenb >
On 19.05.25 21:02, Lorenzo Stoakes wrote: > On Mon, May 19, 2025 at 08:04:22PM +0200, David Hildenbrand wrote: >> >>>> +/* >>>> + * Are we guaranteed no driver can change state such as to preclude KSM merging? >>>> + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. >>>> + * >>>> + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via >>>> + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set. >>>> + * >>>> + * If this is not the case, then we set the flag after considering mergeability, >>>> + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new >>>> + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will, >>>> + * preventing any merge. >>> >>> Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get >>> VM_MERGEABLE set but not be able to merge? >>> >>> Probably these are not often expected to be merged ... >>> >>> Preventing merging should really only happen because of VMA flags that >>> are getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO. >>> >>> >>> I am not 100% sure why we bail out on special mappings: all we have to >>> do is reliably identify anon pages, and we should be able to do that. >>> >>> GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP, >>> which might need a tweak then (maybe the solution could be to ... not >>> use GUP but a folio_walk). >> >> Oh, someone called "David" already did that. Nice :) >> >> So we *should* be able to drop >> >> * VM_PFNMAP: we correctly identify CoWed pages >> * VM_MIXEDMAP: we correctly identify CoWed pages >> * VM_IO: should not affect CoWed pages >> * VM_DONTEXPAND: no idea why that should even matter here > > I objected in the other thread but now realise I forgot we're talking about > MAP_PRIVATE... So we can do the CoW etc. Right. > > Then we just need to be able to copy the thing on CoW... but what about > write-through etc. cache settings? 
I suppose we don't care once CoW'd... Yes. It's ordinary kernel-managed memory. > > But is this common enough of a use case to be worth the hassle of checking this > is all ok? The reason I bring it up is because 1) Just because some drivers do weird mmap() things, we cannot merge any MAP_PRIVATE file mappings (except shmem ;) and mmap_prepare). 2) The whole "early_ksm" checks/handling would go away, making this patch significantly simpler :) -- Cheers, David / dhildenb
On Mon, May 19, 2025 at 09:11:10PM +0200, David Hildenbrand wrote: > On 19.05.25 21:02, Lorenzo Stoakes wrote: > > On Mon, May 19, 2025 at 08:04:22PM +0200, David Hildenbrand wrote: > > > > > > > > +/* > > > > > + * Are we guaranteed no driver can change state such as to preclude KSM merging? > > > > > + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. > > > > > + * > > > > > + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via > > > > > + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set. > > > > > + * > > > > > + * If this is not the case, then we set the flag after considering mergeability, > > > > > + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new > > > > > + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will, > > > > > + * preventing any merge. > > > > > > > > Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get > > > > VM_MERGEABLE set but not be able to merge? > > > > > > > > Probably these are not often expected to be merged ... > > > > > > > > Preventing merging should really only happen because of VMA flags that > > > > are getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO. > > > > > > > > > > > > I am not 100% sure why we bail out on special mappings: all we have to > > > > do is reliably identify anon pages, and we should be able to do that. > > > > > > > > GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP, > > > > which might need a tweak then (maybe the solution could be to ... not > > > > use GUP but a folio_walk). > > > > > > Oh, someone called "David" already did that. 
Nice :) > > > > > > So we *should* be able to drop > > > > > > * VM_PFNMAP: we correctly identify CoWed pages > > > * VM_MIXEDMAP: we correctly identify CoWed pages > > > * VM_IO: should not affect CoWed pages > > > * VM_DONTEXPAND: no idea why that should even matter here > > > > I objected in the other thread but now realise I forgot we're talking about > > MAP_PRIVATE... So we can do the CoW etc. Right. > > > > Then we just need to be able to copy the thing on CoW... but what about > > write-through etc. cache settings? I suppose we don't care once CoW'd... > > Yes. It's ordinary kernel-managed memory. Yeah, it's the CoW'd bit right? So it's fine. > > > > > But is this common enough of a use case to be worth the hassle of checking this > > is all ok? > > The reason I bring it up is because > > 1) Just because some drivers do weird mmap() things, we cannot merge any > MAP_PRIVATE file mappings (except shmem ;) and mmap_prepare). > > 2) The whole "early_ksm" checks/handling would go away, making this patch > significantly simpler :) OK you're starting to convince me... Maybe this isn't such a big deal if the KSM code handles it already anwyay. If you're sure GUP isn't relying on this... it could be an additional patch like: 'remove VM_SPECIAL limitation, KSM can already handle this' And probably we _should_ let any insane driver blow up if they change stupid things they must not change. > > -- > Cheers, > > David / dhildenb >
On 19.05.25 21:26, Lorenzo Stoakes wrote: > On Mon, May 19, 2025 at 09:11:10PM +0200, David Hildenbrand wrote: >> On 19.05.25 21:02, Lorenzo Stoakes wrote: >>> On Mon, May 19, 2025 at 08:04:22PM +0200, David Hildenbrand wrote: >>>> >>>>>> +/* >>>>>> + * Are we guaranteed no driver can change state such as to preclude KSM merging? >>>>>> + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. >>>>>> + * >>>>>> + * This is applicable when PR_SET_MEMORY_MERGE has been set on the mm_struct via >>>>>> + * prctl() causing newly mapped VMAs to have the KSM mergeable VMA flag set. >>>>>> + * >>>>>> + * If this is not the case, then we set the flag after considering mergeability, >>>>>> + * which will prevent mergeability as, when PR_SET_MEMORY_MERGE is set, a new >>>>>> + * VMA will not have the KSM mergeability VMA flag set, but all other VMAs will, >>>>>> + * preventing any merge. >>>>> >>>>> Hmmm, so an ordinary MAP_PRIVATE of any file (executable etc.) will get >>>>> VM_MERGEABLE set but not be able to merge? >>>>> >>>>> Probably these are not often expected to be merged ... >>>>> >>>>> Preventing merging should really only happen because of VMA flags that >>>>> are getting set: VM_PFNMAP, VM_MIXEDMAP, VM_DONTEXPAND, VM_IO. >>>>> >>>>> >>>>> I am not 100% sure why we bail out on special mappings: all we have to >>>>> do is reliably identify anon pages, and we should be able to do that. >>>>> >>>>> GUP does currently refuses any VM_PFNMAP | VM_IO, and KSM uses GUP, >>>>> which might need a tweak then (maybe the solution could be to ... not >>>>> use GUP but a folio_walk). >>>> >>>> Oh, someone called "David" already did that. 
Nice :) >>>> >>>> So we *should* be able to drop >>>> >>>> * VM_PFNMAP: we correctly identify CoWed pages >>>> * VM_MIXEDMAP: we correctly identify CoWed pages >>>> * VM_IO: should not affect CoWed pages >>>> * VM_DONTEXPAND: no idea why that should even matter here >>> >>> I objected in the other thread but now realise I forgot we're talking about >>> MAP_PRIVATE... So we can do the CoW etc. Right. >>> >>> Then we just need to be able to copy the thing on CoW... but what about >>> write-through etc. cache settings? I suppose we don't care once CoW'd... >> >> Yes. It's ordinary kernel-managed memory. > > Yeah, it's the CoW'd bit right? So it's fine. > >> >>> >>> But is this common enough of a use case to be worth the hassle of checking this >>> is all ok? >> >> The reason I bring it up is because >> >> 1) Just because some drivers do weird mmap() things, we cannot merge any >> MAP_PRIVATE file mappings (except shmem ;) and mmap_prepare). >> >> 2) The whole "early_ksm" checks/handling would go away, making this patch >> significantly simpler :) > > OK you're starting to convince me... > > Maybe this isn't such a big deal if the KSM code handles it already anwyay. > > If you're sure GUP isn't relying on this... it could be an additional patch > like: > > 'remove VM_SPECIAL limitation, KSM can already handle this' > > And probably we _should_ let any insane driver blow up if they change stupid > things they must not change. That is exactly my thinking. But I'm also fine with doing that later, if you want to be careful. -- Cheers, David / dhildenb
Hi Lorenzo,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Lorenzo-Stoakes/mm-ksm-have-KSM-VMA-checks-not-require-a-VMA-pointer/20250519-165315
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/418d3edbec3a718a7023f1beed5478f5952fc3df.1747431920.git.lorenzo.stoakes%40oracle.com
patch subject: [PATCH 3/4] mm: prevent KSM from completely breaking VMA merging
config: x86_64-buildonly-randconfig-001-20250519 (https://download.01.org/0day-ci/archive/20250519/202505192132.NsAm4haK-lkp@intel.com/config)
compiler: clang version 20.1.2 (https://github.com/llvm/llvm-project 58df0ef89dd64126512e4ee27b4ac3fd8ddf6247)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250519/202505192132.NsAm4haK-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202505192132.NsAm4haK-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/vma.c:2589:15: error: call to undeclared function 'ksm_vma_flags'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
2589 | map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
| ^
mm/vma.c:2761:10: error: call to undeclared function 'ksm_vma_flags'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
2761 | flags = ksm_vma_flags(mm, NULL, flags);
| ^
2 errors generated.
vim +/ksm_vma_flags +2589 mm/vma.c
2586
2587 static void update_ksm_flags(struct mmap_state *map)
2588 {
> 2589 map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
2590 }
2591
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Andrew - fix-patch enclosed for this. Silly oversight.
Will also fixup on any respin.
Thanks, Lorenzo
On Mon, May 19, 2025 at 09:19:50PM +0800, kernel test robot wrote:
> Hi Lorenzo,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on akpm-mm/mm-everything]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Lorenzo-Stoakes/mm-ksm-have-KSM-VMA-checks-not-require-a-VMA-pointer/20250519-165315
> base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/r/418d3edbec3a718a7023f1beed5478f5952fc3df.1747431920.git.lorenzo.stoakes%40oracle.com
> patch subject: [PATCH 3/4] mm: prevent KSM from completely breaking VMA merging
> config: x86_64-buildonly-randconfig-001-20250519 (https://download.01.org/0day-ci/archive/20250519/202505192132.NsAm4haK-lkp@intel.com/config)
> compiler: clang version 20.1.2 (https://github.com/llvm/llvm-project 58df0ef89dd64126512e4ee27b4ac3fd8ddf6247)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250519/202505192132.NsAm4haK-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202505192132.NsAm4haK-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> >> mm/vma.c:2589:15: error: call to undeclared function 'ksm_vma_flags'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 2589 | map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
> | ^
> mm/vma.c:2761:10: error: call to undeclared function 'ksm_vma_flags'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
> 2761 | flags = ksm_vma_flags(mm, NULL, flags);
> | ^
> 2 errors generated.
Ugh ok looks like I forgot to provide an #else for the #ifdef CONFIG_KSM here.
>
>
> vim +/ksm_vma_flags +2589 mm/vma.c
>
> 2586
> 2587 static void update_ksm_flags(struct mmap_state *map)
> 2588 {
> > 2589 map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
> 2590 }
> 2591
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
----8<----
From 2dbb9c4471f6145a513b5a2a661c78d3269fc9fe Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 19 May 2025 14:36:14 +0100
Subject: [PATCH] fix
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
include/linux/ksm.h | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index ba5664daca6e..febc8acc565d 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -97,8 +97,11 @@ bool ksm_process_mergeable(struct mm_struct *mm);
#else /* !CONFIG_KSM */
-static inline void ksm_add_vma(struct vm_area_struct *vma)
+static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
+ const struct file *file,
+ vm_flags_t vm_flags)
{
+ return vm_flags;
}
static inline int ksm_disable(struct mm_struct *mm)
--
2.49.0
On 2025/5/19 16:51, Lorenzo Stoakes wrote: > If a user wishes to enable KSM mergeability for an entire process and all > fork/exec'd processes that come after it, they use the prctl() > PR_SET_MEMORY_MERGE operation. > > This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set > (in order to indicate they are KSM mergeable), as well as setting this flag > for all existing VMAs. > > However it also entirely and completely breaks VMA merging for the process > and all forked (and fork/exec'd) processes. > > This is because when a new mapping is proposed, the flags specified will > never have VM_MERGEABLE set. However all adjacent VMAs will already have > VM_MERGEABLE set, rendering VMAs unmergeable by default. Great catch! I'm wondering how about fixing the vma_merge_new_range() to make it mergeable in this case? > > To work around this, we try to set the VM_MERGEABLE flag prior to > attempting a merge. In the case of brk() this can always be done. > > However on mmap() things are more complicated - while KSM is not supported > for file-backed mappings, it is supported for MAP_PRIVATE file-backed > mappings. So we don't need to set VM_MERGEABLE flag so early, given these corner cases that you described below need to consider. > > And these mappings may have deprecated .mmap() callbacks specified which > could, in theory, adjust flags and thus KSM merge eligiblity. > > So we check to determine whether this at all possible. If not, we set > VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the > previous behaviour. > > When .mmap_prepare() is more widely used, we can remove this precaution. Sounds good too. > > While this doesn't quite cover all cases, it covers a great many (all > anonymous memory, for instance), meaning we should already see a > significant improvement in VMA mergeability. > Agree, it's a very good improvement. Thanks!
On Mon, May 19, 2025 at 09:08:57PM +0800, Chengming Zhou wrote: > On 2025/5/19 16:51, Lorenzo Stoakes wrote: > > If a user wishes to enable KSM mergeability for an entire process and all > > fork/exec'd processes that come after it, they use the prctl() > > PR_SET_MEMORY_MERGE operation. > > > > This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set > > (in order to indicate they are KSM mergeable), as well as setting this flag > > for all existing VMAs. > > > > However it also entirely and completely breaks VMA merging for the process > > and all forked (and fork/exec'd) processes. > > > > This is because when a new mapping is proposed, the flags specified will > > never have VM_MERGEABLE set. However all adjacent VMAs will already have > > VM_MERGEABLE set, rendering VMAs unmergeable by default. > > Great catch! Thanks! :) > > I'm wondering how about fixing the vma_merge_new_range() to make it mergeable > in this case? There's no need, we apply the flag before we attempt to merge. It wouldn't be correct to make any change in the actual merging logic, as we can't merge VMAs with/without this flag set. So the approach taken here - to (if appropriate) apply this flag prior to merge attempt - I think is the correct one. > > > > > To work around this, we try to set the VM_MERGEABLE flag prior to > > attempting a merge. In the case of brk() this can always be done. > > > > However on mmap() things are more complicated - while KSM is not supported > > for file-backed mappings, it is supported for MAP_PRIVATE file-backed > > mappings. > > So we don't need to set VM_MERGEABLE flag so early, given these corner cases > that you described below need to consider. No, we do, just we might miss some corner cases. However these are likely not very common in practice. As the .mmap_prepare() hook is more commonly used, this problem will be solved, and I think that's fine as a way forwards. 
> > > > > And these mappings may have deprecated .mmap() callbacks specified which > > could, in theory, adjust flags and thus KSM merge eligiblity. > > > > So we check to determine whether this at all possible. If not, we set > > VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the > > previous behaviour. > > > > When .mmap_prepare() is more widely used, we can remove this precaution. > > Sounds good too. Thanks! > > > > > While this doesn't quite cover all cases, it covers a great many (all > > anonymous memory, for instance), meaning we should already see a > > significant improvement in VMA mergeability. > > > > Agree, it's a very good improvement. > > Thanks! And again thank you :))
© 2016 - 2025 Red Hat, Inc.