virt/kvm/guest_memfd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
kvm_gmem_get_policy() sets *ilx to the full page offset
(vm_pgoff + vma offset). But get_vma_policy() adds the page
offset on top of *ilx, so the offset is counted twice. This
causes NUMA interleaving to skip nodes: for order-0 pages the
effective index jumps by 2 for each consecutive page.
The get_policy vm_op should return only a per-file bias in *ilx
(like shmem_get_policy does with inode->i_ino), letting
get_vma_policy() add the page-offset component.
Fix by setting *ilx to inode->i_ino instead of the full page
offset. The page offset is computed by get_vma_policy() in
mm/mempolicy.c. The full offset is still computed
in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
shmem_get_policy() follows the same pattern.
Found by Sashiko (sashiko.dev) AI code review.
Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
Cc: Sean Christopherson <seanjc@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
virt/kvm/guest_memfd.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 69c9d6d546b2..0bcf6fc08e2d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
}
static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
- unsigned long addr, pgoff_t *pgoff)
+ unsigned long addr, pgoff_t *ilx)
{
struct inode *inode = file_inode(vma->vm_file);
+ pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
- *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+ *ilx = inode->i_ino;
/*
* Return the memory policy for this index, or NULL if none is set.
@@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
* can then replace NULL with the default memory policy instead of the
* current task's memory policy.
*/
- return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
}
#endif /* CONFIG_NUMA */
--
MST
On 6/3/26 17:57, Michael S. Tsirkin wrote:
> kvm_gmem_get_policy() sets *ilx to the full page offset
> (vm_pgoff + vma offset). But get_vma_policy() adds the page
> offset on top of *ilx, so the offset is counted twice. This
> causes NUMA interleaving to skip nodes: for order-0 pages the
> effective index jumps by 2 for each consecutive page.
>
> The get_policy vm_op should return only a per-file bias in *ilx
> (like shmem_get_policy does with inode->i_ino), letting
> get_vma_policy() add the page-offset component.
>
> Fix by setting *ilx to inode->i_ino instead of the full page
> offset. The page offset is computed by get_vma_policy() in
> mm/mempolicy.c. The full offset is still computed
> in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
> shmem_get_policy() follows the same pattern.
>
> Found by Sashiko (sashiko.dev) AI code review.
>
> Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
> Cc: Sean Christopherson <seanjc@google.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Assisted-by: Claude:claude-opus-4-6
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> virt/kvm/guest_memfd.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 69c9d6d546b2..0bcf6fc08e2d 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
> }
>
> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> - unsigned long addr, pgoff_t *pgoff)
> + unsigned long addr, pgoff_t *ilx)
That now matches the definition in struct vm_operations_struct.
> {
> struct inode *inode = file_inode(vma->vm_file);
> + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>
> - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> + *ilx = inode->i_ino;
>
> /*
> * Return the memory policy for this index, or NULL if none is set.
> @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> * can then replace NULL with the default memory policy instead of the
> * current task's memory policy.
> */
> - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
> }
> #endif /* CONFIG_NUMA */
>
That now matches what shmem_get_policy() does logically.
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
--
Cheers,
David
On 6/3/2026 9:27 PM, Michael S. Tsirkin wrote:
> kvm_gmem_get_policy() sets *ilx to the full page offset
> (vm_pgoff + vma offset). But get_vma_policy() adds the page
> offset on top of *ilx, so the offset is counted twice. This
> causes NUMA interleaving to skip nodes: for order-0 pages the
> effective index jumps by 2 for each consecutive page.
>
> The get_policy vm_op should return only a per-file bias in *ilx
> (like shmem_get_policy does with inode->i_ino), letting
> get_vma_policy() add the page-offset component.
>
> Fix by setting *ilx to inode->i_ino instead of the full page
> offset. The page offset is computed by get_vma_policy() in
> mm/mempolicy.c. The full offset is still computed
> in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
> shmem_get_policy() follows the same pattern.
>
> Found by Sashiko (sashiko.dev) AI code review.
>
> Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
> Cc: Sean Christopherson <seanjc@google.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Assisted-by: Claude:claude-opus-4-6
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> virt/kvm/guest_memfd.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 69c9d6d546b2..0bcf6fc08e2d 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
> }
>
> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> - unsigned long addr, pgoff_t *pgoff)
> + unsigned long addr, pgoff_t *ilx)
> {
> struct inode *inode = file_inode(vma->vm_file);
> + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>
> - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> + *ilx = inode->i_ino;
>
> /*
> * Return the memory policy for this index, or NULL if none is set.
> @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> * can then replace NULL with the default memory policy instead of the
> * current task's memory policy.
> */
> - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
> }
> #endif /* CONFIG_NUMA */
>
> --
> MST
>
Thanks for fixing this. LGTM!
Reviewed-by: Shivank Garg <shivankg@amd.com>
Best regards,
Shivank
On Thu, Jun 04, 2026 at 12:21:15AM +0530, Garg, Shivank wrote:
>
>
> On 6/3/2026 9:27 PM, Michael S. Tsirkin wrote:
> > kvm_gmem_get_policy() sets *ilx to the full page offset
> > (vm_pgoff + vma offset). But get_vma_policy() adds the page
> > offset on top of *ilx, so the offset is counted twice. This
> > causes NUMA interleaving to skip nodes: for order-0 pages the
> > effective index jumps by 2 for each consecutive page.
> >
> > The get_policy vm_op should return only a per-file bias in *ilx
> > (like shmem_get_policy does with inode->i_ino), letting
> > get_vma_policy() add the page-offset component.
> >
> > Fix by setting *ilx to inode->i_ino instead of the full page
> > offset. The page offset is computed by get_vma_policy() in
> > mm/mempolicy.c. The full offset is still computed
> > in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
> > shmem_get_policy() follows the same pattern.
> >
> > Found by Sashiko (sashiko.dev) AI code review.
> >
> > Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
> > Cc: Sean Christopherson <seanjc@google.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Assisted-by: Claude:claude-opus-4-6
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > virt/kvm/guest_memfd.c | 7 ++++---
> > 1 file changed, 4 insertions(+), 3 deletions(-)
> >
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index 69c9d6d546b2..0bcf6fc08e2d 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
> > }
> >
> > static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> > - unsigned long addr, pgoff_t *pgoff)
> > + unsigned long addr, pgoff_t *ilx)
> > {
> > struct inode *inode = file_inode(vma->vm_file);
> > + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> >
> > - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> > + *ilx = inode->i_ino;
> >
> > /*
> > * Return the memory policy for this index, or NULL if none is set.
> > @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> > * can then replace NULL with the default memory policy instead of the
> > * current task's memory policy.
> > */
> > - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
> > + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
> > }
> > #endif /* CONFIG_NUMA */
> >
> > --
> > MST
> >
>
> Thanks for fixing this. LGTM!
>
> Reviewed-by: Shivank Garg <shivankg@amd.com>
Can you actually test it though, please?
Because I think another patch I sent in response to Sashiko
is also needed.
> Best regards,
> Shivank
>
>
On 6/5/2026 5:16 AM, Michael S. Tsirkin wrote:
> [You don't often get email from mst@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>
> On Thu, Jun 04, 2026 at 12:21:15AM +0530, Garg, Shivank wrote:
>>
>>
>> On 6/3/2026 9:27 PM, Michael S. Tsirkin wrote:
>>> kvm_gmem_get_policy() sets *ilx to the full page offset
>>> (vm_pgoff + vma offset). But get_vma_policy() adds the page
>>> offset on top of *ilx, so the offset is counted twice. This
>>> causes NUMA interleaving to skip nodes: for order-0 pages the
>>> effective index jumps by 2 for each consecutive page.
>>>
>>> The get_policy vm_op should return only a per-file bias in *ilx
>>> (like shmem_get_policy does with inode->i_ino), letting
>>> get_vma_policy() add the page-offset component.
>>>
>>> Fix by setting *ilx to inode->i_ino instead of the full page
>>> offset. The page offset is computed by get_vma_policy() in
>>> mm/mempolicy.c. The full offset is still computed
>>> in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
>>> shmem_get_policy() follows the same pattern.
>>>
>>> Found by Sashiko (sashiko.dev) AI code review.
>>>
>>> Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
>>> Cc: Sean Christopherson <seanjc@google.com>
>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>> Assisted-by: Claude:claude-opus-4-6
>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>> ---
>>> virt/kvm/guest_memfd.c | 7 ++++---
>>> 1 file changed, 4 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>>> index 69c9d6d546b2..0bcf6fc08e2d 100644
>>> --- a/virt/kvm/guest_memfd.c
>>> +++ b/virt/kvm/guest_memfd.c
>>> @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
>>> }
>>>
>>> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
>>> - unsigned long addr, pgoff_t *pgoff)
>>> + unsigned long addr, pgoff_t *ilx)
>>> {
>>> struct inode *inode = file_inode(vma->vm_file);
>>> + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>>>
>>> - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>>> + *ilx = inode->i_ino;
>>>
>>> /*
>>> * Return the memory policy for this index, or NULL if none is set.
>>> @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
>>> * can then replace NULL with the default memory policy instead of the
>>> * current task's memory policy.
>>> */
>>> - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
>>> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
>>> }
>>> #endif /* CONFIG_NUMA */
>>>
>>> --
>>> MST
>>>
>>
>> Thanks for fixing this. LGTM!
>>
>> Reviewed-by: Shivank Garg <shivankg@amd.com>
>
>
> Can u actually test it though pls?
> Because I think another patch I sent in response so Sashiko
> is also needed.
Hi Michael,
Yes, I tested this.
I used kretprobes to read *ilx on each kvm_gmem_get_policy(), while calling
get_mempolicy(MPOL_F_ADDR) on consecutive offsets(0..7) of guest_memfd mapping:
BEFORE:
page offset: 0 1 2 3 4 5 6 7
*ilx: 0 1 2 3 4 5 6 7
get_vma_policy() then adds the page offset on top again, so the effective index increases with a stride of 2.
AFTER Fix:
page offset: 0 1 2 3 ... 7
*ilx: 128376 128376 128376 128376 ... 128376
*ilx now stores i_ino, so after get_vma_policy() adds the page offset, the effective index increases by just 1.
It's hard to demonstrate any wrong allocation caused by the bug, because this index value is not
used by the allocation path, which passes NO_INTERLEAVE_INDEX instead.
Tested-by: Shivank Garg <shivankg@amd.com>
Thanks,
Shivank
On Fri, Jun 05, 2026 at 06:31:51PM +0530, Garg, Shivank wrote:
>
>
> On 6/5/2026 5:16 AM, Michael S. Tsirkin wrote:
> > [You don't often get email from mst@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> >
> > On Thu, Jun 04, 2026 at 12:21:15AM +0530, Garg, Shivank wrote:
> >>
> >>
> >> On 6/3/2026 9:27 PM, Michael S. Tsirkin wrote:
> >>> kvm_gmem_get_policy() sets *ilx to the full page offset
> >>> (vm_pgoff + vma offset). But get_vma_policy() adds the page
> >>> offset on top of *ilx, so the offset is counted twice. This
> >>> causes NUMA interleaving to skip nodes: for order-0 pages the
> >>> effective index jumps by 2 for each consecutive page.
> >>>
> >>> The get_policy vm_op should return only a per-file bias in *ilx
> >>> (like shmem_get_policy does with inode->i_ino), letting
> >>> get_vma_policy() add the page-offset component.
> >>>
> >>> Fix by setting *ilx to inode->i_ino instead of the full page
> >>> offset. The page offset is computed by get_vma_policy() in
> >>> mm/mempolicy.c. The full offset is still computed
> >>> in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
> >>> shmem_get_policy() follows the same pattern.
> >>>
> >>> Found by Sashiko (sashiko.dev) AI code review.
> >>>
> >>> Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
> >>> Cc: Sean Christopherson <seanjc@google.com>
> >>> Cc: Paolo Bonzini <pbonzini@redhat.com>
> >>> Assisted-by: Claude:claude-opus-4-6
> >>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >>> ---
> >>> virt/kvm/guest_memfd.c | 7 ++++---
> >>> 1 file changed, 4 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> >>> index 69c9d6d546b2..0bcf6fc08e2d 100644
> >>> --- a/virt/kvm/guest_memfd.c
> >>> +++ b/virt/kvm/guest_memfd.c
> >>> @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
> >>> }
> >>>
> >>> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> >>> - unsigned long addr, pgoff_t *pgoff)
> >>> + unsigned long addr, pgoff_t *ilx)
> >>> {
> >>> struct inode *inode = file_inode(vma->vm_file);
> >>> + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> >>>
> >>> - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> >>> + *ilx = inode->i_ino;
> >>>
> >>> /*
> >>> * Return the memory policy for this index, or NULL if none is set.
> >>> @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> >>> * can then replace NULL with the default memory policy instead of the
> >>> * current task's memory policy.
> >>> */
> >>> - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
> >>> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
> >>> }
> >>> #endif /* CONFIG_NUMA */
> >>>
> >>> --
> >>> MST
> >>>
> >>
> >> Thanks for fixing this. LGTM!
> >>
> >> Reviewed-by: Shivank Garg <shivankg@amd.com>
> >
> >
> > Can u actually test it though pls?
> > Because I think another patch I sent in response so Sashiko
> > is also needed.
>
> Hi Michael,
>
> Yes, I tested this.
>
> I used kretprobes to read *ilx on each kvm_gmem_get_policy(), while calling
> get_mempolicy(MPOL_F_ADDR) on consecutive offsets(0..7) of guest_memfd mapping:
>
> BEFORE:
> page offset: 0 1 2 3 4 5 6 7
> *ilx: 0 1 2 3 4 5 6 7
>
> get_vma_policy() again add the page offset on top. so, it will increase by stride 2.
>
> AFTER Fix:
> page offset: 0 1 2 3 ... 7
> *ilx: 128376 128376 128376 128376 ... 128376
>
> It store i_no, so after get_vma_policy(), it will increase by just 1.
>
> It's hard to show any wrong allocation with the bug because this index value is not
> used by allocation path, which uses NO_INTERLEAVE_INDEX.
>
> Tested-by: Shivank Garg <shivankg@amd.com>
>
> Thanks,
> Shivank
>
So for this to be useful at all
we do need the patch I sent in response to sashiko, right?
Mind trying out that one?
--
MST
On 6/5/2026 8:25 PM, Michael S. Tsirkin wrote:
> On Fri, Jun 05, 2026 at 06:31:51PM +0530, Garg, Shivank wrote:
>>
>>
>> On 6/5/2026 5:16 AM, Michael S. Tsirkin wrote:
>>> [You don't often get email from mst@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
>>>
>>> On Thu, Jun 04, 2026 at 12:21:15AM +0530, Garg, Shivank wrote:
>>>>
>>>>
>>>> On 6/3/2026 9:27 PM, Michael S. Tsirkin wrote:
>>>>> kvm_gmem_get_policy() sets *ilx to the full page offset
>>>>> (vm_pgoff + vma offset). But get_vma_policy() adds the page
>>>>> offset on top of *ilx, so the offset is counted twice. This
>>>>> causes NUMA interleaving to skip nodes: for order-0 pages the
>>>>> effective index jumps by 2 for each consecutive page.
>>>>>
>>>>> The get_policy vm_op should return only a per-file bias in *ilx
>>>>> (like shmem_get_policy does with inode->i_ino), letting
>>>>> get_vma_policy() add the page-offset component.
>>>>>
>>>>> Fix by setting *ilx to inode->i_ino instead of the full page
>>>>> offset. The page offset is computed by get_vma_policy() in
>>>>> mm/mempolicy.c. The full offset is still computed
>>>>> in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
>>>>> shmem_get_policy() follows the same pattern.
>>>>>
>>>>> Found by Sashiko (sashiko.dev) AI code review.
>>>>>
>>>>> Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
>>>>> Cc: Sean Christopherson <seanjc@google.com>
>>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>>>> Assisted-by: Claude:claude-opus-4-6
>>>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>>>> ---
>>>>> virt/kvm/guest_memfd.c | 7 ++++---
>>>>> 1 file changed, 4 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>>>>> index 69c9d6d546b2..0bcf6fc08e2d 100644
>>>>> --- a/virt/kvm/guest_memfd.c
>>>>> +++ b/virt/kvm/guest_memfd.c
>>>>> @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
>>>>> }
>>>>>
>>>>> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
>>>>> - unsigned long addr, pgoff_t *pgoff)
>>>>> + unsigned long addr, pgoff_t *ilx)
>>>>> {
>>>>> struct inode *inode = file_inode(vma->vm_file);
>>>>> + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>>>>>
>>>>> - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>>>>> + *ilx = inode->i_ino;
>>>>>
>>>>> /*
>>>>> * Return the memory policy for this index, or NULL if none is set.
>>>>> @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
>>>>> * can then replace NULL with the default memory policy instead of the
>>>>> * current task's memory policy.
>>>>> */
>>>>> - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
>>>>> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
>>>>> }
>>>>> #endif /* CONFIG_NUMA */
>>>>>
>>>>> --
>>>>> MST
>>>>>
>>>>
>>>> Thanks for fixing this. LGTM!
>>>>
>>>> Reviewed-by: Shivank Garg <shivankg@amd.com>
>>>
>>>
>>> Can u actually test it though pls?
>>> Because I think another patch I sent in response so Sashiko
>>> is also needed.
>>
>> Hi Michael,
>>
>> Yes, I tested this.
>>
>> I used kretprobes to read *ilx on each kvm_gmem_get_policy(), while calling
>> get_mempolicy(MPOL_F_ADDR) on consecutive offsets(0..7) of guest_memfd mapping:
>>
>> BEFORE:
>> page offset: 0 1 2 3 4 5 6 7
>> *ilx: 0 1 2 3 4 5 6 7
>>
>> get_vma_policy() again add the page offset on top. so, it will increase by stride 2.
>>
>> AFTER Fix:
>> page offset: 0 1 2 3 ... 7
>> *ilx: 128376 128376 128376 128376 ... 128376
>>
>> It store i_no, so after get_vma_policy(), it will increase by just 1.
>>
>> It's hard to show any wrong allocation with the bug because this index value is not
>> used by allocation path, which uses NO_INTERLEAVE_INDEX.
>>
>> Tested-by: Shivank Garg <shivankg@amd.com>
>>
>> Thanks,
>> Shivank
>>
>
>
> So for this to be useful at all
> we do need the patch I sent in response to sashiko, right?
> Mind trying out that one?
>
I could not find the other patch from you.
Are you talking about this response?
https://lore.kernel.org/all/20260604034539-mutt-send-email-mst@kernel.org
If you sent a separate patch elsewhere for testing, please point me to it.
Sashiko comment:
> Doesn't this defeat the purpose of the shared policy, causing interleaving
> to be randomized by the chronological order of vCPU page faults rather than
> deterministically spread based on the guest physical address?
It doesn't defeat the shared policy's purpose.
This design is intentional. The shared policy exists so that the policy is a
property of the inode and is selected by file offset, and that works.
I don't think we need deterministic interleaving right now; it can be revisited later.
Thanks,
Shivank
On Sat, Jun 06, 2026 at 06:32:04PM +0530, Garg, Shivank wrote:
>
>
> On 6/5/2026 8:25 PM, Michael S. Tsirkin wrote:
> > On Fri, Jun 05, 2026 at 06:31:51PM +0530, Garg, Shivank wrote:
> >>
> >>
> >> On 6/5/2026 5:16 AM, Michael S. Tsirkin wrote:
> >>> [You don't often get email from mst@redhat.com. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ]
> >>>
> >>> On Thu, Jun 04, 2026 at 12:21:15AM +0530, Garg, Shivank wrote:
> >>>>
> >>>>
> >>>> On 6/3/2026 9:27 PM, Michael S. Tsirkin wrote:
> >>>>> kvm_gmem_get_policy() sets *ilx to the full page offset
> >>>>> (vm_pgoff + vma offset). But get_vma_policy() adds the page
> >>>>> offset on top of *ilx, so the offset is counted twice. This
> >>>>> causes NUMA interleaving to skip nodes: for order-0 pages the
> >>>>> effective index jumps by 2 for each consecutive page.
> >>>>>
> >>>>> The get_policy vm_op should return only a per-file bias in *ilx
> >>>>> (like shmem_get_policy does with inode->i_ino), letting
> >>>>> get_vma_policy() add the page-offset component.
> >>>>>
> >>>>> Fix by setting *ilx to inode->i_ino instead of the full page
> >>>>> offset. The page offset is computed by get_vma_policy() in
> >>>>> mm/mempolicy.c. The full offset is still computed
> >>>>> in kvm_gmem_get_policy() for mpol_shared_policy_lookup().
> >>>>> shmem_get_policy() follows the same pattern.
> >>>>>
> >>>>> Found by Sashiko (sashiko.dev) AI code review.
> >>>>>
> >>>>> Fixes: ed1ffa810bd6 ("KVM: guest_memfd: Enforce NUMA mempolicy using shared policy")
> >>>>> Cc: Sean Christopherson <seanjc@google.com>
> >>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
> >>>>> Assisted-by: Claude:claude-opus-4-6
> >>>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >>>>> ---
> >>>>> virt/kvm/guest_memfd.c | 7 ++++---
> >>>>> 1 file changed, 4 insertions(+), 3 deletions(-)
> >>>>>
> >>>>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> >>>>> index 69c9d6d546b2..0bcf6fc08e2d 100644
> >>>>> --- a/virt/kvm/guest_memfd.c
> >>>>> +++ b/virt/kvm/guest_memfd.c
> >>>>> @@ -438,11 +438,12 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
> >>>>> }
> >>>>>
> >>>>> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> >>>>> - unsigned long addr, pgoff_t *pgoff)
> >>>>> + unsigned long addr, pgoff_t *ilx)
> >>>>> {
> >>>>> struct inode *inode = file_inode(vma->vm_file);
> >>>>> + pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> >>>>>
> >>>>> - *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> >>>>> + *ilx = inode->i_ino;
> >>>>>
> >>>>> /*
> >>>>> * Return the memory policy for this index, or NULL if none is set.
> >>>>> @@ -453,7 +454,7 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> >>>>> * can then replace NULL with the default memory policy instead of the
> >>>>> * current task's memory policy.
> >>>>> */
> >>>>> - return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
> >>>>> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff);
> >>>>> }
> >>>>> #endif /* CONFIG_NUMA */
> >>>>>
> >>>>> --
> >>>>> MST
> >>>>>
> >>>>
> >>>> Thanks for fixing this. LGTM!
> >>>>
> >>>> Reviewed-by: Shivank Garg <shivankg@amd.com>
> >>>
> >>>
> >>> Can u actually test it though pls?
> >>> Because I think another patch I sent in response so Sashiko
> >>> is also needed.
> >>
> >> Hi Michael,
> >>
> >> Yes, I tested this.
> >>
> >> I used kretprobes to read *ilx on each kvm_gmem_get_policy(), while calling
> >> get_mempolicy(MPOL_F_ADDR) on consecutive offsets(0..7) of guest_memfd mapping:
> >>
> >> BEFORE:
> >> page offset: 0 1 2 3 4 5 6 7
> >> *ilx: 0 1 2 3 4 5 6 7
> >>
> >> get_vma_policy() again add the page offset on top. so, it will increase by stride 2.
> >>
> >> AFTER Fix:
> >> page offset: 0 1 2 3 ... 7
> >> *ilx: 128376 128376 128376 128376 ... 128376
> >>
> >> It store i_no, so after get_vma_policy(), it will increase by just 1.
> >>
> >> It's hard to show any wrong allocation with the bug because this index value is not
> >> used by allocation path, which uses NO_INTERLEAVE_INDEX.
> >>
> >> Tested-by: Shivank Garg <shivankg@amd.com>
> >>
> >> Thanks,
> >> Shivank
> >>
> >
> >
> > So for this to be useful at all
> > we do need the patch I sent in response to sashiko, right?
> > Mind trying out that one?
> >
>
> I could not find the other patch from you.
> Are you talking about this response
> https://lore.kernel.org/all/20260604034539-mutt-send-email-mst@kernel.org?
>
> If you send any separate patch to test elsewhere, please point me.
I could swear I sent it, but you are right. Inline, because untested:
-->
mm: filemap: pass interleave index through filemap_alloc_folio
filemap_alloc_folio_noprof() hardcodes NO_INTERLEAVE_INDEX when
calling folio_alloc_mpol_noprof() for NUMA policy-based allocations.
This causes MPOL_INTERLEAVE to fall back to the task's global
il_prev counter instead of using the file offset for deterministic
page placement.
The only current user passing a non-NULL policy is
__filemap_get_folio_mpol(), called by KVM guest_memfd. The page
index is already available at that call site but was never threaded
down to the allocator.
Add a pgoff_t ilx parameter to filemap_alloc_folio_noprof() and
pass it through to folio_alloc_mpol_noprof(). Update
__filemap_get_folio_mpol() to forward its index argument, and all
other callers (which pass NULL policy and never hit the mpol path)
to pass 0.
Fixes: 7f3779a3ac3e ("mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio()")
Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a02b62e0a8f3..efdec0ac1482 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -452,7 +452,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
masked_constraint_gfp = mapping_gfp_constraint(mapping, constraint_gfp);
masked_constraint_gfp |= __GFP_NOWARN;
- folio = filemap_alloc_folio(masked_constraint_gfp, 0, NULL);
+ folio = filemap_alloc_folio(masked_constraint_gfp, 0, NULL, 0);
if (!folio)
break;
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 0062b3a55781..148fa0bcc974 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -731,7 +731,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
}
folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS),
- 0, NULL);
+ 0, NULL, 0);
if (!folio)
return ERR_PTR(-ENOMEM);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 27ab7bd844ec..f4416b57f480 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -563,7 +563,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
* Allocate a managed folio for cached I/O, or it may be
* then filled with a file-backed folio for in-place I/O
*/
- newfolio = filemap_alloc_folio(gfp, 0, NULL);
+ newfolio = filemap_alloc_folio(gfp, 0, NULL, 0);
if (!newfolio)
continue;
newfolio->private = Z_EROFS_PREALLOCATED_FOLIO;
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 881e76158b96..7494a94338e4 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1954,7 +1954,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
return;
}
- cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL);
+ cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL, 0);
if (!cfolio)
return;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 31a848485ad9..e2aea0800815 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -652,10 +652,10 @@ static inline void *detach_page_private(struct page *page)
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
- struct mempolicy *policy);
+ struct mempolicy *policy, pgoff_t ilx);
#else
static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
- struct mempolicy *policy)
+ struct mempolicy *policy, pgoff_t ilx)
{
return folio_alloc_noprof(gfp, order);
}
@@ -666,7 +666,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
- return &filemap_alloc_folio(gfp, 0, NULL)->page;
+ return &filemap_alloc_folio(gfp, 0, NULL, 0)->page;
}
static inline gfp_t readahead_gfp_mask(struct address_space *x)
diff --git a/mm/filemap.c b/mm/filemap.c
index 4e636647100c..2fccd9afa4d4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -992,14 +992,14 @@ EXPORT_SYMBOL_GPL(filemap_add_folio);
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
- struct mempolicy *policy)
+ struct mempolicy *policy, pgoff_t ilx)
{
int n;
struct folio *folio;
if (policy)
return folio_alloc_mpol_noprof(gfp, order, policy,
- NO_INTERLEAVE_INDEX, numa_node_id());
+ ilx, numa_node_id());
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
@@ -2009,7 +2009,7 @@ struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
err = -ENOMEM;
if (order > min_order)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
- folio = filemap_alloc_folio(alloc_gfp, order, policy);
+ folio = filemap_alloc_folio(alloc_gfp, order, policy, index);
if (!folio)
continue;
@@ -2609,7 +2609,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
- folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
+ folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL, 0);
if (!folio)
return -ENOMEM;
if (iocb->ki_flags & IOCB_DONTCACHE)
@@ -4067,7 +4067,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
repeat:
folio = filemap_get_folio(mapping, index);
if (IS_ERR(folio)) {
- folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
+ folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL, 0);
if (!folio)
return ERR_PTR(-ENOMEM);
index = mapping_align_index(mapping, index);
diff --git a/mm/readahead.c b/mm/readahead.c
index 7b05082c89ea..c435aee43e07 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
{
struct folio *folio;
- folio = filemap_alloc_folio(gfp_mask, order, NULL);
+ folio = filemap_alloc_folio(gfp_mask, order, NULL, 0);
if (folio && ractl->dropbehind)
__folio_set_dropbehind(folio);
© 2016 - 2026 Red Hat, Inc.