The new fbind() syscall sets the NUMA memory policy for file-backed memory
and has the following signature:

	long fbind(unsigned int fd, unsigned long mode,
		   const unsigned long nodemask[(.maxnode + ULONG_WIDTH - 1)
						/ ULONG_WIDTH],
		   unsigned long maxnode, unsigned int flags);

fbind() behaves similarly to mbind(), except that it takes a file
descriptor as input instead of an address range.
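
For illustration, a minimal userspace sketch of invoking fbind() through
syscall(2) could look as follows. The syscall number 463 and the
mbind()-style mode/nodemask semantics are taken from this patch and are
not part of any released kernel ABI:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <numaif.h>	/* MPOL_BIND */
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_fbind
	#define __NR_fbind 463	/* x86-64 number proposed here */
	#endif

	int main(void)
	{
		unsigned long nodemask = 1UL << 0;	/* NUMA node 0 */
		int fd = open("testfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Bind future page-cache allocations for fd to node 0. */
		if (syscall(__NR_fbind, fd, MPOL_BIND, &nodemask,
			    sizeof(nodemask) * 8, 0) == -1)
			perror("fbind");

		close(fd);
		return 0;
	}

Note that the call only takes effect for files whose file_operations
implement the new ->set_policy() hook; otherwise it fails with
EOPNOTSUPP.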
TODO:
1. Support fbind syscall on all architectures.
2. Expand commit msg and add documentation.
3. Clean up the code.
[Shivansh: add create_mpol_from_args()]
Signed-off-by: Shivansh Dhiman <shivansh.dhiman@amd.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/fs.h | 3 ++
include/linux/mempolicy.h | 3 ++
include/linux/syscalls.h | 3 ++
include/uapi/asm-generic/unistd.h | 5 ++-
kernel/sys_ni.c | 1 +
mm/Makefile | 2 +-
mm/fbind.c | 49 +++++++++++++++++++++++
mm/mempolicy.c | 55 ++++++++++++++++++++++++++
10 files changed, 121 insertions(+), 2 deletions(-)
create mode 100644 mm/fbind.c
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 534c74b14fab..0660ce6d08d8 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -468,3 +468,4 @@
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
461 i386 lsm_list_modules sys_lsm_list_modules
462 i386 mseal sys_mseal
+463 i386 fbind sys_fbind
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7093ee21c0d1..9794347cc2e6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -386,6 +386,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common fbind sys_fbind
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..42042b62bdcd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2058,6 +2058,9 @@ struct file_operations {
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
+#ifdef CONFIG_NUMA
+ int (*set_policy)(struct file *, struct mempolicy *);
+#endif
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1add16f21612..b9023f6246a7 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -299,4 +299,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
}
#endif /* CONFIG_NUMA */
+struct mempolicy *create_mpol_from_args(unsigned char mode,
+ const unsigned long __user *nmask,
+ unsigned short maxnode);
#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4bcf6754738d..2dc686921b9f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -502,6 +502,9 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *bu
asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
struct stat __user *statbuf, int flag);
asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
+asmlinkage long sys_fbind(unsigned int fd, unsigned long mode,
+ const unsigned long __user *nmask,
+ unsigned long maxnode, unsigned int flags);
#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 5bf6148cac2b..550730f36dae 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -841,8 +841,11 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
#define __NR_mseal 462
__SYSCALL(__NR_mseal, sys_mseal)
+#define __NR_fbind 463
+__SYSCALL(__NR_fbind, sys_fbind)
+
#undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 464
/*
* 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..f57350e581f6 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -195,6 +195,7 @@ COND_SYSCALL(move_pages);
COND_SYSCALL(set_mempolicy_home_node);
COND_SYSCALL(cachestat);
COND_SYSCALL(mseal);
+COND_SYSCALL(fbind);
COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
diff --git a/mm/Makefile b/mm/Makefile
index d2915f8c9dc0..ba339ddc0be2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -79,7 +79,7 @@ obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
-obj-$(CONFIG_NUMA) += mempolicy.o
+obj-$(CONFIG_NUMA) += mempolicy.o fbind.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
diff --git a/mm/fbind.c b/mm/fbind.c
new file mode 100644
index 000000000000..85ec7d13345c
--- /dev/null
+++ b/mm/fbind.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implement fbind() syscall.
+ *
+ * Copyright (c) 2024 AMD
+ *
+ * Author: Shivank Garg <shivankg@amd.com>
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
+
+static long do_fbind(unsigned int fd, unsigned long mode,
+ const unsigned long __user *nmask,
+ unsigned long maxnode, unsigned int flags)
+{
+ struct mempolicy *mpol;
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ mpol = create_mpol_from_args(mode, nmask, maxnode);
+ if (IS_ERR_OR_NULL(mpol)) {
+ ret = PTR_ERR(mpol); /* PTR_ERR(NULL) is 0: MPOL_DEFAULT is a no-op here */
+ goto out_putf;
+ }
+
+ if (f.file->f_op->set_policy)
+ ret = f.file->f_op->set_policy(f.file, mpol);
+ else
+ ret = -EOPNOTSUPP;
+
+ mpol_put(mpol);
+out_putf:
+ fdput(f);
+ return ret;
+}
+
+SYSCALL_DEFINE5(fbind, unsigned int, fd, unsigned long, mode,
+ const unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned int, flags)
+{
+ return do_fbind(fd, mode, nmask, maxnode, flags);
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b858e22b259d..3a697080ecad 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -3557,3 +3557,58 @@ static int __init mempolicy_sysfs_init(void)
late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */
+
+/**
+ * create_mpol_from_args - create a mempolicy structure from args
+ * @mode: NUMA memory policy mode
+ * @nmask: bitmask of NUMA nodes
+ * @maxnode: number of bits in the nodes bitmask
+ *
+ * Create a mempolicy from the given nodemask and memory policy mode,
+ * such as default, preferred, interleave or bind.
+ *
+ * Return: the new mempolicy on success (NULL for MPOL_DEFAULT), or an
+ * error encoded in a pointer.
+ */
+struct mempolicy *create_mpol_from_args(unsigned char mode,
+ const unsigned long __user *nmask,
+ unsigned short maxnode)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned short mode_flags;
+ struct mempolicy *mpol;
+ nodemask_t nodes;
+ int lmode = mode;
+ int err;
+
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ err = get_nodes(&nodes, nmask, maxnode);
+ if (err)
+ return ERR_PTR(err);
+
+ mpol = mpol_new(lmode, mode_flags, &nodes);
+ if (IS_ERR_OR_NULL(mpol))
+ return mpol;
+
+ {
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mmap_write_lock(mm);
+ err = mpol_set_nodemask(mpol, &nodes, scratch);
+ mmap_write_unlock(mm);
+ NODEMASK_SCRATCH_FREE(scratch);
+ }
+
+ if (err)
+ goto err_out;
+
+ return mpol;
+
+err_out:
+ mpol_put(mpol);
+ return ERR_PTR(err);
+}
--
2.34.1
Change the KVM guest_memfd APIs to pass file pointers instead of
inodes in the folio allocation functions. This is a preparatory patch
for adding NUMA support to guest memory allocations.
The functional behavior remains unchanged.
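
Wherever the inode is still needed inside these helpers, it is recovered
with file_inode(), as the hunks below do. A minimal sketch of the
pattern (the helper name is hypothetical):

	/* Hypothetical example of the file -> inode pattern used below. */
	static loff_t gmem_size_example(struct file *file)
	{
		struct inode *inode = file_inode(file);

		return i_size_read(inode);
	}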
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
virt/kvm/guest_memfd.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e930014b4bdc..2c6fcf7c3ec9 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -91,7 +91,7 @@ static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index,
{
pgoff_t npages = 1UL << order;
pgoff_t huge_index = round_down(index, npages);
- struct address_space *mapping = inode->i_mapping;
+ struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NOWARN;
loff_t size = i_size_read(inode);
struct folio *folio;
@@ -125,16 +125,16 @@ static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index,
* Ignore accessed, referenced, and dirty flags. The memory is
* unevictable and there is no storage to write back to.
*/
-static struct folio *__kvm_gmem_get_folio(struct inode *inode, pgoff_t index,
+static struct folio *__kvm_gmem_get_folio(struct file *file, pgoff_t index,
bool allow_huge)
{
struct folio *folio = NULL;
if (gmem_2m_enabled && allow_huge)
- folio = kvm_gmem_get_huge_folio(inode, index, PMD_ORDER);
+ folio = kvm_gmem_get_huge_folio(file_inode(file), index, PMD_ORDER);
if (!folio)
- folio = filemap_grab_folio(inode->i_mapping, index);
+ folio = filemap_grab_folio(file_inode(file)->i_mapping, index);
pr_debug("%s: allocate folio with PFN %lx order %d\n",
__func__, folio_pfn(folio), folio_order(folio));
@@ -150,9 +150,9 @@ static struct folio *__kvm_gmem_get_folio(struct inode *inode, pgoff_t index,
* Ignore accessed, referenced, and dirty flags. The memory is
* unevictable and there is no storage to write back to.
*/
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
{
- return __kvm_gmem_get_folio(inode, index, true);
+ return __kvm_gmem_get_folio(file, index, true);
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -228,8 +228,9 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
return 0;
}
-static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
+static long kvm_gmem_allocate(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
pgoff_t start, index, end;
int r;
@@ -252,7 +253,7 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
break;
}
- folio = kvm_gmem_get_folio(inode, index);
+ folio = kvm_gmem_get_folio(file, index);
if (IS_ERR(folio)) {
r = PTR_ERR(folio);
break;
@@ -292,7 +293,7 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
else
- ret = kvm_gmem_allocate(file_inode(file), offset, len);
+ ret = kvm_gmem_allocate(file, offset, len);
if (!ret)
file_modified(file);
@@ -626,7 +627,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
return ERR_PTR(-EIO);
}
- folio = __kvm_gmem_get_folio(file_inode(file), index, allow_huge);
+ folio = __kvm_gmem_get_folio(file, index, allow_huge);
if (IS_ERR(folio))
return folio;
--
2.34.1
Enforce memory policy on guest-memfd to provide proper NUMA support.
Previously, in the absence of a process mempolicy, guest-memfd
allocations defaulted to the local NUMA node of the allocating thread,
effectively placing guest memory on arbitrary nodes. Moreover, mbind()
cannot be used, since the memory is not mapped to userspace.
To support NUMA policies, the VMM calls the fbind() syscall, which
stores the mempolicy as f_policy in the guest_memfd's struct kvm_gmem.
The f_policy is then retrieved and passed to filemap_grab_folio_mpol()
to ensure that allocations follow the specified memory policy.
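
For illustration, the intended VMM flow could look roughly like the
sketch below. It assumes the __NR_fbind number from the first patch, the
existing KVM_CREATE_GUEST_MEMFD ioctl, and a caller-supplied VM fd;
error handling is abbreviated:

	#include <linux/kvm.h>
	#include <numaif.h>	/* MPOL_BIND */
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_fbind
	#define __NR_fbind 463	/* from the fbind patch */
	#endif

	/* Create a guest_memfd and bind its backing memory to NUMA node 1. */
	static int create_gmem_on_node1(int vm_fd, __u64 size)
	{
		struct kvm_create_guest_memfd args = { .size = size };
		unsigned long nodemask = 1UL << 1;	/* node 1 */
		int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

		if (gmem_fd < 0)
			return gmem_fd;

		/* Folio allocations in __kvm_gmem_get_folio() honor this. */
		if (syscall(__NR_fbind, gmem_fd, MPOL_BIND, &nodemask,
			    sizeof(nodemask) * 8, 0) == -1)
			perror("fbind(guest_memfd)");

		return gmem_fd;
	}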
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
mm/mempolicy.c | 2 ++
virt/kvm/guest_memfd.c | 49 ++++++++++++++++++++++++++++++++++++++----
2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3a697080ecad..af2e1ef4dae7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -347,6 +347,7 @@ void __mpol_put(struct mempolicy *pol)
return;
kmem_cache_free(policy_cache, pol);
}
+EXPORT_SYMBOL(__mpol_put);
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
@@ -2599,6 +2600,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
atomic_set(&new->refcnt, 1);
return new;
}
+EXPORT_SYMBOL(__mpol_dup);
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 2c6fcf7c3ec9..0237bda4382c 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
+#include <linux/mempolicy.h>
#include "kvm_mm.h"
@@ -11,6 +12,7 @@ struct kvm_gmem {
struct kvm *kvm;
struct xarray bindings;
struct list_head entry;
+ struct mempolicy *f_policy;
};
/**
@@ -87,7 +89,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
}
static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index,
- unsigned int order)
+ unsigned int order,
+ struct mempolicy *policy)
{
pgoff_t npages = 1UL << order;
pgoff_t huge_index = round_down(index, npages);
@@ -104,7 +107,7 @@ static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index,
(loff_t)(huge_index + npages - 1) << PAGE_SHIFT))
return NULL;
- folio = filemap_alloc_folio(gfp, order);
+ folio = filemap_alloc_folio_mpol(gfp, order, policy);
if (!folio)
return NULL;
@@ -129,12 +132,26 @@ static struct folio *__kvm_gmem_get_folio(struct file *file, pgoff_t index,
bool allow_huge)
{
struct folio *folio = NULL;
+ struct kvm_gmem *gmem = file->private_data;
+ struct mempolicy *policy = NULL;
+
+ /*
+ * The RCU read lock keeps a concurrent set_policy() from freeing
+ * the old policy between reading f_policy and mpol_get().
+ */
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ rcu_read_lock();
+ policy = READ_ONCE(gmem->f_policy);
+ mpol_get(policy);
+ rcu_read_unlock();
+ }
if (gmem_2m_enabled && allow_huge)
- folio = kvm_gmem_get_huge_folio(file_inode(file), index, PMD_ORDER);
+ folio = kvm_gmem_get_huge_folio(file_inode(file), index, PMD_ORDER, policy);
if (!folio)
- folio = filemap_grab_folio(file_inode(file)->i_mapping, index);
+ folio = filemap_grab_folio_mpol(file_inode(file)->i_mapping, index, policy);
+
+ mpol_put(policy);
pr_debug("%s: allocate folio with PFN %lx order %d\n",
__func__, folio_pfn(folio), folio_order(folio));
@@ -338,6 +355,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
mutex_unlock(&kvm->slots_lock);
xa_destroy(&gmem->bindings);
+ mpol_put(gmem->f_policy);
kfree(gmem);
kvm_put_kvm(kvm);
@@ -356,10 +374,32 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
return get_file_active(&slot->gmem.file);
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct file *file, struct mempolicy *mpol)
+{
+ struct mempolicy *old, *new;
+ struct kvm_gmem *gmem = file->private_data;
+
+ new = mpol_dup(mpol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ old = gmem->f_policy;
+ WRITE_ONCE(gmem->f_policy, new);
+ synchronize_rcu(); /* wait out readers of the old policy */
+ mpol_put(old);
+
+ return 0;
+}
+#endif
+
static struct file_operations kvm_gmem_fops = {
.open = generic_file_open,
.release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate,
+#ifdef CONFIG_NUMA
+ .set_policy = kvm_gmem_set_policy,
+#endif
};
void kvm_gmem_init(struct module *module)
@@ -489,6 +529,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
kvm_get_kvm(kvm);
gmem->kvm = kvm;
+ gmem->f_policy = NULL;
xa_init(&gmem->bindings);
list_add(&gmem->entry, &inode->i_mapping->i_private_list);
--
2.34.1