This patch introduces the memfile_notifier facility so that existing memory
file subsystems (e.g. tmpfs/hugetlbfs) can provide memory pages to allow a
third kernel component to make use of memory bookmarked in the memory
file and get notified when the pages in the memory file become
allocated/invalidated.
It will be used for KVM to use a file descriptor as the guest memory
backing store and KVM will use this memfile_notifier interface to
interact with memory file subsystems. In the future there might be other
consumers (e.g. VFIO with encrypted device memory).
It consists of two sets of callbacks:
- memfile_notifier_ops: callbacks for memory backing store to notify
KVM when memory gets allocated/invalidated.
- memfile_pfn_ops: callbacks for KVM to call into memory backing store
to request memory pages for guest private memory.
Userspace is in charge of guest memory lifecycle: it first allocates
pages in memory backing store and then passes the fd to KVM and lets KVM
register each memory slot to memory backing store via
memfile_register_notifier.
The supported memory backing store should maintain a memfile_notifier list
and provide a routine for memfile_notifier to get the list head address and
memfile_pfn_ops callbacks for memfile_register_notifier. It should also call
memfile_notifier_fallocate/memfile_notifier_invalidate when the bookmarked
memory gets allocated/invalidated.
Co-developed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
include/linux/memfile_notifier.h | 64 +++++++++++++++++
mm/Kconfig | 4 ++
mm/Makefile | 1 +
mm/memfile_notifier.c | 114 +++++++++++++++++++++++++++++++
4 files changed, 183 insertions(+)
create mode 100644 include/linux/memfile_notifier.h
create mode 100644 mm/memfile_notifier.c
diff --git a/include/linux/memfile_notifier.h b/include/linux/memfile_notifier.h
new file mode 100644
index 000000000000..e8d400558adb
--- /dev/null
+++ b/include/linux/memfile_notifier.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMFILE_NOTIFIER_H
+#define _LINUX_MEMFILE_NOTIFIER_H
+
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
+#include <linux/fs.h>
+
+struct memfile_notifier;
+
+struct memfile_notifier_ops {
+ void (*invalidate)(struct memfile_notifier *notifier,
+ pgoff_t start, pgoff_t end);
+ void (*fallocate)(struct memfile_notifier *notifier,
+ pgoff_t start, pgoff_t end);
+};
+
+struct memfile_pfn_ops {
+ long (*get_lock_pfn)(struct inode *inode, pgoff_t offset, int *order);
+ void (*put_unlock_pfn)(unsigned long pfn);
+};
+
+struct memfile_notifier {
+ struct list_head list;
+ struct memfile_notifier_ops *ops;
+};
+
+struct memfile_notifier_list {
+ struct list_head head;
+ spinlock_t lock;
+};
+
+struct memfile_backing_store {
+ struct list_head list;
+ struct memfile_pfn_ops pfn_ops;
+ struct memfile_notifier_list* (*get_notifier_list)(struct inode *inode);
+};
+
+#ifdef CONFIG_MEMFILE_NOTIFIER
+/* APIs for backing stores */
+static inline void memfile_notifier_list_init(struct memfile_notifier_list *list)
+{
+ INIT_LIST_HEAD(&list->head);
+ spin_lock_init(&list->lock);
+}
+
+extern void memfile_notifier_invalidate(struct memfile_notifier_list *list,
+ pgoff_t start, pgoff_t end);
+extern void memfile_notifier_fallocate(struct memfile_notifier_list *list,
+ pgoff_t start, pgoff_t end);
+extern void memfile_register_backing_store(struct memfile_backing_store *bs);
+extern void memfile_unregister_backing_store(struct memfile_backing_store *bs);
+
+/*APIs for notifier consumers */
+extern int memfile_register_notifier(struct inode *inode,
+ struct memfile_notifier *notifier,
+ struct memfile_pfn_ops **pfn_ops);
+extern void memfile_unregister_notifier(struct inode *inode,
+ struct memfile_notifier *notifier);
+
+#endif /* CONFIG_MEMFILE_NOTIFIER */
+
+#endif /* _LINUX_MEMFILE_NOTIFIER_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 3326ee3903f3..7c6b1ad3dade 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -892,6 +892,10 @@ config ANON_VMA_NAME
area from being merged with adjacent virtual memory areas due to the
difference in their name.
+config MEMFILE_NOTIFIER
+ bool
+ select SRCU
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 70d4309c9ce3..f628256dce0d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -132,3 +132,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_MEMFILE_NOTIFIER) += memfile_notifier.o
diff --git a/mm/memfile_notifier.c b/mm/memfile_notifier.c
new file mode 100644
index 000000000000..a405db56fde2
--- /dev/null
+++ b/mm/memfile_notifier.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/mm/memfile_notifier.c
+ *
+ * Copyright (C) 2022 Intel Corporation.
+ * Chao Peng <chao.p.peng@linux.intel.com>
+ */
+
+#include <linux/memfile_notifier.h>
+#include <linux/srcu.h>
+
+DEFINE_STATIC_SRCU(srcu);
+static LIST_HEAD(backing_store_list);
+
+void memfile_notifier_invalidate(struct memfile_notifier_list *list,
+ pgoff_t start, pgoff_t end)
+{
+ struct memfile_notifier *notifier;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ list_for_each_entry_srcu(notifier, &list->head, list,
+ srcu_read_lock_held(&srcu)) {
+ if (notifier->ops && notifier->ops->invalidate)
+ notifier->ops->invalidate(notifier, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+void memfile_notifier_fallocate(struct memfile_notifier_list *list,
+ pgoff_t start, pgoff_t end)
+{
+ struct memfile_notifier *notifier;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ list_for_each_entry_srcu(notifier, &list->head, list,
+ srcu_read_lock_held(&srcu)) {
+ if (notifier->ops && notifier->ops->fallocate)
+ notifier->ops->fallocate(notifier, start, end);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+
+void memfile_register_backing_store(struct memfile_backing_store *bs)
+{
+ BUG_ON(!bs || !bs->get_notifier_list);
+
+ list_add_tail(&bs->list, &backing_store_list);
+}
+
+void memfile_unregister_backing_store(struct memfile_backing_store *bs)
+{
+ list_del(&bs->list);
+}
+
+static int memfile_get_notifier_info(struct inode *inode,
+ struct memfile_notifier_list **list,
+ struct memfile_pfn_ops **ops)
+{
+ struct memfile_backing_store *bs, *iter;
+ struct memfile_notifier_list *tmp;
+
+ list_for_each_entry_safe(bs, iter, &backing_store_list, list) {
+ tmp = bs->get_notifier_list(inode);
+ if (tmp) {
+ *list = tmp;
+ if (ops)
+ *ops = &bs->pfn_ops;
+ return 0;
+ }
+ }
+ return -EOPNOTSUPP;
+}
+
+int memfile_register_notifier(struct inode *inode,
+ struct memfile_notifier *notifier,
+ struct memfile_pfn_ops **pfn_ops)
+{
+ struct memfile_notifier_list *list;
+ int ret;
+
+ if (!inode || !notifier | !pfn_ops)
+ return -EINVAL;
+
+ ret = memfile_get_notifier_info(inode, &list, pfn_ops);
+ if (ret)
+ return ret;
+
+ spin_lock(&list->lock);
+ list_add_rcu(¬ifier->list, &list->head);
+ spin_unlock(&list->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memfile_register_notifier);
+
+void memfile_unregister_notifier(struct inode *inode,
+ struct memfile_notifier *notifier)
+{
+ struct memfile_notifier_list *list;
+
+ if (!inode || !notifier)
+ return;
+
+ BUG_ON(memfile_get_notifier_info(inode, &list, NULL));
+
+ spin_lock(&list->lock);
+ list_del_rcu(¬ifier->list);
+ spin_unlock(&list->lock);
+
+ synchronize_srcu(&srcu);
+}
+EXPORT_SYMBOL_GPL(memfile_unregister_notifier);
--
2.17.1
On Thu, Mar 10, 2022, Chao Peng wrote:
> diff --git a/mm/Makefile b/mm/Makefile
> index 70d4309c9ce3..f628256dce0d 100644
> +void memfile_notifier_invalidate(struct memfile_notifier_list *list,
> + pgoff_t start, pgoff_t end)
> +{
> + struct memfile_notifier *notifier;
> + int id;
> +
> + id = srcu_read_lock(&srcu);
> + list_for_each_entry_srcu(notifier, &list->head, list,
> + srcu_read_lock_held(&srcu)) {
> + if (notifier->ops && notifier->ops->invalidate)
Any reason notifier->ops isn't mandatory?
> + notifier->ops->invalidate(notifier, start, end);
> + }
> + srcu_read_unlock(&srcu, id);
> +}
> +
> +void memfile_notifier_fallocate(struct memfile_notifier_list *list,
> + pgoff_t start, pgoff_t end)
> +{
> + struct memfile_notifier *notifier;
> + int id;
> +
> + id = srcu_read_lock(&srcu);
> + list_for_each_entry_srcu(notifier, &list->head, list,
> + srcu_read_lock_held(&srcu)) {
> + if (notifier->ops && notifier->ops->fallocate)
> + notifier->ops->fallocate(notifier, start, end);
> + }
> + srcu_read_unlock(&srcu, id);
> +}
> +
> +void memfile_register_backing_store(struct memfile_backing_store *bs)
> +{
> + BUG_ON(!bs || !bs->get_notifier_list);
> +
> + list_add_tail(&bs->list, &backing_store_list);
> +}
> +
> +void memfile_unregister_backing_store(struct memfile_backing_store *bs)
> +{
> + list_del(&bs->list);
Allowing unregistration of a backing store is broken. Using the _safe() variant
is not sufficient to guard against concurrent modification. I don't see any reason
to support this out of the gate, the only reason to support unregistering a backing
store is if the backing store is implemented as a module, and AFAIK none of the
backing stores we plan on supporting initially support being built as a module.
These aren't exported, so it's not like that's even possible. Registration would
also be broken if modules are allowed, I'm pretty sure module init doesn't run
under a global lock.
We can always add this complexity if it's needed in the future, but for now the
easiest thing would be to tag memfile_register_backing_store() with __init and
make backing_store_list __ro_after_init.
> +}
> +
> +static int memfile_get_notifier_info(struct inode *inode,
> + struct memfile_notifier_list **list,
> + struct memfile_pfn_ops **ops)
> +{
> + struct memfile_backing_store *bs, *iter;
> + struct memfile_notifier_list *tmp;
> +
> + list_for_each_entry_safe(bs, iter, &backing_store_list, list) {
> + tmp = bs->get_notifier_list(inode);
> + if (tmp) {
> + *list = tmp;
> + if (ops)
> + *ops = &bs->pfn_ops;
> + return 0;
> + }
> + }
> + return -EOPNOTSUPP;
> +}
> +
> +int memfile_register_notifier(struct inode *inode,
Taking an inode is a bit odd from a user perspective. Any reason not to take a
"struct file *" and get the inode here? That would give callers a hint that they
need to hold a reference to the file for the lifetime of the registration.
> + struct memfile_notifier *notifier,
> + struct memfile_pfn_ops **pfn_ops)
> +{
> + struct memfile_notifier_list *list;
> + int ret;
> +
> + if (!inode || !notifier | !pfn_ops)
Bitwise | instead of logical ||. But IMO taking in a pfn_ops pointer is silly.
More below.
> + return -EINVAL;
> +
> + ret = memfile_get_notifier_info(inode, &list, pfn_ops);
> + if (ret)
> + return ret;
> +
> + spin_lock(&list->lock);
> + list_add_rcu(¬ifier->list, &list->head);
> + spin_unlock(&list->lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(memfile_register_notifier);
> +
> +void memfile_unregister_notifier(struct inode *inode,
> + struct memfile_notifier *notifier)
> +{
> + struct memfile_notifier_list *list;
> +
> + if (!inode || !notifier)
> + return;
> +
> + BUG_ON(memfile_get_notifier_info(inode, &list, NULL));
Eww. Rather than force the caller to provide the inode/file and the notifier,
what about grabbing the backing store itself in the notifier?
struct memfile_notifier {
struct list_head list;
struct memfile_notifier_ops *ops;
struct memfile_backing_store *bs;
};
That also helps avoid confusing between "ops" and "pfn_ops". IMO, exposing
memfile_backing_store to the caller isn't a big deal, and is preferable to having
to rewalk multiple lists just to delete a notifier.
Then this can become:
void memfile_unregister_notifier(struct memfile_notifier *notifier)
{
spin_lock(¬ifier->bs->list->lock);
list_del_rcu(¬ifier->list);
spin_unlock(¬ifier->bs->list->lock);
synchronize_srcu(&srcu);
}
and registration can be:
int memfile_register_notifier(const struct file *file,
struct memfile_notifier *notifier)
{
struct memfile_notifier_list *list;
struct memfile_backing_store *bs;
int ret;
if (!file || !notifier)
return -EINVAL;
list_for_each_entry(bs, &backing_store_list, list) {
list = bs->get_notifier_list(file_inode(file));
if (list) {
notifier->bs = bs;
spin_lock(&list->lock);
list_add_rcu(¬ifier->list, &list->head);
spin_unlock(&list->lock);
return 0;
}
}
return -EOPNOTSUPP;
}
On Tue, Mar 29, 2022 at 06:45:16PM +0000, Sean Christopherson wrote:
> On Thu, Mar 10, 2022, Chao Peng wrote:
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 70d4309c9ce3..f628256dce0d 100644
> > +void memfile_notifier_invalidate(struct memfile_notifier_list *list,
> > + pgoff_t start, pgoff_t end)
> > +{
> > + struct memfile_notifier *notifier;
> > + int id;
> > +
> > + id = srcu_read_lock(&srcu);
> > + list_for_each_entry_srcu(notifier, &list->head, list,
> > + srcu_read_lock_held(&srcu)) {
> > + if (notifier->ops && notifier->ops->invalidate)
>
> Any reason notifier->ops isn't mandatory?
Yes it's mandatory, will skip the check here.
>
> > + notifier->ops->invalidate(notifier, start, end);
> > + }
> > + srcu_read_unlock(&srcu, id);
> > +}
> > +
> > +void memfile_notifier_fallocate(struct memfile_notifier_list *list,
> > + pgoff_t start, pgoff_t end)
> > +{
> > + struct memfile_notifier *notifier;
> > + int id;
> > +
> > + id = srcu_read_lock(&srcu);
> > + list_for_each_entry_srcu(notifier, &list->head, list,
> > + srcu_read_lock_held(&srcu)) {
> > + if (notifier->ops && notifier->ops->fallocate)
> > + notifier->ops->fallocate(notifier, start, end);
> > + }
> > + srcu_read_unlock(&srcu, id);
> > +}
> > +
> > +void memfile_register_backing_store(struct memfile_backing_store *bs)
> > +{
> > + BUG_ON(!bs || !bs->get_notifier_list);
> > +
> > + list_add_tail(&bs->list, &backing_store_list);
> > +}
> > +
> > +void memfile_unregister_backing_store(struct memfile_backing_store *bs)
> > +{
> > + list_del(&bs->list);
>
> Allowing unregistration of a backing store is broken. Using the _safe() variant
> is not sufficient to guard against concurrent modification. I don't see any reason
> to support this out of the gate, the only reason to support unregistering a backing
> store is if the backing store is implemented as a module, and AFAIK none of the
> backing stores we plan on supporting initially support being built as a module.
> These aren't exported, so it's not like that's even possible. Registration would
> also be broken if modules are allowed, I'm pretty sure module init doesn't run
> under a global lock.
>
> We can always add this complexity if it's needed in the future, but for now the
> easiest thing would be to tag memfile_register_backing_store() with __init and
> make backing_store_list __ro_after_init.
The only currently supported backing store shmem does not need this so
can remove it for now.
>
> > +}
> > +
> > +static int memfile_get_notifier_info(struct inode *inode,
> > + struct memfile_notifier_list **list,
> > + struct memfile_pfn_ops **ops)
> > +{
> > + struct memfile_backing_store *bs, *iter;
> > + struct memfile_notifier_list *tmp;
> > +
> > + list_for_each_entry_safe(bs, iter, &backing_store_list, list) {
> > + tmp = bs->get_notifier_list(inode);
> > + if (tmp) {
> > + *list = tmp;
> > + if (ops)
> > + *ops = &bs->pfn_ops;
> > + return 0;
> > + }
> > + }
> > + return -EOPNOTSUPP;
> > +}
> > +
> > +int memfile_register_notifier(struct inode *inode,
>
> Taking an inode is a bit odd from a user perspective. Any reason not to take a
> "struct file *" and get the inode here? That would give callers a hint that they
> need to hold a reference to the file for the lifetime of the registration.
Yes, I can change.
>
> > + struct memfile_notifier *notifier,
> > + struct memfile_pfn_ops **pfn_ops)
> > +{
> > + struct memfile_notifier_list *list;
> > + int ret;
> > +
> > + if (!inode || !notifier | !pfn_ops)
>
> Bitwise | instead of logical ||. But IMO taking in a pfn_ops pointer is silly.
> More below.
>
> > + return -EINVAL;
> > +
> > + ret = memfile_get_notifier_info(inode, &list, pfn_ops);
> > + if (ret)
> > + return ret;
> > +
> > + spin_lock(&list->lock);
> > + list_add_rcu(¬ifier->list, &list->head);
> > + spin_unlock(&list->lock);
> > +
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(memfile_register_notifier);
> > +
> > +void memfile_unregister_notifier(struct inode *inode,
> > + struct memfile_notifier *notifier)
> > +{
> > + struct memfile_notifier_list *list;
> > +
> > + if (!inode || !notifier)
> > + return;
> > +
> > + BUG_ON(memfile_get_notifier_info(inode, &list, NULL));
>
> Eww. Rather than force the caller to provide the inode/file and the notifier,
> what about grabbing the backing store itself in the notifier?
>
> struct memfile_notifier {
> struct list_head list;
> struct memfile_notifier_ops *ops;
>
> struct memfile_backing_store *bs;
> };
>
> That also helps avoid confusing between "ops" and "pfn_ops". IMO, exposing
> memfile_backing_store to the caller isn't a big deal, and is preferable to having
> to rewalk multiple lists just to delete a notifier.
Agreed, good suggestion.
>
> Then this can become:
>
> void memfile_unregister_notifier(struct memfile_notifier *notifier)
> {
> spin_lock(¬ifier->bs->list->lock);
> list_del_rcu(¬ifier->list);
> spin_unlock(¬ifier->bs->list->lock);
>
> synchronize_srcu(&srcu);
> }
>
> and registration can be:
>
> int memfile_register_notifier(const struct file *file,
> struct memfile_notifier *notifier)
> {
> struct memfile_notifier_list *list;
> struct memfile_backing_store *bs;
> int ret;
>
> if (!file || !notifier)
> return -EINVAL;
>
> list_for_each_entry(bs, &backing_store_list, list) {
> list = bs->get_notifier_list(file_inode(file));
> if (list) {
> notifier->bs = bs;
>
> spin_lock(&list->lock);
> list_add_rcu(¬ifier->list, &list->head);
> spin_unlock(&list->lock);
> return 0;
> }
> }
>
> return -EOPNOTSUPP;
> }
© 2016 - 2026 Red Hat, Inc.