[PATCH] [RFC] proc: Add mmap callback for /proc/<pid>/mem

Posted by Haider Miraj 2 months, 2 weeks ago
This patch introduces memory mapping (mmap) support for the /proc/<pid>/mem
interface. The new functionality allows users to map the memory of another
process into their own address space, reusing the same physical pages.

The idea is to mmap another process's memory by first pinning its pages in
memory and then using `remap_pfn_range` to map them as device memory, so the
same physical pages are shared. A list of the pinned pages is maintained, and
the pins are released in the close callback. This design has certain
limitations.
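
For illustration, here is a minimal userspace sketch of how the mapping would
be exercised (the PID and target address below are placeholders; the mmap file
offset selects the address range in the target process):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <sys/types.h>
  #include <unistd.h>

  int main(void)
  {
  	pid_t pid = 1234;			/* placeholder target pid */
  	off_t target = 0x7f0000000000;		/* page-aligned address in target */
  	char path[64];
  	void *p;
  	int fd;

  	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
  	fd = open(path, O_RDONLY);
  	if (fd < 0)
  		return 1;

  	/* vm_pgoff in the patch is the target address expressed in pages */
  	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, target);
  	if (p == MAP_FAILED)
  		return 1;

  	printf("first byte: %#x\n", *(unsigned char *)p);
  	munmap(p, 4096);
  	close(fd);
  	return 0;
  }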

I am seeking comments and advice on the following:
- Given that read access to `/proc/<pid>/mem` is already allowed for
  privileged users, are there specific reasons or concerns that have prevented
  the implementation of `mmap` for this interface?
- Is there a way to insert anonymous pages into a file-backed VMA so that it
  honors reverse mapping, eliminating the need to keep track of pinned pages?
  (A rough sketch of one candidate follows this list.)
- I plan to implement a page fault handler as well.
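
On the second question, the obvious candidate seems to be vm_insert_page() on
a VM_MIXEDMAP VMA instead of remap_pfn_range(): inserted pages keep their
struct page, take their own reference, and get an rmap entry, so no private
tracking list should be needed. As far as I can tell, though, vm_insert_page()
refuses anonymous pages in current kernels, so the loop below is only a sketch
of the shape such code would take, not something that works today:

  for (addr = target_start_addr; addr < target_end_addr;
       addr += PAGE_SIZE, start_addr += PAGE_SIZE) {
  	pinned = get_user_pages_remote(mm, addr, 1, FOLL_GET | FOLL_NOFAULT,
  				       &page, NULL, NULL);
  	if (pinned <= 0)
  		continue;	/* leave a hole for a future fault handler */

  	/* vm_insert_page() takes its own reference and sets up rmap */
  	ret = vm_insert_page(vma, start_addr, page);
  	put_page(page);		/* drop the reference taken by GUP */
  	if (ret)
  		goto err_unlock;
  }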

I am looking for feedback on how to improve this implementation and what
additional considerations are necessary for it to be accepted by the community.

Cc: xe-linux-external@cisco.com
Signed-off-by: Haider Miraj <hmiraj@cisco.com>
---
 fs/proc/base.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 72a1acd03675..405de47d0c1c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -117,6 +117,17 @@
 static u8 nlink_tid __ro_after_init;
 static u8 nlink_tgid __ro_after_init;
 
+struct vma_info {
+	struct list_head page_list_head;
+	uintptr_t vma_start_addr;
+	uintptr_t vma_end_addr;
+};
+
+struct page_list_item {
+	struct list_head list;
+	struct page *page;
+};
+
 struct pid_entry {
 	const char *name;
 	unsigned int len;
@@ -926,12 +937,137 @@ static int mem_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static void mem_vma_close(struct vm_area_struct *vma)
+{
+	struct vma_info *info;
+	struct page_list_item *item, *tmp;
+
+	info = vma->vm_private_data;
+
+	if (info) {
+		/* If the VMA has been split, skip cleanup and warn instead */
+		if (info->vma_start_addr == vma->vm_start &&
+			info->vma_end_addr == vma->vm_end) {
+			/* Iterate over the list and free each item and call put_page */
+			list_for_each_entry_safe(item, tmp,
+						 &info->page_list_head, list) {
+				list_del(&item->list);
+				put_page(item->page);
+				kfree(item);
+			}
+
+			kfree(info);
+			vma->vm_private_data = NULL;
+		} else {
+			pr_warn("%s: VMA has been split, operation not supported\n", __func__);
+		}
+	}
+}
+
+static const struct vm_operations_struct mem_vm_ops = {
+	.close = mem_vma_close,
+};
+
+/**
+ * mem_mmap - mmap handler for /proc/<pid>/mem
+ * @file: the opened /proc/<pid>/mem file
+ * @vma: the VMA being set up
+ *
+ * Assumptions and limitations:
+ * - Reverse mapping is not handled, so the mapped pages cannot be swapped.
+ * - The VMA is not expected to be split by a partial munmap.
+ */
+static int mem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	uintptr_t addr, target_start_addr, target_end_addr;
+	struct page_list_item *item;
+	struct page *page, *zero_page;
+	unsigned long zero_page_pfn;
+	struct vma_info *info;
+	long pinned;
+	int ret;
+
+	/* Retrieve the mm of the target process */
+	struct mm_struct *mm = file->private_data;
+	size_t size = vma->vm_end - vma->vm_start;
+	uintptr_t start_addr = vma->vm_start;
+
+	target_start_addr = vma->vm_pgoff << PAGE_SHIFT; /* Multiply by PAGE_SIZE */
+	target_end_addr = target_start_addr + size;
+
+	if (!mm)
+		return -EINVAL;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+	INIT_LIST_HEAD(&info->page_list_head);
+	info->vma_start_addr = vma->vm_start;
+	info->vma_end_addr = vma->vm_end;
+
+	vma->vm_private_data = info;
+	vma->vm_ops = &mem_vm_ops;
+
+	zero_page = ZERO_PAGE(0);
+	zero_page_pfn = page_to_pfn(zero_page);
+
+	/* Acquire the mmap_lock before pinning the page (get_user_pages_remote) */
+	down_read(&mm->mmap_lock);
+
+	for (addr = target_start_addr; addr < target_end_addr; addr += PAGE_SIZE) {
+		unsigned long pfn;
+
+		/* Pin the user page */
+		pinned = get_user_pages_remote(mm, addr, 1, FOLL_GET | FOLL_NOFAULT,
+						&page, NULL, NULL);
+		/* Page not resident (FOLL_NOFAULT): map the shared zero page instead */
+		if (pinned <= 0) {
+			ret = remap_pfn_range(vma, start_addr, zero_page_pfn, PAGE_SIZE,
+					vma->vm_page_prot);
+			if (ret)
+				goto err_unlock;
+			start_addr += PAGE_SIZE;
+			continue;
+		}
+
+		/* Track each pinned page so the pin can be dropped on close */
+		item = kmalloc(sizeof(*item), GFP_KERNEL);
+		if (!item) {
+			put_page(page);
+			ret = -ENOMEM;
+			goto err_unlock;
+		}
+
+		item->page = page;
+		list_add(&item->list, &info->page_list_head);
+		pfn = page_to_pfn(page);
+
+		/* Remap the page frame under the current vma */
+		ret = remap_pfn_range(vma, start_addr, pfn, PAGE_SIZE,
+					vma->vm_page_prot);
+		if (ret)
+			goto err_unlock;
+
+		start_addr += PAGE_SIZE;
+	}
+
+	up_read(&mm->mmap_lock);
+	return 0;
+
+err_unlock:
+	up_read(&mm->mmap_lock);
+	/* ->close is not called when ->mmap fails, so drop the pins here */
+	mem_vma_close(vma);
+	return ret;
+}
+
 static const struct file_operations proc_mem_operations = {
 	.llseek		= mem_lseek,
 	.read		= mem_read,
 	.write		= mem_write,
 	.open		= mem_open,
 	.release	= mem_release,
+	.mmap		= mem_mmap,
 };
 
 static int environ_open(struct inode *inode, struct file *file)
-- 
2.35.6