[RFC PATCH v2 05/10] kvm: gmem: Refcount internal accesses to gmem

Patrick Roy posted 10 patches 2 months, 2 weeks ago
[RFC PATCH v2 05/10] kvm: gmem: Refcount internal accesses to gmem
Posted by Patrick Roy 2 months, 2 weeks ago
Currently, if KVM_GMEM_NO_DIRECT_MAP is set and KVM wants to
internally access a gmem folio, KVM needs to reinsert the folio into the
direct map, and hold the folio lock until KVM is done using the folio
(and the folio is removed from the direct map again).

This means that long-term reinsertion into the direct map and
concurrent accesses to the same gmem folio are currently impossible.
These are needed, however, for data structures of paravirtual devices,
such as kvm-clock, which are shared between guest and host via guest
memory pages (and multiple vCPUs can put their kvm-clock data into the
same guest page).

Thus, introduce the concept of a "sharing refcount", which gets
incremented on every call to kvm_gmem_get_pfn with
KVM_GMEM_GET_PFN_SHARED set. Direct map manipulations are only done when
the first reference is grabbed (direct map entries are restored), or when
the last reference goes away (direct map entries are removed). While
holding a sharing reference, the folio lock may be dropped, as the
refcounting ensures that the direct map entry will not be removed as
long as at least one reference is held. However, whoever is holding a
reference will need to listen and respond to gmem invalidation events
(such as the page being in the process of being fallocated away).

Since refcount_t does not play nicely with references dropping to 0 and
later being raised again (it will WARN), we use a refcount of 1 to mean
"no sharing references held anywhere, folio not in direct map".

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 virt/kvm/guest_memfd.c | 61 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index f637abc6045ba..6772253497e4d 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -60,10 +60,37 @@ static bool kvm_gmem_test_accessible(struct kvm *kvm)
 	return kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM;
 }
 
+static int kvm_gmem_init_sharing_count(struct folio *folio)
+{
+	refcount_t *sharing_count = kmalloc(sizeof(*sharing_count), GFP_KERNEL);
+
+	if (!sharing_count)
+		return -ENOMEM;
+
+	/*
+	 * The count is biased by one: sharing_count == 1 means "no sharing
+	 * references held". Dropping a refcount_t to 0 and later incrementing
+	 * it again would result in a WARN.
+	 */
+	refcount_set(sharing_count, 1);
+	folio_change_private(folio, (void *)sharing_count);
+
+	return 0;
+}
+
 static int kvm_gmem_folio_set_private(struct folio *folio)
 {
 	unsigned long start, npages, i;
 	int r;
+	unsigned int sharing_refcount = refcount_read(folio_get_private(folio));
+
+	/*
+	 * We must only remove direct map entries after the last internal
+	 * reference has gone away, i.e. after the refcount dropped back
+	 * to 1.
+	 */
+	WARN_ONCE(sharing_refcount != 1, "%d unexpected sharing_refcounts pfn=%lx",
+		  sharing_refcount - 1, folio_pfn(folio));
 
 	start = (unsigned long) folio_address(folio);
 	npages = folio_nr_pages(folio);
@@ -97,6 +124,15 @@ static int kvm_gmem_folio_clear_private(struct folio *folio)
 {
 	unsigned long npages, i;
 	int r = 0;
+	unsigned int sharing_refcount = refcount_read(folio_get_private(folio));
+
+	/*
+	 * We must restore direct map entries on acquiring the first "sharing
+	 * reference". The refcount is lifted _after_ the call to
+	 * kvm_gmem_folio_clear_private, so it will still be 1 here.
+	 */
+	WARN_ONCE(sharing_refcount != 1, "%d unexpected sharing_refcounts pfn=%lx",
+		  sharing_refcount - 1, folio_pfn(folio));
 
 	npages = folio_nr_pages(folio);
 
@@ -156,13 +192,21 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, unsi
 
 	if (folio_test_private(folio) && share) {
 		r = kvm_gmem_folio_clear_private(folio);
-	} else if (!folio_test_private(folio) && !share) {
-		r = kvm_gmem_folio_set_private(folio);
+	} else if (!folio_test_private(folio)) {
+		r = kvm_gmem_init_sharing_count(folio);
+		if (r)
+			goto out_err;
+
+		if (!share)
+			r = kvm_gmem_folio_set_private(folio);
 	}
 
 	if (r)
 		goto out_err;
 
+	if (share)
+		refcount_inc(folio_get_private(folio));
+
 out:
 	/*
 	 * Ignore accessed, referenced, and dirty flags.  The memory is
@@ -429,7 +473,10 @@ static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *fol
 static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t end)
 {
 	if (start == 0 && end == folio_size(folio)) {
+		refcount_t *sharing_count = folio_get_private(folio);
+
 		kvm_gmem_folio_clear_private(folio);
+		kfree(sharing_count);
 	}
 }
 
@@ -699,12 +746,20 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
 
 int kvm_gmem_put_shared_pfn(kvm_pfn_t pfn) {
+	int r = 0;
 	struct folio *folio = pfn_folio(pfn);
+	refcount_t *sharing_count;
 
 	if (!kvm_gmem_test_no_direct_map(folio_inode(folio)))
 		return 0;
 
-	return kvm_gmem_folio_set_private(folio);
+	sharing_count = folio_get_private(folio);
+	refcount_dec(sharing_count);
+
+	if (refcount_read(sharing_count) == 1)
+		r = kvm_gmem_folio_set_private(folio);
+
+	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_gmem_put_shared_pfn);
 
-- 
2.46.0