From: Mukesh Rathor <mrathor@linux.microsoft.com>
Upon guest access, in case of a missing mmio mapping, the hypervisor
generates an unmapped gpa intercept. In this path, look up the PCI
resource pfn for the guest gpa and ask the hypervisor to map it
via a hypercall. The PCI resource pfn is maintained by the VFIO driver
and obtained via a fixup_user_fault() call (similar to KVM).
Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
1 file changed, 115 insertions(+)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 03f3aa9f5541..4c8bc7cd0888 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -56,6 +56,14 @@ struct hv_stats_page {
};
} __packed;
+bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
+static int __init setup_hv_full_mmio(char *str)
+{
+ hv_nofull_mmio = true;
+ return 0;
+}
+__setup("hv_nofull_mmio", setup_hv_full_mmio);
+
struct mshv_root mshv_root;
enum hv_scheduler_type hv_scheduler_type;
@@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
}
#ifdef CONFIG_X86_64
+
+/*
+ * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
+ * else just return -errno.
+ */
+static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
+ u64 *mmio_pfnp)
+{
+ struct vm_area_struct *vma;
+ bool is_mmio;
+ u64 uaddr;
+ struct mshv_mem_region *mreg;
+ struct follow_pfnmap_args pfnmap_args;
+ int rc = -EINVAL;
+
+ /*
+ * Do not allow mem region to be deleted beneath us. VFIO uses
+ * useraddr vma to lookup pci bar pfn.
+ */
+ spin_lock(&pt->pt_mem_regions_lock);
+
+ /* Get the region again under the lock */
+ mreg = mshv_partition_region_by_gfn(pt, gfn);
+ if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
+ goto unlock_pt_out;
+
+ uaddr = mreg->start_uaddr +
+ ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
+
+ mmap_read_lock(current->mm);
+ vma = vma_lookup(current->mm, uaddr);
+ is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
+ if (!is_mmio)
+ goto unlock_mmap_out;
+
+ pfnmap_args.vma = vma;
+ pfnmap_args.address = uaddr;
+
+ rc = follow_pfnmap_start(&pfnmap_args);
+ if (rc) {
+ rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
+ NULL);
+ if (rc)
+ goto unlock_mmap_out;
+
+ rc = follow_pfnmap_start(&pfnmap_args);
+ if (rc)
+ goto unlock_mmap_out;
+ }
+
+ *mmio_pfnp = pfnmap_args.pfn;
+ follow_pfnmap_end(&pfnmap_args);
+
+unlock_mmap_out:
+ mmap_read_unlock(current->mm);
+unlock_pt_out:
+ spin_unlock(&pt->pt_mem_regions_lock);
+ return rc;
+}
+
+/*
+ * At present, the only unmapped gpa is mmio space. Verify if it's mmio
+ * and resolve if possible.
+ * Returns: True if valid mmio intercept and it was handled, else false
+ */
+static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
+{
+ struct hv_message *hvmsg = vp->vp_intercept_msg_page;
+ struct hv_x64_memory_intercept_message *msg;
+ union hv_x64_memory_access_info accinfo;
+ u64 gfn, mmio_spa, numpgs;
+ struct mshv_mem_region *mreg;
+ int rc;
+ struct mshv_partition *pt = vp->vp_partition;
+
+ msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
+ accinfo = msg->memory_access_info;
+
+ if (!accinfo.gva_gpa_valid)
+ return false;
+
+ /* Do a fast check and bail if non mmio intercept */
+ gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
+ mreg = mshv_partition_region_by_gfn(pt, gfn);
+ if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
+ return false;
+
+ rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
+ if (rc)
+ return false;
+
+ if (!hv_nofull_mmio) { /* default case */
+ gfn = mreg->start_gfn;
+ mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
+ numpgs = mreg->nr_pages;
+ } else
+ numpgs = 1;
+
+ rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
+
+ return rc == 0;
+}
+
static struct mshv_mem_region *
mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
{
@@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
return ret;
}
+
#else /* CONFIG_X86_64 */
+static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
#endif /* CONFIG_X86_64 */
static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
{
switch (vp->vp_intercept_msg_page->header.message_type) {
+ case HVMSG_UNMAPPED_GPA:
+ return mshv_handle_unmapped_gpa(vp);
case HVMSG_GPA_INTERCEPT:
return mshv_handle_gpa_intercept(vp);
}
--
2.51.2.vfs.0.1
On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>
> Upon guest access, in case of missing mmio mapping, the hypervisor
> generates an unmapped gpa intercept. In this path, lookup the PCI
> resource pfn for the guest gpa, and ask the hypervisor to map it
> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> and obtained via fixup_user_fault call (similar to KVM).
>
> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> ---
> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 115 insertions(+)
>
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index 03f3aa9f5541..4c8bc7cd0888 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -56,6 +56,14 @@ struct hv_stats_page {
> };
> } __packed;
>
> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> +static int __init setup_hv_full_mmio(char *str)
> +{
> + hv_nofull_mmio = true;
> + return 0;
> +}
> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> +
> struct mshv_root mshv_root;
>
> enum hv_scheduler_type hv_scheduler_type;
> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> }
>
> #ifdef CONFIG_X86_64
> +
> +/*
> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> + * else just return -errno.
> + */
> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> + u64 *mmio_pfnp)
> +{
> + struct vm_area_struct *vma;
> + bool is_mmio;
> + u64 uaddr;
> + struct mshv_mem_region *mreg;
> + struct follow_pfnmap_args pfnmap_args;
> + int rc = -EINVAL;
> +
> + /*
> + * Do not allow mem region to be deleted beneath us. VFIO uses
> + * useraddr vma to lookup pci bar pfn.
> + */
> + spin_lock(&pt->pt_mem_regions_lock);
> +
> + /* Get the region again under the lock */
> + mreg = mshv_partition_region_by_gfn(pt, gfn);
> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> + goto unlock_pt_out;
> +
> + uaddr = mreg->start_uaddr +
> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> +
> + mmap_read_lock(current->mm);
Semaphore can't be taken under spinlock.
Get it instead.
> + vma = vma_lookup(current->mm, uaddr);
> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
Why is this check needed again?
The region type is stored on the region itself.
And the type is checked on the caller side.
> + if (!is_mmio)
> + goto unlock_mmap_out;
> +
> + pfnmap_args.vma = vma;
> + pfnmap_args.address = uaddr;
> +
> + rc = follow_pfnmap_start(&pfnmap_args);
> + if (rc) {
> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> + NULL);
> + if (rc)
> + goto unlock_mmap_out;
> +
> + rc = follow_pfnmap_start(&pfnmap_args);
> + if (rc)
> + goto unlock_mmap_out;
> + }
> +
> + *mmio_pfnp = pfnmap_args.pfn;
> + follow_pfnmap_end(&pfnmap_args);
> +
> +unlock_mmap_out:
> + mmap_read_unlock(current->mm);
> +unlock_pt_out:
> + spin_unlock(&pt->pt_mem_regions_lock);
> + return rc;
> +}
> +
> +/*
> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> + * and resolve if possible.
> + * Returns: True if valid mmio intercept and it was handled, else false
> + */
> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> +{
> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> + struct hv_x64_memory_intercept_message *msg;
> + union hv_x64_memory_access_info accinfo;
> + u64 gfn, mmio_spa, numpgs;
> + struct mshv_mem_region *mreg;
> + int rc;
> + struct mshv_partition *pt = vp->vp_partition;
> +
> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> + accinfo = msg->memory_access_info;
> +
> + if (!accinfo.gva_gpa_valid)
> + return false;
> +
> + /* Do a fast check and bail if non mmio intercept */
> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> + mreg = mshv_partition_region_by_gfn(pt, gfn);
This call needs to be protected by the spinlock.
Thanks,
Stanislav
> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> + return false;
> +
> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> + if (rc)
> + return false;
> +
> + if (!hv_nofull_mmio) { /* default case */
> + gfn = mreg->start_gfn;
> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> + numpgs = mreg->nr_pages;
> + } else
> + numpgs = 1;
> +
> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> +
> + return rc == 0;
> +}
> +
> static struct mshv_mem_region *
> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> {
> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>
> return ret;
> }
> +
> #else /* CONFIG_X86_64 */
> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> #endif /* CONFIG_X86_64 */
>
> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> {
> switch (vp->vp_intercept_msg_page->header.message_type) {
> + case HVMSG_UNMAPPED_GPA:
> + return mshv_handle_unmapped_gpa(vp);
> case HVMSG_GPA_INTERCEPT:
> return mshv_handle_gpa_intercept(vp);
> }
> --
> 2.51.2.vfs.0.1
>
On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Upon guest access, in case of missing mmio mapping, the hypervisor
>> generates an unmapped gpa intercept. In this path, lookup the PCI
>> resource pfn for the guest gpa, and ask the hypervisor to map it
>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>> and obtained via fixup_user_fault call (similar to KVM).
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>> 1 file changed, 115 insertions(+)
>>
>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>> index 03f3aa9f5541..4c8bc7cd0888 100644
>> --- a/drivers/hv/mshv_root_main.c
>> +++ b/drivers/hv/mshv_root_main.c
>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>> };
>> } __packed;
>>
>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>> +static int __init setup_hv_full_mmio(char *str)
>> +{
>> + hv_nofull_mmio = true;
>> + return 0;
>> +}
>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>> +
>> struct mshv_root mshv_root;
>>
>> enum hv_scheduler_type hv_scheduler_type;
>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>> }
>>
>> #ifdef CONFIG_X86_64
>> +
>> +/*
>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>> + * else just return -errno.
>> + */
>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>> + u64 *mmio_pfnp)
>> +{
>> + struct vm_area_struct *vma;
>> + bool is_mmio;
>> + u64 uaddr;
>> + struct mshv_mem_region *mreg;
>> + struct follow_pfnmap_args pfnmap_args;
>> + int rc = -EINVAL;
>> +
>> + /*
>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>> + * useraddr vma to lookup pci bar pfn.
>> + */
>> + spin_lock(&pt->pt_mem_regions_lock);
>> +
>> + /* Get the region again under the lock */
>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>> + goto unlock_pt_out;
>> +
>> + uaddr = mreg->start_uaddr +
>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>> +
>> + mmap_read_lock(current->mm);
>
> Semaphore can't be taken under spinlock.
> Get it instead.
Yeah, something didn't feel right here and I meant to recheck; now I regret
rushing to submit the patch.
Rethinking this, I believe pt_mem_regions_lock is not needed to protect
the uaddr, because unmap will properly serialize via the mm lock.
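For illustration, one way this could be reworked so the mmap semaphore is
never taken under the spinlock (an untested sketch only, reusing the names
from this patch, not the actual respin):

static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
				       u64 *mmio_pfnp)
{
	struct follow_pfnmap_args pfnmap_args = {};
	struct mshv_mem_region *mreg;
	struct vm_area_struct *vma;
	u64 uaddr;
	int rc = -EINVAL;

	/* Hold the spinlock only long enough to compute uaddr from the region */
	spin_lock(&pt->pt_mem_regions_lock);
	mreg = mshv_partition_region_by_gfn(pt, gfn);
	if (!mreg || mreg->type != MSHV_REGION_TYPE_MMIO) {
		spin_unlock(&pt->pt_mem_regions_lock);
		return -EINVAL;
	}
	uaddr = mreg->start_uaddr +
		((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
	spin_unlock(&pt->pt_mem_regions_lock);

	/* The vma walk is serialized by the mmap lock, taken on its own */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, uaddr);
	if (!vma || !(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto unlock_mmap_out;

	pfnmap_args.vma = vma;
	pfnmap_args.address = uaddr;

	rc = follow_pfnmap_start(&pfnmap_args);
	if (rc) {
		rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
				      NULL);
		if (!rc)
			rc = follow_pfnmap_start(&pfnmap_args);
		if (rc)
			goto unlock_mmap_out;
	}

	*mmio_pfnp = pfnmap_args.pfn;
	follow_pfnmap_end(&pfnmap_args);

unlock_mmap_out:
	mmap_read_unlock(current->mm);
	return rc;
}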
>> + vma = vma_lookup(current->mm, uaddr);
>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>
> Why this check is needed again?
To make sure the region did not change. This check is done under the lock.
> The region type is stored on the region itself.
> And the type is checked on the caller side.
>
>> + if (!is_mmio)
>> + goto unlock_mmap_out;
>> +
>> + pfnmap_args.vma = vma;
>> + pfnmap_args.address = uaddr;
>> +
>> + rc = follow_pfnmap_start(&pfnmap_args);
>> + if (rc) {
>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>> + NULL);
>> + if (rc)
>> + goto unlock_mmap_out;
>> +
>> + rc = follow_pfnmap_start(&pfnmap_args);
>> + if (rc)
>> + goto unlock_mmap_out;
>> + }
>> +
>> + *mmio_pfnp = pfnmap_args.pfn;
>> + follow_pfnmap_end(&pfnmap_args);
>> +
>> +unlock_mmap_out:
>> + mmap_read_unlock(current->mm);
>> +unlock_pt_out:
>> + spin_unlock(&pt->pt_mem_regions_lock);
>> + return rc;
>> +}
>> +
>> +/*
>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>> + * and resolve if possible.
>> + * Returns: True if valid mmio intercept and it was handled, else false
>> + */
>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>> +{
>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>> + struct hv_x64_memory_intercept_message *msg;
>> + union hv_x64_memory_access_info accinfo;
>> + u64 gfn, mmio_spa, numpgs;
>> + struct mshv_mem_region *mreg;
>> + int rc;
>> + struct mshv_partition *pt = vp->vp_partition;
>> +
>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>> + accinfo = msg->memory_access_info;
>> +
>> + if (!accinfo.gva_gpa_valid)
>> + return false;
>> +
>> + /* Do a fast check and bail if non mmio intercept */
>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>
> This call needs to be protected by the spinlock.
This is sort of a fast path to bail early. We recheck under the partition lock above.
Thanks,
-Mukesh
> Thanks,
> Stanislav
>
>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>> + return false;
>> +
>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>> + if (rc)
>> + return false;
>> +
>> + if (!hv_nofull_mmio) { /* default case */
>> + gfn = mreg->start_gfn;
>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>> + numpgs = mreg->nr_pages;
>> + } else
>> + numpgs = 1;
>> +
>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>> +
>> + return rc == 0;
>> +}
>> +
>> static struct mshv_mem_region *
>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>> {
>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>
>> return ret;
>> }
>> +
>> #else /* CONFIG_X86_64 */
>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>> #endif /* CONFIG_X86_64 */
>>
>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>> {
>> switch (vp->vp_intercept_msg_page->header.message_type) {
>> + case HVMSG_UNMAPPED_GPA:
>> + return mshv_handle_unmapped_gpa(vp);
>> case HVMSG_GPA_INTERCEPT:
>> return mshv_handle_gpa_intercept(vp);
>> }
>> --
>> 2.51.2.vfs.0.1
>>
On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > >
> > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > and obtained via fixup_user_fault call (similar to KVM).
> > >
> > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > ---
> > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 115 insertions(+)
> > >
> > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > --- a/drivers/hv/mshv_root_main.c
> > > +++ b/drivers/hv/mshv_root_main.c
> > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > };
> > > } __packed;
> > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > +static int __init setup_hv_full_mmio(char *str)
> > > +{
> > > + hv_nofull_mmio = true;
> > > + return 0;
> > > +}
> > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > +
> > > struct mshv_root mshv_root;
> > > enum hv_scheduler_type hv_scheduler_type;
> > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > }
> > > #ifdef CONFIG_X86_64
> > > +
> > > +/*
> > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > + * else just return -errno.
> > > + */
> > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > + u64 *mmio_pfnp)
> > > +{
> > > + struct vm_area_struct *vma;
> > > + bool is_mmio;
> > > + u64 uaddr;
> > > + struct mshv_mem_region *mreg;
> > > + struct follow_pfnmap_args pfnmap_args;
> > > + int rc = -EINVAL;
> > > +
> > > + /*
> > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > + * useraddr vma to lookup pci bar pfn.
> > > + */
> > > + spin_lock(&pt->pt_mem_regions_lock);
> > > +
> > > + /* Get the region again under the lock */
> > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > + goto unlock_pt_out;
> > > +
> > > + uaddr = mreg->start_uaddr +
> > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > +
> > > + mmap_read_lock(current->mm);
> >
> > Semaphore can't be taken under spinlock.
>
> Yeah, something didn't feel right here and I meant to recheck, now regret
> rushing to submit the patch.
>
> Rethinking, I think the pt_mem_regions_lock is not needed to protect
> the uaddr because unmap will properly serialize via the mm lock.
>
>
> > > + vma = vma_lookup(current->mm, uaddr);
> > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> >
> > Why this check is needed again?
>
> To make sure region did not change. This check is under lock.
>
How can this happen? One can't change a VMA's type without unmapping it
first. And unmapping it leads to kernel MMIO region state dangling
around without a corresponding user space mapping.
This is similar to dangling pinned regions and should likely be
addressed the same way, by utilizing MMU notifiers to destroy memory
regions when the VMA is detached.
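For illustration, a minimal sketch of the kind of hook I mean (the
pt_mmu_notifier member, mshv_partition_destroy_mmio_regions() and the
registration point are hypothetical, not something in this series):

/* needs <linux/mmu_notifier.h> */
static int mshv_mn_invalidate_range_start(struct mmu_notifier *mn,
					  const struct mmu_notifier_range *range)
{
	struct mshv_partition *pt =
		container_of(mn, struct mshv_partition, pt_mmu_notifier);

	/* Drop kernel-side MMIO regions backed by [range->start, range->end) */
	mshv_partition_destroy_mmio_regions(pt, range->start, range->end);
	return 0;
}

static const struct mmu_notifier_ops mshv_mn_ops = {
	.invalidate_range_start = mshv_mn_invalidate_range_start,
};

/* Registered once per partition against the VMM's mm, e.g. at create time */
static int mshv_partition_register_mn(struct mshv_partition *pt)
{
	pt->pt_mmu_notifier.ops = &mshv_mn_ops;
	return mmu_notifier_register(&pt->pt_mmu_notifier, current->mm);
}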
> > The region type is stored on the region itself.
> > And the type is checked on the caller side.
> >
> > > + if (!is_mmio)
> > > + goto unlock_mmap_out;
> > > +
> > > + pfnmap_args.vma = vma;
> > > + pfnmap_args.address = uaddr;
> > > +
> > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > + if (rc) {
> > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > + NULL);
> > > + if (rc)
> > > + goto unlock_mmap_out;
> > > +
> > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > + if (rc)
> > > + goto unlock_mmap_out;
> > > + }
> > > +
> > > + *mmio_pfnp = pfnmap_args.pfn;
> > > + follow_pfnmap_end(&pfnmap_args);
> > > +
> > > +unlock_mmap_out:
> > > + mmap_read_unlock(current->mm);
> > > +unlock_pt_out:
> > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > + return rc;
> > > +}
> > > +
> > > +/*
> > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > + * and resolve if possible.
> > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > + */
> > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > +{
> > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > + struct hv_x64_memory_intercept_message *msg;
> > > + union hv_x64_memory_access_info accinfo;
> > > + u64 gfn, mmio_spa, numpgs;
> > > + struct mshv_mem_region *mreg;
> > > + int rc;
> > > + struct mshv_partition *pt = vp->vp_partition;
> > > +
> > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > + accinfo = msg->memory_access_info;
> > > +
> > > + if (!accinfo.gva_gpa_valid)
> > > + return false;
> > > +
> > > + /* Do a fast check and bail if non mmio intercept */
> > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> >
> > This call needs to be protected by the spinlock.
>
> This is sorta fast path to bail. We recheck under partition lock above.
>
Accessing the list of regions without lock is unsafe.
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > Thanks,
> > Stanislav
> >
> > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > + return false;
> > > +
> > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > + if (rc)
> > > + return false;
> > > +
> > > + if (!hv_nofull_mmio) { /* default case */
> > > + gfn = mreg->start_gfn;
> > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > + numpgs = mreg->nr_pages;
> > > + } else
> > > + numpgs = 1;
> > > +
> > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > +
> > > + return rc == 0;
> > > +}
> > > +
> > > static struct mshv_mem_region *
> > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > {
> > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > return ret;
> > > }
> > > +
> > > #else /* CONFIG_X86_64 */
> > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > #endif /* CONFIG_X86_64 */
> > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > {
> > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > + case HVMSG_UNMAPPED_GPA:
> > > + return mshv_handle_unmapped_gpa(vp);
> > > case HVMSG_GPA_INTERCEPT:
> > > return mshv_handle_gpa_intercept(vp);
> > > }
> > > --
> > > 2.51.2.vfs.0.1
> > >
On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
>> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
>>> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>>>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>
>>>> Upon guest access, in case of missing mmio mapping, the hypervisor
>>>> generates an unmapped gpa intercept. In this path, lookup the PCI
>>>> resource pfn for the guest gpa, and ask the hypervisor to map it
>>>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>>>> and obtained via fixup_user_fault call (similar to KVM).
>>>>
>>>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>> ---
>>>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 115 insertions(+)
>>>>
>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>> index 03f3aa9f5541..4c8bc7cd0888 100644
>>>> --- a/drivers/hv/mshv_root_main.c
>>>> +++ b/drivers/hv/mshv_root_main.c
>>>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>>> };
>>>> } __packed;
>>>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>>>> +static int __init setup_hv_full_mmio(char *str)
>>>> +{
>>>> + hv_nofull_mmio = true;
>>>> + return 0;
>>>> +}
>>>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>>>> +
>>>> struct mshv_root mshv_root;
>>>> enum hv_scheduler_type hv_scheduler_type;
>>>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>> }
>>>> #ifdef CONFIG_X86_64
>>>> +
>>>> +/*
>>>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>>>> + * else just return -errno.
>>>> + */
>>>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>>>> + u64 *mmio_pfnp)
>>>> +{
>>>> + struct vm_area_struct *vma;
>>>> + bool is_mmio;
>>>> + u64 uaddr;
>>>> + struct mshv_mem_region *mreg;
>>>> + struct follow_pfnmap_args pfnmap_args;
>>>> + int rc = -EINVAL;
>>>> +
>>>> + /*
>>>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>>>> + * useraddr vma to lookup pci bar pfn.
>>>> + */
>>>> + spin_lock(&pt->pt_mem_regions_lock);
>>>> +
>>>> + /* Get the region again under the lock */
>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>> + goto unlock_pt_out;
>>>> +
>>>> + uaddr = mreg->start_uaddr +
>>>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>>>> +
>>>> + mmap_read_lock(current->mm);
>>>
>>> Semaphore can't be taken under spinlock.
>
>>
>> Yeah, something didn't feel right here and I meant to recheck, now regret
>> rushing to submit the patch.
>>
>> Rethinking, I think the pt_mem_regions_lock is not needed to protect
>> the uaddr because unmap will properly serialize via the mm lock.
>>
>>
>>>> + vma = vma_lookup(current->mm, uaddr);
>>>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>>>
>>> Why this check is needed again?
>>
>> To make sure region did not change. This check is under lock.
>>
>
> How can this happen? One can't change VMA type without unmapping it
> first. And unmapping it leads to a kernel MMIO region state dangling
> around without corresponding user space mapping.
Right, and then vm_flags would not have the expected mmio flags.
> This is similar to dangling pinned regions and should likely be
> addressed the same way by utilizing MMU notifiers to destpoy memoty
> regions is VMA is detached.
I don't think we need that. Either it succeeds if the region did not
change at all, or just fails.
>>> The region type is stored on the region itself.
>>> And the type is checked on the caller side.
>>>
>>>> + if (!is_mmio)
>>>> + goto unlock_mmap_out;
>>>> +
>>>> + pfnmap_args.vma = vma;
>>>> + pfnmap_args.address = uaddr;
>>>> +
>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>> + if (rc) {
>>>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>>>> + NULL);
>>>> + if (rc)
>>>> + goto unlock_mmap_out;
>>>> +
>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>> + if (rc)
>>>> + goto unlock_mmap_out;
>>>> + }
>>>> +
>>>> + *mmio_pfnp = pfnmap_args.pfn;
>>>> + follow_pfnmap_end(&pfnmap_args);
>>>> +
>>>> +unlock_mmap_out:
>>>> + mmap_read_unlock(current->mm);
>>>> +unlock_pt_out:
>>>> + spin_unlock(&pt->pt_mem_regions_lock);
>>>> + return rc;
>>>> +}
>>>> +
>>>> +/*
>>>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>>>> + * and resolve if possible.
>>>> + * Returns: True if valid mmio intercept and it was handled, else false
>>>> + */
>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>>>> +{
>>>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>>>> + struct hv_x64_memory_intercept_message *msg;
>>>> + union hv_x64_memory_access_info accinfo;
>>>> + u64 gfn, mmio_spa, numpgs;
>>>> + struct mshv_mem_region *mreg;
>>>> + int rc;
>>>> + struct mshv_partition *pt = vp->vp_partition;
>>>> +
>>>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>>>> + accinfo = msg->memory_access_info;
>>>> +
>>>> + if (!accinfo.gva_gpa_valid)
>>>> + return false;
>>>> +
>>>> + /* Do a fast check and bail if non mmio intercept */
>>>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>
>>> This call needs to be protected by the spinlock.
>>
>> This is sorta fast path to bail. We recheck under partition lock above.
>>
>
> Accessing the list of regions without lock is unsafe.
I am not sure why. This check is done by a vcpu thread, so the regions
will not have just gone away.
Thanks,
-Mukesh
> Thanks,
> Stanislav
>
>> Thanks,
>> -Mukesh
>>
>>
>>> Thanks,
>>> Stanislav
>>>
>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>> + return false;
>>>> +
>>>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>>>> + if (rc)
>>>> + return false;
>>>> +
>>>> + if (!hv_nofull_mmio) { /* default case */
>>>> + gfn = mreg->start_gfn;
>>>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>>>> + numpgs = mreg->nr_pages;
>>>> + } else
>>>> + numpgs = 1;
>>>> +
>>>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>>>> +
>>>> + return rc == 0;
>>>> +}
>>>> +
>>>> static struct mshv_mem_region *
>>>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>> {
>>>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>> return ret;
>>>> }
>>>> +
>>>> #else /* CONFIG_X86_64 */
>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>> #endif /* CONFIG_X86_64 */
>>>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>> {
>>>> switch (vp->vp_intercept_msg_page->header.message_type) {
>>>> + case HVMSG_UNMAPPED_GPA:
>>>> + return mshv_handle_unmapped_gpa(vp);
>>>> case HVMSG_GPA_INTERCEPT:
>>>> return mshv_handle_gpa_intercept(vp);
>>>> }
>>>> --
>>>> 2.51.2.vfs.0.1
>>>>
On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
> On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> > On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> > > On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > > > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > >
> > > > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > > > and obtained via fixup_user_fault call (similar to KVM).
> > > > >
> > > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > ---
> > > > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > > > 1 file changed, 115 insertions(+)
> > > > >
> > > > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > > > --- a/drivers/hv/mshv_root_main.c
> > > > > +++ b/drivers/hv/mshv_root_main.c
> > > > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > > > };
> > > > > } __packed;
> > > > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > > > +static int __init setup_hv_full_mmio(char *str)
> > > > > +{
> > > > > + hv_nofull_mmio = true;
> > > > > + return 0;
> > > > > +}
> > > > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > > > +
> > > > > struct mshv_root mshv_root;
> > > > > enum hv_scheduler_type hv_scheduler_type;
> > > > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > > > }
> > > > > #ifdef CONFIG_X86_64
> > > > > +
> > > > > +/*
> > > > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > > > + * else just return -errno.
> > > > > + */
> > > > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > > > + u64 *mmio_pfnp)
> > > > > +{
> > > > > + struct vm_area_struct *vma;
> > > > > + bool is_mmio;
> > > > > + u64 uaddr;
> > > > > + struct mshv_mem_region *mreg;
> > > > > + struct follow_pfnmap_args pfnmap_args;
> > > > > + int rc = -EINVAL;
> > > > > +
> > > > > + /*
> > > > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > > > + * useraddr vma to lookup pci bar pfn.
> > > > > + */
> > > > > + spin_lock(&pt->pt_mem_regions_lock);
> > > > > +
> > > > > + /* Get the region again under the lock */
> > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > + goto unlock_pt_out;
> > > > > +
> > > > > + uaddr = mreg->start_uaddr +
> > > > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > > > +
> > > > > + mmap_read_lock(current->mm);
> > > >
> > > > Semaphore can't be taken under spinlock.
> >
> > >
> > > Yeah, something didn't feel right here and I meant to recheck, now regret
> > > rushing to submit the patch.
> > >
> > > Rethinking, I think the pt_mem_regions_lock is not needed to protect
> > > the uaddr because unmap will properly serialize via the mm lock.
> > >
> > >
> > > > > + vma = vma_lookup(current->mm, uaddr);
> > > > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > > >
> > > > Why this check is needed again?
> > >
> > > To make sure region did not change. This check is under lock.
> > >
> >
> > How can this happen? One can't change VMA type without unmapping it
> > first. And unmapping it leads to a kernel MMIO region state dangling
> > around without corresponding user space mapping.
>
> Right, and vm_flags would not be mmio expected then.
>
> > This is similar to dangling pinned regions and should likely be
> > addressed the same way by utilizing MMU notifiers to destpoy memoty
> > regions is VMA is detached.
>
> I don't think we need that. Either it succeeds if the region did not
> change at all, or just fails.
>
I'm afraid we do: if the driver mapped a page with the previous
memory region and then the region is unmapped, the page will stay
mapped in the hypervisor but will be considered free by the kernel,
which in turn will lead to a GPF upon the next allocation.
With pinned regions the issue is similar but less impactful: pages can't
be released by user space unmapping and thus will simply be leaked, but
the system stays intact.
MMIO regions are similar to movable regions in this regard: they don't
reference the user pages, and thus this guest region replacement is a
straight path to a kernel panic.
>
> > > > The region type is stored on the region itself.
> > > > And the type is checked on the caller side.
> > > >
> > > > > + if (!is_mmio)
> > > > > + goto unlock_mmap_out;
> > > > > +
> > > > > + pfnmap_args.vma = vma;
> > > > > + pfnmap_args.address = uaddr;
> > > > > +
> > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > + if (rc) {
> > > > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > > > + NULL);
> > > > > + if (rc)
> > > > > + goto unlock_mmap_out;
> > > > > +
> > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > + if (rc)
> > > > > + goto unlock_mmap_out;
> > > > > + }
> > > > > +
> > > > > + *mmio_pfnp = pfnmap_args.pfn;
> > > > > + follow_pfnmap_end(&pfnmap_args);
> > > > > +
> > > > > +unlock_mmap_out:
> > > > > + mmap_read_unlock(current->mm);
> > > > > +unlock_pt_out:
> > > > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > > > + return rc;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > > > + * and resolve if possible.
> > > > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > > > + */
> > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > > > +{
> > > > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > > > + struct hv_x64_memory_intercept_message *msg;
> > > > > + union hv_x64_memory_access_info accinfo;
> > > > > + u64 gfn, mmio_spa, numpgs;
> > > > > + struct mshv_mem_region *mreg;
> > > > > + int rc;
> > > > > + struct mshv_partition *pt = vp->vp_partition;
> > > > > +
> > > > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > > > + accinfo = msg->memory_access_info;
> > > > > +
> > > > > + if (!accinfo.gva_gpa_valid)
> > > > > + return false;
> > > > > +
> > > > > + /* Do a fast check and bail if non mmio intercept */
> > > > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > >
> > > > This call needs to be protected by the spinlock.
> > >
> > > This is sorta fast path to bail. We recheck under partition lock above.
> > >
> >
> > Accessing the list of regions without lock is unsafe.
>
> I am not sure why? This check is done by a vcpu thread, so regions
> will not have just gone away.
>
These are shared resources. Multiple VP threads get into this function
simultaneously, so there is a race already; but that one we can live
with without locking, as they don't mutate the list of regions.
The issue happens when the VMM adds or removes another region, as that
mutates the list and races with the VP threads doing this lookup.
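For illustration, a minimal sketch of what I mean (the helper name is made
up; the lock and fields are the ones from this patch). The fast path in
mshv_handle_unmapped_gpa() would call it and use the snapshotted values
instead of dereferencing mreg after the lock is dropped:

static bool mshv_mmio_region_snapshot(struct mshv_partition *pt, u64 gfn,
				      u64 *start_gfn, u64 *nr_pages)
{
	struct mshv_mem_region *mreg;
	bool found = false;

	spin_lock(&pt->pt_mem_regions_lock);
	mreg = mshv_partition_region_by_gfn(pt, gfn);
	if (mreg && mreg->type == MSHV_REGION_TYPE_MMIO) {
		*start_gfn = mreg->start_gfn;
		*nr_pages = mreg->nr_pages;
		found = true;
	}
	spin_unlock(&pt->pt_mem_regions_lock);

	return found;
}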
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > Thanks,
> > Stanislav
> >
> > > Thanks,
> > > -Mukesh
> > >
> > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > + return false;
> > > > > +
> > > > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > > > + if (rc)
> > > > > + return false;
> > > > > +
> > > > > + if (!hv_nofull_mmio) { /* default case */
> > > > > + gfn = mreg->start_gfn;
> > > > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > > > + numpgs = mreg->nr_pages;
> > > > > + } else
> > > > > + numpgs = 1;
> > > > > +
> > > > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > > > +
> > > > > + return rc == 0;
> > > > > +}
> > > > > +
> > > > > static struct mshv_mem_region *
> > > > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > > > {
> > > > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > > > return ret;
> > > > > }
> > > > > +
> > > > > #else /* CONFIG_X86_64 */
> > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > > > #endif /* CONFIG_X86_64 */
> > > > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > > > {
> > > > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > > > + case HVMSG_UNMAPPED_GPA:
> > > > > + return mshv_handle_unmapped_gpa(vp);
> > > > > case HVMSG_GPA_INTERCEPT:
> > > > > return mshv_handle_gpa_intercept(vp);
> > > > > }
> > > > > --
> > > > > 2.51.2.vfs.0.1
> > > > >
On 1/27/26 10:57, Stanislav Kinsburskii wrote:
> On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
>> On 1/26/26 10:15, Stanislav Kinsburskii wrote:
>>> On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
>>>> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
>>>>> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>>>>>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>
>>>>>> Upon guest access, in case of missing mmio mapping, the hypervisor
>>>>>> generates an unmapped gpa intercept. In this path, lookup the PCI
>>>>>> resource pfn for the guest gpa, and ask the hypervisor to map it
>>>>>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>>>>>> and obtained via fixup_user_fault call (similar to KVM).
>>>>>>
>>>>>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>> ---
>>>>>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>>>>> 1 file changed, 115 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>>>> index 03f3aa9f5541..4c8bc7cd0888 100644
>>>>>> --- a/drivers/hv/mshv_root_main.c
>>>>>> +++ b/drivers/hv/mshv_root_main.c
>>>>>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>>>>> };
>>>>>> } __packed;
>>>>>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>>>>>> +static int __init setup_hv_full_mmio(char *str)
>>>>>> +{
>>>>>> + hv_nofull_mmio = true;
>>>>>> + return 0;
>>>>>> +}
>>>>>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>>>>>> +
>>>>>> struct mshv_root mshv_root;
>>>>>> enum hv_scheduler_type hv_scheduler_type;
>>>>>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>>>> }
>>>>>> #ifdef CONFIG_X86_64
>>>>>> +
>>>>>> +/*
>>>>>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>>>>>> + * else just return -errno.
>>>>>> + */
>>>>>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>>>>>> + u64 *mmio_pfnp)
>>>>>> +{
>>>>>> + struct vm_area_struct *vma;
>>>>>> + bool is_mmio;
>>>>>> + u64 uaddr;
>>>>>> + struct mshv_mem_region *mreg;
>>>>>> + struct follow_pfnmap_args pfnmap_args;
>>>>>> + int rc = -EINVAL;
>>>>>> +
>>>>>> + /*
>>>>>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>>>>>> + * useraddr vma to lookup pci bar pfn.
>>>>>> + */
>>>>>> + spin_lock(&pt->pt_mem_regions_lock);
>>>>>> +
>>>>>> + /* Get the region again under the lock */
>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>> + goto unlock_pt_out;
>>>>>> +
>>>>>> + uaddr = mreg->start_uaddr +
>>>>>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>>>>>> +
>>>>>> + mmap_read_lock(current->mm);
>>>>>
>>>>> Semaphore can't be taken under spinlock.
>>>
>>>>
>>>> Yeah, something didn't feel right here and I meant to recheck, now regret
>>>> rushing to submit the patch.
>>>>
>>>> Rethinking, I think the pt_mem_regions_lock is not needed to protect
>>>> the uaddr because unmap will properly serialize via the mm lock.
>>>>
>>>>
>>>>>> + vma = vma_lookup(current->mm, uaddr);
>>>>>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>>>>>
>>>>> Why this check is needed again?
>>>>
>>>> To make sure region did not change. This check is under lock.
>>>>
>>>
>>> How can this happen? One can't change VMA type without unmapping it
>>> first. And unmapping it leads to a kernel MMIO region state dangling
>>> around without corresponding user space mapping.
>>
>> Right, and vm_flags would not be mmio expected then.
>>
>>> This is similar to dangling pinned regions and should likely be
>>> addressed the same way by utilizing MMU notifiers to destpoy memoty
>>> regions is VMA is detached.
>>
>> I don't think we need that. Either it succeeds if the region did not
>> change at all, or just fails.
>>
>
> I'm afraid we do, as if the driver mapped a page with the previous
> memory region, and then the region is unmapped, the page will stay
> mapped in the hypervisor, but will be considered free by kernel, which
> in turn will lead to GPF upn next allocation.
There are no ram pages for mmio regions. Also, we don't do much with
mmio regions other than tell the hyp about them.
Thanks,
-Mukesh
> With pinned regions we issue is similar but less impacting: pages can't
> be released by user space unmapping and thus will be simply leaked, but
> the system stays intact.
>
> MMIO regions are simila to movable region in this regard: they don't
> reference the user pages, and thus this guest region replaement is a
> stright wat to kernel panic.
>
>>
>>>>> The region type is stored on the region itself.
>>>>> And the type is checked on the caller side.
>>>>>
>>>>>> + if (!is_mmio)
>>>>>> + goto unlock_mmap_out;
>>>>>> +
>>>>>> + pfnmap_args.vma = vma;
>>>>>> + pfnmap_args.address = uaddr;
>>>>>> +
>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>> + if (rc) {
>>>>>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>>>>>> + NULL);
>>>>>> + if (rc)
>>>>>> + goto unlock_mmap_out;
>>>>>> +
>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>> + if (rc)
>>>>>> + goto unlock_mmap_out;
>>>>>> + }
>>>>>> +
>>>>>> + *mmio_pfnp = pfnmap_args.pfn;
>>>>>> + follow_pfnmap_end(&pfnmap_args);
>>>>>> +
>>>>>> +unlock_mmap_out:
>>>>>> + mmap_read_unlock(current->mm);
>>>>>> +unlock_pt_out:
>>>>>> + spin_unlock(&pt->pt_mem_regions_lock);
>>>>>> + return rc;
>>>>>> +}
>>>>>> +
>>>>>> +/*
>>>>>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>>>>>> + * and resolve if possible.
>>>>>> + * Returns: True if valid mmio intercept and it was handled, else false
>>>>>> + */
>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>>>>>> +{
>>>>>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>>>>>> + struct hv_x64_memory_intercept_message *msg;
>>>>>> + union hv_x64_memory_access_info accinfo;
>>>>>> + u64 gfn, mmio_spa, numpgs;
>>>>>> + struct mshv_mem_region *mreg;
>>>>>> + int rc;
>>>>>> + struct mshv_partition *pt = vp->vp_partition;
>>>>>> +
>>>>>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>>>>>> + accinfo = msg->memory_access_info;
>>>>>> +
>>>>>> + if (!accinfo.gva_gpa_valid)
>>>>>> + return false;
>>>>>> +
>>>>>> + /* Do a fast check and bail if non mmio intercept */
>>>>>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>
>>>>> This call needs to be protected by the spinlock.
>>>>
>>>> This is sorta fast path to bail. We recheck under partition lock above.
>>>>
>>>
>>> Accessing the list of regions without lock is unsafe.
>>
>> I am not sure why? This check is done by a vcpu thread, so regions
>> will not have just gone away.
>>
>
> This is shared resources. Multiple VP thread get into this function
> simultaneously, so there is a race already. But this one we can live
> with without locking as they don't mutate the list of the regions.
>
> The issue happens when VMM adds or removed another region as it mutates
> the list and races with VP threads doing this lookup.
>
> Thanks,
> Stanislav
>
>
>> Thanks,
>> -Mukesh
>>
>>
>>> Thanks,
>>> Stanislav
>>>
>>>> Thanks,
>>>> -Mukesh
>>>>
>>>>
>>>>> Thanks,
>>>>> Stanislav
>>>>>
>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>> + return false;
>>>>>> +
>>>>>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>>>>>> + if (rc)
>>>>>> + return false;
>>>>>> +
>>>>>> + if (!hv_nofull_mmio) { /* default case */
>>>>>> + gfn = mreg->start_gfn;
>>>>>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>>>>>> + numpgs = mreg->nr_pages;
>>>>>> + } else
>>>>>> + numpgs = 1;
>>>>>> +
>>>>>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>>>>>> +
>>>>>> + return rc == 0;
>>>>>> +}
>>>>>> +
>>>>>> static struct mshv_mem_region *
>>>>>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>>>> {
>>>>>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>>> return ret;
>>>>>> }
>>>>>> +
>>>>>> #else /* CONFIG_X86_64 */
>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>>>>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>>>> #endif /* CONFIG_X86_64 */
>>>>>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>>>> {
>>>>>> switch (vp->vp_intercept_msg_page->header.message_type) {
>>>>>> + case HVMSG_UNMAPPED_GPA:
>>>>>> + return mshv_handle_unmapped_gpa(vp);
>>>>>> case HVMSG_GPA_INTERCEPT:
>>>>>> return mshv_handle_gpa_intercept(vp);
>>>>>> }
>>>>>> --
>>>>>> 2.51.2.vfs.0.1
>>>>>>
On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
> On 1/27/26 10:57, Stanislav Kinsburskii wrote:
> > On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
> > > On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> > > > On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> > > > > On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > > > > > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > > > > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > >
> > > > > > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > > > > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > > > > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > > > > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > > > > > and obtained via fixup_user_fault call (similar to KVM).
> > > > > > >
> > > > > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > ---
> > > > > > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > > > > > 1 file changed, 115 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > > > > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > > > > > --- a/drivers/hv/mshv_root_main.c
> > > > > > > +++ b/drivers/hv/mshv_root_main.c
> > > > > > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > > > > > };
> > > > > > > } __packed;
> > > > > > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > > > > > +static int __init setup_hv_full_mmio(char *str)
> > > > > > > +{
> > > > > > > + hv_nofull_mmio = true;
> > > > > > > + return 0;
> > > > > > > +}
> > > > > > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > > > > > +
> > > > > > > struct mshv_root mshv_root;
> > > > > > > enum hv_scheduler_type hv_scheduler_type;
> > > > > > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > > > > > }
> > > > > > > #ifdef CONFIG_X86_64
> > > > > > > +
> > > > > > > +/*
> > > > > > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > > > > > + * else just return -errno.
> > > > > > > + */
> > > > > > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > > > > > + u64 *mmio_pfnp)
> > > > > > > +{
> > > > > > > + struct vm_area_struct *vma;
> > > > > > > + bool is_mmio;
> > > > > > > + u64 uaddr;
> > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > + struct follow_pfnmap_args pfnmap_args;
> > > > > > > + int rc = -EINVAL;
> > > > > > > +
> > > > > > > + /*
> > > > > > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > > > > > + * useraddr vma to lookup pci bar pfn.
> > > > > > > + */
> > > > > > > + spin_lock(&pt->pt_mem_regions_lock);
> > > > > > > +
> > > > > > > + /* Get the region again under the lock */
> > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > + goto unlock_pt_out;
> > > > > > > +
> > > > > > > + uaddr = mreg->start_uaddr +
> > > > > > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > > > > > +
> > > > > > > + mmap_read_lock(current->mm);
> > > > > >
> > > > > > Semaphore can't be taken under spinlock.
> > > >
> > > > >
> > > > > Yeah, something didn't feel right here and I meant to recheck, now regret
> > > > > rushing to submit the patch.
> > > > >
> > > > > Rethinking, I think the pt_mem_regions_lock is not needed to protect
> > > > > the uaddr because unmap will properly serialize via the mm lock.
> > > > >
> > > > >
> > > > > > > + vma = vma_lookup(current->mm, uaddr);
> > > > > > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > > > > >
> > > > > > Why this check is needed again?
> > > > >
> > > > > To make sure region did not change. This check is under lock.
> > > > >
> > > >
> > > > How can this happen? One can't change VMA type without unmapping it
> > > > first. And unmapping it leads to a kernel MMIO region state dangling
> > > > around without corresponding user space mapping.
> > >
> > > Right, and vm_flags would not be mmio expected then.
> > >
> > > > This is similar to dangling pinned regions and should likely be
> > > > addressed the same way by utilizing MMU notifiers to destpoy memoty
> > > > regions is VMA is detached.
> > >
> > > I don't think we need that. Either it succeeds if the region did not
> > > change at all, or just fails.
> > >
> >
> > I'm afraid we do, as if the driver mapped a page with the previous
> > memory region, and then the region is unmapped, the page will stay
> > mapped in the hypervisor, but will be considered free by kernel, which
> > in turn will lead to GPF upn next allocation.
>
> There are no ram pages for mmio regions. Also, we don't do much with
> mmio regions other than tell the hyp about it.
>
So, are you saying that the hypervisor does not use these pages and only
tracks them? That would make things easier.
However, if we later try to map a GPA that is already mapped, will the
hypervisor return an error?
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > With pinned regions we issue is similar but less impacting: pages can't
> > be released by user space unmapping and thus will be simply leaked, but
> > the system stays intact.
> >
> > MMIO regions are simila to movable region in this regard: they don't
> > reference the user pages, and thus this guest region replaement is a
> > stright wat to kernel panic.
> >
> > >
> > > > > > The region type is stored on the region itself.
> > > > > > And the type is checked on the caller side.
> > > > > >
> > > > > > > + if (!is_mmio)
> > > > > > > + goto unlock_mmap_out;
> > > > > > > +
> > > > > > > + pfnmap_args.vma = vma;
> > > > > > > + pfnmap_args.address = uaddr;
> > > > > > > +
> > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > + if (rc) {
> > > > > > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > > > > > + NULL);
> > > > > > > + if (rc)
> > > > > > > + goto unlock_mmap_out;
> > > > > > > +
> > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > + if (rc)
> > > > > > > + goto unlock_mmap_out;
> > > > > > > + }
> > > > > > > +
> > > > > > > + *mmio_pfnp = pfnmap_args.pfn;
> > > > > > > + follow_pfnmap_end(&pfnmap_args);
> > > > > > > +
> > > > > > > +unlock_mmap_out:
> > > > > > > + mmap_read_unlock(current->mm);
> > > > > > > +unlock_pt_out:
> > > > > > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > > > > > + return rc;
> > > > > > > +}
> > > > > > > +
> > > > > > > +/*
> > > > > > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > > > > > + * and resolve if possible.
> > > > > > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > > > > > + */
> > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > > > > > +{
> > > > > > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > > > > > + struct hv_x64_memory_intercept_message *msg;
> > > > > > > + union hv_x64_memory_access_info accinfo;
> > > > > > > + u64 gfn, mmio_spa, numpgs;
> > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > + int rc;
> > > > > > > + struct mshv_partition *pt = vp->vp_partition;
> > > > > > > +
> > > > > > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > > > > > + accinfo = msg->memory_access_info;
> > > > > > > +
> > > > > > > + if (!accinfo.gva_gpa_valid)
> > > > > > > + return false;
> > > > > > > +
> > > > > > > + /* Do a fast check and bail if non mmio intercept */
> > > > > > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > >
> > > > > > This call needs to be protected by the spinlock.
> > > > >
> > > > > This is sorta fast path to bail. We recheck under partition lock above.
> > > > >
> > > >
> > > > Accessing the list of regions without lock is unsafe.
> > >
> > > I am not sure why? This check is done by a vcpu thread, so regions
> > > will not have just gone away.
> > >
> >
> > This is shared resources. Multiple VP thread get into this function
> > simultaneously, so there is a race already. But this one we can live
> > with without locking as they don't mutate the list of the regions.
> >
> > The issue happens when VMM adds or removed another region as it mutates
> > the list and races with VP threads doing this lookup.
> >
> > Thanks,
> > Stanislav
> >
> >
> > > Thanks,
> > > -Mukesh
> > >
> > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > > > Thanks,
> > > > > -Mukesh
> > > > >
> > > > >
> > > > > > Thanks,
> > > > > > Stanislav
> > > > > >
> > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > + return false;
> > > > > > > +
> > > > > > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > > > > > + if (rc)
> > > > > > > + return false;
> > > > > > > +
> > > > > > > + if (!hv_nofull_mmio) { /* default case */
> > > > > > > + gfn = mreg->start_gfn;
> > > > > > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > > > > > + numpgs = mreg->nr_pages;
> > > > > > > + } else
> > > > > > > + numpgs = 1;
> > > > > > > +
> > > > > > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > > > > > +
> > > > > > > + return rc == 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > static struct mshv_mem_region *
> > > > > > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > > > > > {
> > > > > > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > > > > > return ret;
> > > > > > > }
> > > > > > > +
> > > > > > > #else /* CONFIG_X86_64 */
> > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > > > > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > > > > > #endif /* CONFIG_X86_64 */
> > > > > > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > > > > > {
> > > > > > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > > > > > + case HVMSG_UNMAPPED_GPA:
> > > > > > > + return mshv_handle_unmapped_gpa(vp);
> > > > > > > case HVMSG_GPA_INTERCEPT:
> > > > > > > return mshv_handle_gpa_intercept(vp);
> > > > > > > }
> > > > > > > --
> > > > > > > 2.51.2.vfs.0.1
> > > > > > >
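[Editor's sketch] The race called out above (a VP thread walking the region list while the VMM adds or removes a region) can be avoided by never using the region pointer outside the lock. Below is a minimal sketch of that approach, assuming pt_mem_regions_lock is the only lock protecting the list; mshv_region_snapshot() is a hypothetical helper name, not part of the patch.

/*
 * Hypothetical helper: look up the region under pt_mem_regions_lock and copy
 * out the fields the fault path needs, so the mshv_mem_region pointer is
 * never dereferenced after the lock is dropped.
 */
static bool mshv_region_snapshot(struct mshv_partition *pt, u64 gfn,
				 u64 *start_gfn, u64 *nr_pages)
{
	struct mshv_mem_region *mreg;
	bool found = false;

	spin_lock(&pt->pt_mem_regions_lock);
	mreg = mshv_partition_region_by_gfn(pt, gfn);
	if (mreg && mreg->type == MSHV_REGION_TYPE_MMIO) {
		*start_gfn = mreg->start_gfn;
		*nr_pages = mreg->nr_pages;
		found = true;
	}
	spin_unlock(&pt->pt_mem_regions_lock);

	return found;
}

mshv_handle_unmapped_gpa() would then compute gfn and numpgs from the copied start_gfn and nr_pages rather than reusing mreg after the lock is released.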
On 2/2/26 08:30, Stanislav Kinsburskii wrote:
> On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
>> On 1/27/26 10:57, Stanislav Kinsburskii wrote:
>>> On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
>>>> On 1/26/26 10:15, Stanislav Kinsburskii wrote:
>>>>> On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
>>>>>> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
>>>>>>> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>>>>>>>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>>
>>>>>>>> Upon guest access, in case of missing mmio mapping, the hypervisor
>>>>>>>> generates an unmapped gpa intercept. In this path, lookup the PCI
>>>>>>>> resource pfn for the guest gpa, and ask the hypervisor to map it
>>>>>>>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>>>>>>>> and obtained via fixup_user_fault call (similar to KVM).
>>>>>>>>
>>>>>>>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>> ---
>>>>>>>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>>>>>>> 1 file changed, 115 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>>>>>> index 03f3aa9f5541..4c8bc7cd0888 100644
>>>>>>>> --- a/drivers/hv/mshv_root_main.c
>>>>>>>> +++ b/drivers/hv/mshv_root_main.c
>>>>>>>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>>>>>>> };
>>>>>>>> } __packed;
>>>>>>>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>>>>>>>> +static int __init setup_hv_full_mmio(char *str)
>>>>>>>> +{
>>>>>>>> + hv_nofull_mmio = true;
>>>>>>>> + return 0;
>>>>>>>> +}
>>>>>>>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>>>>>>>> +
>>>>>>>> struct mshv_root mshv_root;
>>>>>>>> enum hv_scheduler_type hv_scheduler_type;
>>>>>>>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>>>>>> }
>>>>>>>> #ifdef CONFIG_X86_64
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>>>>>>>> + * else just return -errno.
>>>>>>>> + */
>>>>>>>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>>>>>>>> + u64 *mmio_pfnp)
>>>>>>>> +{
>>>>>>>> + struct vm_area_struct *vma;
>>>>>>>> + bool is_mmio;
>>>>>>>> + u64 uaddr;
>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>> + struct follow_pfnmap_args pfnmap_args;
>>>>>>>> + int rc = -EINVAL;
>>>>>>>> +
>>>>>>>> + /*
>>>>>>>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>>>>>>>> + * useraddr vma to lookup pci bar pfn.
>>>>>>>> + */
>>>>>>>> + spin_lock(&pt->pt_mem_regions_lock);
>>>>>>>> +
>>>>>>>> + /* Get the region again under the lock */
>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>> + goto unlock_pt_out;
>>>>>>>> +
>>>>>>>> + uaddr = mreg->start_uaddr +
>>>>>>>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>>>>>>>> +
>>>>>>>> + mmap_read_lock(current->mm);
>>>>>>>
>>>>>>> Semaphore can't be taken under spinlock.
>>>>>
>>>>>>
>>>>>> Yeah, something didn't feel right here and I meant to recheck, now regret
>>>>>> rushing to submit the patch.
>>>>>>
>>>>>> Rethinking, I think the pt_mem_regions_lock is not needed to protect
>>>>>> the uaddr because unmap will properly serialize via the mm lock.
>>>>>>
>>>>>>
>>>>>>>> + vma = vma_lookup(current->mm, uaddr);
>>>>>>>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>>>>>>>
>>>>>>> Why is this check needed again?
>>>>>>
>>>>>> To make sure region did not change. This check is under lock.
>>>>>>
>>>>>
>>>>> How can this happen? One can't change VMA type without unmapping it
>>>>> first. And unmapping it leads to a kernel MMIO region state dangling
>>>>> around without corresponding user space mapping.
>>>>
>>>> Right, and then vm_flags would not have the expected mmio flags.
>>>>
>>>>> This is similar to dangling pinned regions and should likely be
>>>>> addressed the same way by utilizing MMU notifiers to destroy memory
>>>>> regions if the VMA is detached.
>>>>
>>>> I don't think we need that. Either it succeeds if the region did not
>>>> change at all, or just fails.
>>>>
>>>
>>> I'm afraid we do: if the driver mapped a page with the previous
>>> memory region, and then the region is unmapped, the page will stay
>>> mapped in the hypervisor but will be considered free by the kernel, which
>>> in turn will lead to a GPF upon the next allocation.
>>
>> There are no ram pages for mmio regions. Also, we don't do much with
>> mmio regions other than tell the hyp about it.
>>
>
> So, are you saying that the hypervisor does not use these pages and only
> tracks them? That would make things easier.
> However, if we later try to map a GPA that is already mapped, will the
> hypervisor return an error?
Hypervisor does not return an error.
> Thanks,
> Stanislav
>
>> Thanks,
>> -Mukesh
>>
>>
>>> With pinned regions the issue is similar but less impactful: pages can't
>>> be released by user space unmapping and thus will be simply leaked, but
>>> the system stays intact.
>>>
>>> MMIO regions are similar to movable regions in this regard: they don't
>>> reference the user pages, and thus this guest region replacement is a
>>> straight way to a kernel panic.
>>>
>>>>
>>>>>>> The region type is stored on the region itself.
>>>>>>> And the type is checked on the caller side.
>>>>>>>
>>>>>>>> + if (!is_mmio)
>>>>>>>> + goto unlock_mmap_out;
>>>>>>>> +
>>>>>>>> + pfnmap_args.vma = vma;
>>>>>>>> + pfnmap_args.address = uaddr;
>>>>>>>> +
>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>> + if (rc) {
>>>>>>>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>>>>>>>> + NULL);
>>>>>>>> + if (rc)
>>>>>>>> + goto unlock_mmap_out;
>>>>>>>> +
>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>> + if (rc)
>>>>>>>> + goto unlock_mmap_out;
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> + *mmio_pfnp = pfnmap_args.pfn;
>>>>>>>> + follow_pfnmap_end(&pfnmap_args);
>>>>>>>> +
>>>>>>>> +unlock_mmap_out:
>>>>>>>> + mmap_read_unlock(current->mm);
>>>>>>>> +unlock_pt_out:
>>>>>>>> + spin_unlock(&pt->pt_mem_regions_lock);
>>>>>>>> + return rc;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>>>>>>>> + * and resolve if possible.
>>>>>>>> + * Returns: True if valid mmio intercept and it was handled, else false
>>>>>>>> + */
>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>>>>>>>> +{
>>>>>>>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>>>>>>>> + struct hv_x64_memory_intercept_message *msg;
>>>>>>>> + union hv_x64_memory_access_info accinfo;
>>>>>>>> + u64 gfn, mmio_spa, numpgs;
>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>> + int rc;
>>>>>>>> + struct mshv_partition *pt = vp->vp_partition;
>>>>>>>> +
>>>>>>>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>>>>>>>> + accinfo = msg->memory_access_info;
>>>>>>>> +
>>>>>>>> + if (!accinfo.gva_gpa_valid)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + /* Do a fast check and bail if non mmio intercept */
>>>>>>>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>
>>>>>>> This call needs to be protected by the spinlock.
>>>>>>
>>>>>> This is sorta fast path to bail. We recheck under partition lock above.
>>>>>>
>>>>>
>>>>> Accessing the list of regions without lock is unsafe.
>>>>
>>>> I am not sure why? This check is done by a vcpu thread, so regions
>>>> will not have just gone away.
>>>>
>>>
>>> This is a shared resource. Multiple VP threads get into this function
>>> simultaneously, so there is a race already. But this one we can live
>>> with without locking as they don't mutate the list of the regions.
>>>
>>> The issue happens when the VMM adds or removes another region, as it mutates
>>> the list and races with VP threads doing this lookup.
>>>
>>> Thanks,
>>> Stanislav
>>>
>>>
>>>> Thanks,
>>>> -Mukesh
>>>>
>>>>
>>>>> Thanks,
>>>>> Stanislav
>>>>>
>>>>>> Thanks,
>>>>>> -Mukesh
>>>>>>
>>>>>>
>>>>>>> Thanks,
>>>>>>> Stanislav
>>>>>>>
>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>>>>>>>> + if (rc)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + if (!hv_nofull_mmio) { /* default case */
>>>>>>>> + gfn = mreg->start_gfn;
>>>>>>>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>>>>>>>> + numpgs = mreg->nr_pages;
>>>>>>>> + } else
>>>>>>>> + numpgs = 1;
>>>>>>>> +
>>>>>>>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>>>>>>>> +
>>>>>>>> + return rc == 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static struct mshv_mem_region *
>>>>>>>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>>>>>> {
>>>>>>>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>>>>> return ret;
>>>>>>>> }
>>>>>>>> +
>>>>>>>> #else /* CONFIG_X86_64 */
>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>>>>>>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>>>>>> #endif /* CONFIG_X86_64 */
>>>>>>>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>>>>>> {
>>>>>>>> switch (vp->vp_intercept_msg_page->header.message_type) {
>>>>>>>> + case HVMSG_UNMAPPED_GPA:
>>>>>>>> + return mshv_handle_unmapped_gpa(vp);
>>>>>>>> case HVMSG_GPA_INTERCEPT:
>>>>>>>> return mshv_handle_gpa_intercept(vp);
>>>>>>>> }
>>>>>>>> --
>>>>>>>> 2.51.2.vfs.0.1
>>>>>>>>
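[Editor's sketch] On the lock-ordering point raised in this exchange (mmap_read_lock is a sleeping lock and cannot be taken under pt_mem_regions_lock), one possible shape is to resolve uaddr first and take only the mm lock around the VMA walk, relying on the mm lock to serialize against unmap as suggested above. A rough sketch under those assumptions; mshv_get_mmio_pfn() is a hypothetical name, not the patch's code.

/*
 * Hypothetical reshaping of mshv_chk_get_mmio_start_pfn(): the caller resolves
 * uaddr beforehand (e.g. from a snapshot of the region taken and dropped
 * earlier), so no spinlock is held while the mm lock is acquired.
 */
static int mshv_get_mmio_pfn(u64 uaddr, u64 *mmio_pfnp)
{
	struct follow_pfnmap_args args = {};
	struct vm_area_struct *vma;
	int rc = -EINVAL;

	mmap_read_lock(current->mm);

	vma = vma_lookup(current->mm, uaddr);
	if (!vma || !(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		goto out;

	args.vma = vma;
	args.address = uaddr;

	rc = follow_pfnmap_start(&args);
	if (rc) {
		/* Let VFIO populate the BAR PFN, then retry the walk once */
		rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE, NULL);
		if (!rc)
			rc = follow_pfnmap_start(&args);
		if (rc)
			goto out;
	}

	*mmio_pfnp = args.pfn;
	follow_pfnmap_end(&args);
out:
	mmap_read_unlock(current->mm);
	return rc;
}

The sketch keeps the fixup_user_fault() retry from the original patch so the PFN can be faulted in on first access.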
On Wed, Feb 04, 2026 at 02:52:54PM -0800, Mukesh R wrote:
> On 2/2/26 08:30, Stanislav Kinsburskii wrote:
> > On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
> > > On 1/27/26 10:57, Stanislav Kinsburskii wrote:
> > > > On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
> > > > > On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> > > > > > On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> > > > > > > On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > > > > > > > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > > > > > > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > >
> > > > > > > > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > > > > > > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > > > > > > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > > > > > > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > > > > > > > and obtained via fixup_user_fault call (similar to KVM).
> > > > > > > > >
> > > > > > > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > > ---
> > > > > > > > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > 1 file changed, 115 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > > > > > > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > > > > > > > --- a/drivers/hv/mshv_root_main.c
> > > > > > > > > +++ b/drivers/hv/mshv_root_main.c
> > > > > > > > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > > > > > > > };
> > > > > > > > > } __packed;
> > > > > > > > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > > > > > > > +static int __init setup_hv_full_mmio(char *str)
> > > > > > > > > +{
> > > > > > > > > + hv_nofull_mmio = true;
> > > > > > > > > + return 0;
> > > > > > > > > +}
> > > > > > > > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > > > > > > > +
> > > > > > > > > struct mshv_root mshv_root;
> > > > > > > > > enum hv_scheduler_type hv_scheduler_type;
> > > > > > > > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > > > > > > > }
> > > > > > > > > #ifdef CONFIG_X86_64
> > > > > > > > > +
> > > > > > > > > +/*
> > > > > > > > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > > > > > > > + * else just return -errno.
> > > > > > > > > + */
> > > > > > > > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > > > > > > > + u64 *mmio_pfnp)
> > > > > > > > > +{
> > > > > > > > > + struct vm_area_struct *vma;
> > > > > > > > > + bool is_mmio;
> > > > > > > > > + u64 uaddr;
> > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > + struct follow_pfnmap_args pfnmap_args;
> > > > > > > > > + int rc = -EINVAL;
> > > > > > > > > +
> > > > > > > > > + /*
> > > > > > > > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > > > > > > > + * useraddr vma to lookup pci bar pfn.
> > > > > > > > > + */
> > > > > > > > > + spin_lock(&pt->pt_mem_regions_lock);
> > > > > > > > > +
> > > > > > > > > + /* Get the region again under the lock */
> > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > + goto unlock_pt_out;
> > > > > > > > > +
> > > > > > > > > + uaddr = mreg->start_uaddr +
> > > > > > > > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > > > > > > > +
> > > > > > > > > + mmap_read_lock(current->mm);
> > > > > > > >
> > > > > > > > Semaphore can't be taken under spinlock.
> > > > > >
> > > > > > >
> > > > > > > Yeah, something didn't feel right here and I meant to recheck, now regret
> > > > > > > rushing to submit the patch.
> > > > > > >
> > > > > > > Rethinking, I think the pt_mem_regions_lock is not needed to protect
> > > > > > > the uaddr because unmap will properly serialize via the mm lock.
> > > > > > >
> > > > > > >
> > > > > > > > > + vma = vma_lookup(current->mm, uaddr);
> > > > > > > > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > > > > > > >
> > > > > > > > Why is this check needed again?
> > > > > > >
> > > > > > > To make sure region did not change. This check is under lock.
> > > > > > >
> > > > > >
> > > > > > How can this happen? One can't change VMA type without unmapping it
> > > > > > first. And unmapping it leads to a kernel MMIO region state dangling
> > > > > > around without corresponding user space mapping.
> > > > >
> > > > > Right, and then vm_flags would not have the expected mmio flags.
> > > > >
> > > > > > This is similar to dangling pinned regions and should likely be
> > > > > > addressed the same way by utilizing MMU notifiers to destroy memory
> > > > > > regions if the VMA is detached.
> > > > >
> > > > > I don't think we need that. Either it succeeds if the region did not
> > > > > change at all, or just fails.
> > > > >
> > > >
> > > > I'm afraid we do: if the driver mapped a page with the previous
> > > > memory region, and then the region is unmapped, the page will stay
> > > > mapped in the hypervisor but will be considered free by the kernel, which
> > > > in turn will lead to a GPF upon the next allocation.
> > >
> > > There are no ram pages for mmio regions. Also, we don't do much with
> > > mmio regions other than tell the hyp about it.
> > >
> >
> > So, are you saying that the hypervisor does not use these pages and only
> > tracks them? That would make things easier.
> > However, if we later try to map a GPA that is already mapped, will the
> > hypervisor return an error?
>
> Hypervisor does not return an error.
>
So, what happens if we map a GPA that is already mapped? Does it just
remap it to the new PFN?
Thanks,
Stanislav
>
>
> > Thanks,
> > Stanislav
> >
> > > Thanks,
> > > -Mukesh
> > >
> > >
> > > > With pinned regions the issue is similar but less impactful: pages can't
> > > > be released by user space unmapping and thus will be simply leaked, but
> > > > the system stays intact.
> > > >
> > > > MMIO regions are similar to movable regions in this regard: they don't
> > > > reference the user pages, and thus this guest region replacement is a
> > > > straight way to a kernel panic.
> > > >
> > > > >
> > > > > > > > The region type is stored on the region itself.
> > > > > > > > And the type is checked on the caller side.
> > > > > > > >
> > > > > > > > > + if (!is_mmio)
> > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > +
> > > > > > > > > + pfnmap_args.vma = vma;
> > > > > > > > > + pfnmap_args.address = uaddr;
> > > > > > > > > +
> > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > + if (rc) {
> > > > > > > > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > > > > > > > + NULL);
> > > > > > > > > + if (rc)
> > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > +
> > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > + if (rc)
> > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > + *mmio_pfnp = pfnmap_args.pfn;
> > > > > > > > > + follow_pfnmap_end(&pfnmap_args);
> > > > > > > > > > +
> > > > > > > > > +unlock_mmap_out:
> > > > > > > > > + mmap_read_unlock(current->mm);
> > > > > > > > > +unlock_pt_out:
> > > > > > > > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > > > > > > > + return rc;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +/*
> > > > > > > > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > > > > > > > + * and resolve if possible.
> > > > > > > > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > > > > > > > + */
> > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > > > > > > > +{
> > > > > > > > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > > > > > > > + struct hv_x64_memory_intercept_message *msg;
> > > > > > > > > + union hv_x64_memory_access_info accinfo;
> > > > > > > > > + u64 gfn, mmio_spa, numpgs;
> > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > + int rc;
> > > > > > > > > + struct mshv_partition *pt = vp->vp_partition;
> > > > > > > > > +
> > > > > > > > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > > > > > > > + accinfo = msg->memory_access_info;
> > > > > > > > > +
> > > > > > > > > + if (!accinfo.gva_gpa_valid)
> > > > > > > > > + return false;
> > > > > > > > > +
> > > > > > > > > + /* Do a fast check and bail if non mmio intercept */
> > > > > > > > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > >
> > > > > > > > This call needs to be protected by the spinlock.
> > > > > > >
> > > > > > > This is sorta fast path to bail. We recheck under partition lock above.
> > > > > > >
> > > > > >
> > > > > > Accessing the list of regions without lock is unsafe.
> > > > >
> > > > > I am not sure why? This check is done by a vcpu thread, so regions
> > > > > will not have just gone away.
> > > > >
> > > >
> > > > > This is a shared resource. Multiple VP threads get into this function
> > > > simultaneously, so there is a race already. But this one we can live
> > > > with without locking as they don't mutate the list of the regions.
> > > >
> > > > > The issue happens when the VMM adds or removes another region, as it mutates
> > > > the list and races with VP threads doing this lookup.
> > > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > >
> > > > > Thanks,
> > > > > -Mukesh
> > > > >
> > > > >
> > > > > > Thanks,
> > > > > > Stanislav
> > > > > >
> > > > > > > Thanks,
> > > > > > > -Mukesh
> > > > > > >
> > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Stanislav
> > > > > > > >
> > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > + return false;
> > > > > > > > > +
> > > > > > > > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > > > > > > > + if (rc)
> > > > > > > > > + return false;
> > > > > > > > > +
> > > > > > > > > + if (!hv_nofull_mmio) { /* default case */
> > > > > > > > > + gfn = mreg->start_gfn;
> > > > > > > > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > > > > > > > + numpgs = mreg->nr_pages;
> > > > > > > > > + } else
> > > > > > > > > + numpgs = 1;
> > > > > > > > > +
> > > > > > > > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > > > > > > > +
> > > > > > > > > + return rc == 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > static struct mshv_mem_region *
> > > > > > > > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > > > > > > > {
> > > > > > > > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > > > > > > > return ret;
> > > > > > > > > }
> > > > > > > > > +
> > > > > > > > > #else /* CONFIG_X86_64 */
> > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > > > > > > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > > > > > > > #endif /* CONFIG_X86_64 */
> > > > > > > > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > > > > > > > {
> > > > > > > > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > > > > > > > + case HVMSG_UNMAPPED_GPA:
> > > > > > > > > + return mshv_handle_unmapped_gpa(vp);
> > > > > > > > > case HVMSG_GPA_INTERCEPT:
> > > > > > > > > return mshv_handle_gpa_intercept(vp);
> > > > > > > > > }
> > > > > > > > > --
> > > > > > > > > 2.51.2.vfs.0.1
> > > > > > > > >
On 2/5/26 08:28, Stanislav Kinsburskii wrote:
> On Wed, Feb 04, 2026 at 02:52:54PM -0800, Mukesh R wrote:
>> On 2/2/26 08:30, Stanislav Kinsburskii wrote:
>>> On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
>>>> On 1/27/26 10:57, Stanislav Kinsburskii wrote:
>>>>> On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
>>>>>> On 1/26/26 10:15, Stanislav Kinsburskii wrote:
>>>>>>> On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
>>>>>>>> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
>>>>>>>>> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>>>>>>>>>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>>>>
>>>>>>>>>> Upon guest access, in case of missing mmio mapping, the hypervisor
>>>>>>>>>> generates an unmapped gpa intercept. In this path, lookup the PCI
>>>>>>>>>> resource pfn for the guest gpa, and ask the hypervisor to map it
>>>>>>>>>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>>>>>>>>>> and obtained via fixup_user_fault call (similar to KVM).
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>>>> ---
>>>>>>>>>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>>>>>>>>> 1 file changed, 115 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>>>>>>>> index 03f3aa9f5541..4c8bc7cd0888 100644
>>>>>>>>>> --- a/drivers/hv/mshv_root_main.c
>>>>>>>>>> +++ b/drivers/hv/mshv_root_main.c
>>>>>>>>>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>>>>>>>>> };
>>>>>>>>>> } __packed;
>>>>>>>>>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>>>>>>>>>> +static int __init setup_hv_full_mmio(char *str)
>>>>>>>>>> +{
>>>>>>>>>> + hv_nofull_mmio = true;
>>>>>>>>>> + return 0;
>>>>>>>>>> +}
>>>>>>>>>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>>>>>>>>>> +
>>>>>>>>>> struct mshv_root mshv_root;
>>>>>>>>>> enum hv_scheduler_type hv_scheduler_type;
>>>>>>>>>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>>>>>>>> }
>>>>>>>>>> #ifdef CONFIG_X86_64
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>>>>>>>>>> + * else just return -errno.
>>>>>>>>>> + */
>>>>>>>>>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>>>>>>>>>> + u64 *mmio_pfnp)
>>>>>>>>>> +{
>>>>>>>>>> + struct vm_area_struct *vma;
>>>>>>>>>> + bool is_mmio;
>>>>>>>>>> + u64 uaddr;
>>>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>>>> + struct follow_pfnmap_args pfnmap_args;
>>>>>>>>>> + int rc = -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> + /*
>>>>>>>>>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>>>>>>>>>> + * useraddr vma to lookup pci bar pfn.
>>>>>>>>>> + */
>>>>>>>>>> + spin_lock(&pt->pt_mem_regions_lock);
>>>>>>>>>> +
>>>>>>>>>> + /* Get the region again under the lock */
>>>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>>>> + goto unlock_pt_out;
>>>>>>>>>> +
>>>>>>>>>> + uaddr = mreg->start_uaddr +
>>>>>>>>>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>>>>>>>>>> +
>>>>>>>>>> + mmap_read_lock(current->mm);
>>>>>>>>>
>>>>>>>>> Semaphore can't be taken under spinlock.
>>>>>>>
>>>>>>>>
>>>>>>>> Yeah, something didn't feel right here and I meant to recheck, now regret
>>>>>>>> rushing to submit the patch.
>>>>>>>>
>>>>>>>> Rethinking, I think the pt_mem_regions_lock is not needed to protect
>>>>>>>> the uaddr because unmap will properly serialize via the mm lock.
>>>>>>>>
>>>>>>>>
>>>>>>>>>> + vma = vma_lookup(current->mm, uaddr);
>>>>>>>>>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>>>>>>>>>
>>>>>>>>> Why is this check needed again?
>>>>>>>>
>>>>>>>> To make sure region did not change. This check is under lock.
>>>>>>>>
>>>>>>>
>>>>>>> How can this happen? One can't change VMA type without unmapping it
>>>>>>> first. And unmapping it leads to a kernel MMIO region state dangling
>>>>>>> around without corresponding user space mapping.
>>>>>>
>>>>>> Right, and then vm_flags would not have the expected mmio flags.
>>>>>>
>>>>>>> This is similar to dangling pinned regions and should likely be
>>>>>>> addressed the same way by utilizing MMU notifiers to destroy memory
>>>>>>> regions if the VMA is detached.
>>>>>>
>>>>>> I don't think we need that. Either it succeeds if the region did not
>>>>>> change at all, or just fails.
>>>>>>
>>>>>
>>>>> I'm afraid we do: if the driver mapped a page with the previous
>>>>> memory region, and then the region is unmapped, the page will stay
>>>>> mapped in the hypervisor but will be considered free by the kernel, which
>>>>> in turn will lead to a GPF upon the next allocation.
>>>>
>>>> There are no ram pages for mmio regions. Also, we don't do much with
>>>> mmio regions other than tell the hyp about it.
>>>>
>>>
>>> So, are you saying that the hypervisor does not use these pages and only
>>> tracks them? That would make things easier.
>>> However, if we later try to map a GPA that is already mapped, will the
>>> hypervisor return an error?
>>
>> Hypervisor does not return an error.
>>
>
> So, what happens if we map a GPA that is already mapped? Does it just
> remap it to the new PFN?
Yes, otherwise it would return an error, right?
> Thanks,
> Stanislav
>
>>
>>
>>> Thanks,
>>> Stanislav
>>>
>>>> Thanks,
>>>> -Mukesh
>>>>
>>>>
>>>>> With pinned regions the issue is similar but less impactful: pages can't
>>>>> be released by user space unmapping and thus will be simply leaked, but
>>>>> the system stays intact.
>>>>>
>>>>> MMIO regions are similar to movable regions in this regard: they don't
>>>>> reference the user pages, and thus this guest region replacement is a
>>>>> straight way to a kernel panic.
>>>>>
>>>>>>
>>>>>>>>> The region type is stored on the region itself.
>>>>>>>>> And the type is checked on the caller side.
>>>>>>>>>
>>>>>>>>>> + if (!is_mmio)
>>>>>>>>>> + goto unlock_mmap_out;
>>>>>>>>>> +
>>>>>>>>>> + pfnmap_args.vma = vma;
>>>>>>>>>> + pfnmap_args.address = uaddr;
>>>>>>>>>> +
>>>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>>>> + if (rc) {
>>>>>>>>>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>>>>>>>>>> + NULL);
>>>>>>>>>> + if (rc)
>>>>>>>>>> + goto unlock_mmap_out;
>>>>>>>>>> +
>>>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>>>> + if (rc)
>>>>>>>>>> + goto unlock_mmap_out;
>>>>>>>>>> + }
>>>>>>>>>> +
>>>>>>>>>> + *mmio_pfnp = pfnmap_args.pfn;
>>>>>>>>>> + follow_pfnmap_end(&pfnmap_args);
>>>>>>>>>> +
>>>>>>>>>> +unlock_mmap_out:
>>>>>>>>>> + mmap_read_unlock(current->mm);
>>>>>>>>>> +unlock_pt_out:
>>>>>>>>>> + spin_unlock(&pt->pt_mem_regions_lock);
>>>>>>>>>> + return rc;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>>>>>>>>>> + * and resolve if possible.
>>>>>>>>>> + * Returns: True if valid mmio intercept and it was handled, else false
>>>>>>>>>> + */
>>>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>>>>>>>>>> +{
>>>>>>>>>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>>>>>>>>>> + struct hv_x64_memory_intercept_message *msg;
>>>>>>>>>> + union hv_x64_memory_access_info accinfo;
>>>>>>>>>> + u64 gfn, mmio_spa, numpgs;
>>>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>>>> + int rc;
>>>>>>>>>> + struct mshv_partition *pt = vp->vp_partition;
>>>>>>>>>> +
>>>>>>>>>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>>>>>>>>>> + accinfo = msg->memory_access_info;
>>>>>>>>>> +
>>>>>>>>>> + if (!accinfo.gva_gpa_valid)
>>>>>>>>>> + return false;
>>>>>>>>>> +
>>>>>>>>>> + /* Do a fast check and bail if non mmio intercept */
>>>>>>>>>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>>>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>>>
>>>>>>>>> This call needs to be protected by the spinlock.
>>>>>>>>
>>>>>>>> This is sorta fast path to bail. We recheck under partition lock above.
>>>>>>>>
>>>>>>>
>>>>>>> Accessing the list of regions without lock is unsafe.
>>>>>>
>>>>>> I am not sure why? This check is done by a vcpu thread, so regions
>>>>>> will not have just gone away.
>>>>>>
>>>>>
>>>>> This is a shared resource. Multiple VP threads get into this function
>>>>> simultaneously, so there is a race already. But this one we can live
>>>>> with without locking as they don't mutate the list of the regions.
>>>>>
>>>>> The issue happens when the VMM adds or removes another region, as it mutates
>>>>> the list and races with VP threads doing this lookup.
>>>>>
>>>>> Thanks,
>>>>> Stanislav
>>>>>
>>>>>
>>>>>> Thanks,
>>>>>> -Mukesh
>>>>>>
>>>>>>
>>>>>>> Thanks,
>>>>>>> Stanislav
>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> -Mukesh
>>>>>>>>
>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Stanislav
>>>>>>>>>
>>>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>>>> + return false;
>>>>>>>>>> +
>>>>>>>>>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>>>>>>>>>> + if (rc)
>>>>>>>>>> + return false;
>>>>>>>>>> +
>>>>>>>>>> + if (!hv_nofull_mmio) { /* default case */
>>>>>>>>>> + gfn = mreg->start_gfn;
>>>>>>>>>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>>>>>>>>>> + numpgs = mreg->nr_pages;
>>>>>>>>>> + } else
>>>>>>>>>> + numpgs = 1;
>>>>>>>>>> +
>>>>>>>>>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>>>>>>>>>> +
>>>>>>>>>> + return rc == 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> static struct mshv_mem_region *
>>>>>>>>>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>>>>>>>> {
>>>>>>>>>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>>>>>>> return ret;
>>>>>>>>>> }
>>>>>>>>>> +
>>>>>>>>>> #else /* CONFIG_X86_64 */
>>>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>>>>>>>>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>>>>>>>> #endif /* CONFIG_X86_64 */
>>>>>>>>>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>>>>>>>> {
>>>>>>>>>> switch (vp->vp_intercept_msg_page->header.message_type) {
>>>>>>>>>> + case HVMSG_UNMAPPED_GPA:
>>>>>>>>>> + return mshv_handle_unmapped_gpa(vp);
>>>>>>>>>> case HVMSG_GPA_INTERCEPT:
>>>>>>>>>> return mshv_handle_gpa_intercept(vp);
>>>>>>>>>> }
>>>>>>>>>> --
>>>>>>>>>> 2.51.2.vfs.0.1
>>>>>>>>>>
On Thu, Feb 05, 2026 at 09:57:20AM -0800, Mukesh R wrote:
> On 2/5/26 08:28, Stanislav Kinsburskii wrote:
> > On Wed, Feb 04, 2026 at 02:52:54PM -0800, Mukesh R wrote:
> > > On 2/2/26 08:30, Stanislav Kinsburskii wrote:
> > > > On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
> > > > > On 1/27/26 10:57, Stanislav Kinsburskii wrote:
> > > > > > On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
> > > > > > > On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> > > > > > > > On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> > > > > > > > > On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > > > > > > > > > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > > > > > > > > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > > > >
> > > > > > > > > > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > > > > > > > > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > > > > > > > > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > > > > > > > > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > > > > > > > > > and obtained via fixup_user_fault call (similar to KVM).
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > > > > ---
> > > > > > > > > > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > 1 file changed, 115 insertions(+)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > > > > > > > > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > > > > > > > > > --- a/drivers/hv/mshv_root_main.c
> > > > > > > > > > > +++ b/drivers/hv/mshv_root_main.c
> > > > > > > > > > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > > > > > > > > > };
> > > > > > > > > > > } __packed;
> > > > > > > > > > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > > > > > > > > > +static int __init setup_hv_full_mmio(char *str)
> > > > > > > > > > > +{
> > > > > > > > > > > + hv_nofull_mmio = true;
> > > > > > > > > > > + return 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > > > > > > > > > +
> > > > > > > > > > > struct mshv_root mshv_root;
> > > > > > > > > > > enum hv_scheduler_type hv_scheduler_type;
> > > > > > > > > > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > > > > > > > > > }
> > > > > > > > > > > #ifdef CONFIG_X86_64
> > > > > > > > > > > +
> > > > > > > > > > > +/*
> > > > > > > > > > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > > > > > > > > > + * else just return -errno.
> > > > > > > > > > > + */
> > > > > > > > > > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > > > > > > > > > + u64 *mmio_pfnp)
> > > > > > > > > > > +{
> > > > > > > > > > > + struct vm_area_struct *vma;
> > > > > > > > > > > + bool is_mmio;
> > > > > > > > > > > + u64 uaddr;
> > > > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > > > + struct follow_pfnmap_args pfnmap_args;
> > > > > > > > > > > + int rc = -EINVAL;
> > > > > > > > > > > +
> > > > > > > > > > > + /*
> > > > > > > > > > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > > > > > > > > > + * useraddr vma to lookup pci bar pfn.
> > > > > > > > > > > + */
> > > > > > > > > > > + spin_lock(&pt->pt_mem_regions_lock);
> > > > > > > > > > > +
> > > > > > > > > > > + /* Get the region again under the lock */
> > > > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > > > + goto unlock_pt_out;
> > > > > > > > > > > +
> > > > > > > > > > > + uaddr = mreg->start_uaddr +
> > > > > > > > > > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > > > > > > > > > +
> > > > > > > > > > > + mmap_read_lock(current->mm);
> > > > > > > > > >
> > > > > > > > > > Semaphore can't be taken under spinlock.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Yeah, something didn't feel right here and I meant to recheck, now regret
> > > > > > > > > rushing to submit the patch.
> > > > > > > > >
> > > > > > > > > Rethinking, I think the pt_mem_regions_lock is not needed to protect
> > > > > > > > > the uaddr because unmap will properly serialize via the mm lock.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > > + vma = vma_lookup(current->mm, uaddr);
> > > > > > > > > > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > > > > > > > > >
> > > > > > > > > > Why is this check needed again?
> > > > > > > > >
> > > > > > > > > To make sure region did not change. This check is under lock.
> > > > > > > > >
> > > > > > > >
> > > > > > > > How can this happen? One can't change VMA type without unmapping it
> > > > > > > > first. And unmapping it leads to a kernel MMIO region state dangling
> > > > > > > > around without corresponding user space mapping.
> > > > > > >
> > > > > > > Right, and then vm_flags would not have the expected mmio flags.
> > > > > > >
> > > > > > > > This is similar to dangling pinned regions and should likely be
> > > > > > > > addressed the same way by utilizing MMU notifiers to destroy memory
> > > > > > > > regions if the VMA is detached.
> > > > > > >
> > > > > > > I don't think we need that. Either it succeeds if the region did not
> > > > > > > change at all, or just fails.
> > > > > > >
> > > > > >
> > > > > > I'm afraid we do: if the driver mapped a page with the previous
> > > > > > memory region, and then the region is unmapped, the page will stay
> > > > > > mapped in the hypervisor but will be considered free by the kernel, which
> > > > > > in turn will lead to a GPF upon the next allocation.
> > > > >
> > > > > There are no ram pages for mmio regions. Also, we don't do much with
> > > > > mmio regions other than tell the hyp about it.
> > > > >
> > > >
> > > > So, are you saying that the hypervisor does not use these pages and only
> > > > tracks them? That would make things easier.
> > > > However, if we later try to map a GPA that is already mapped, will the
> > > > hypervisor return an error?
> > >
> > > Hypervisor does not return an error.
> > >
> >
> > So, what happens if we map a GPA that is already mapped? Does it just
> > remap it to the new PFN?
>
> Yes, otherwise it would return an error, right?
>
I see.
Please summarize and document this behaviour in the commit message.
Thanks,
Stanislav
> > Thanks,
> > Stanislav
> >
> > >
> > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > > > Thanks,
> > > > > -Mukesh
> > > > >
> > > > >
> > > > > > With pinned regions the issue is similar but less impactful: pages can't
> > > > > > be released by user space unmapping and thus will be simply leaked, but
> > > > > > the system stays intact.
> > > > > >
> > > > > > MMIO regions are similar to movable regions in this regard: they don't
> > > > > > reference the user pages, and thus this guest region replacement is a
> > > > > > straight way to a kernel panic.
> > > > > >
> > > > > > >
> > > > > > > > > > The region type is stored on the region itself.
> > > > > > > > > > And the type is checked on the caller side.
> > > > > > > > > >
> > > > > > > > > > > + if (!is_mmio)
> > > > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > > > +
> > > > > > > > > > > + pfnmap_args.vma = vma;
> > > > > > > > > > > + pfnmap_args.address = uaddr;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > > > + if (rc) {
> > > > > > > > > > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > > > > > > > > > + NULL);
> > > > > > > > > > > + if (rc)
> > > > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > > > + if (rc)
> > > > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > > > + }
> > > > > > > > > > > +
> > > > > > > > > > > + *mmio_pfnp = pfnmap_args.pfn;
> > > > > > > > > > > + follow_pfnmap_end(&pfnmap_args);
> > > > > > > > > > > +
> > > > > > > > > > > +unlock_mmap_out:
> > > > > > > > > > > + mmap_read_unlock(current->mm);
> > > > > > > > > > > +unlock_pt_out:
> > > > > > > > > > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > > > > > > > > > + return rc;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +/*
> > > > > > > > > > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > > > > > > > > > + * and resolve if possible.
> > > > > > > > > > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > > > > > > > > > + */
> > > > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > > > > > > > > > +{
> > > > > > > > > > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > > > > > > > > > + struct hv_x64_memory_intercept_message *msg;
> > > > > > > > > > > + union hv_x64_memory_access_info accinfo;
> > > > > > > > > > > + u64 gfn, mmio_spa, numpgs;
> > > > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > > > + int rc;
> > > > > > > > > > > + struct mshv_partition *pt = vp->vp_partition;
> > > > > > > > > > > +
> > > > > > > > > > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > > > > > > > > > + accinfo = msg->memory_access_info;
> > > > > > > > > > > +
> > > > > > > > > > > + if (!accinfo.gva_gpa_valid)
> > > > > > > > > > > + return false;
> > > > > > > > > > > +
> > > > > > > > > > > + /* Do a fast check and bail if non mmio intercept */
> > > > > > > > > > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > > > >
> > > > > > > > > > This call needs to be protected by the spinlock.
> > > > > > > > >
> > > > > > > > > This is sorta fast path to bail. We recheck under partition lock above.
> > > > > > > > >
> > > > > > > >
> > > > > > > > Accessing the list of regions without lock is unsafe.
> > > > > > >
> > > > > > > I am not sure why? This check is done by a vcpu thread, so regions
> > > > > > > will not have just gone away.
> > > > > > >
> > > > > >
> > > > > > This is a shared resource. Multiple VP threads get into this function
> > > > > > simultaneously, so there is a race already. But this one we can live
> > > > > > with without locking as they don't mutate the list of the regions.
> > > > > >
> > > > > > The issue happens when the VMM adds or removes another region, as it mutates
> > > > > > the list and races with VP threads doing this lookup.
> > > > > >
> > > > > > Thanks,
> > > > > > Stanislav
> > > > > >
> > > > > >
> > > > > > > Thanks,
> > > > > > > -Mukesh
> > > > > > >
> > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Stanislav
> > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > -Mukesh
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > > Stanislav
> > > > > > > > > >
> > > > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > > > + return false;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > > > > > > > > > + if (rc)
> > > > > > > > > > > + return false;
> > > > > > > > > > > +
> > > > > > > > > > > + if (!hv_nofull_mmio) { /* default case */
> > > > > > > > > > > + gfn = mreg->start_gfn;
> > > > > > > > > > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > > > > > > > > > + numpgs = mreg->nr_pages;
> > > > > > > > > > > + } else
> > > > > > > > > > > + numpgs = 1;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > > > > > > > > > +
> > > > > > > > > > > + return rc == 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > static struct mshv_mem_region *
> > > > > > > > > > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > > > > > > > > > {
> > > > > > > > > > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > > > > > > > > > return ret;
> > > > > > > > > > > }
> > > > > > > > > > > +
> > > > > > > > > > > #else /* CONFIG_X86_64 */
> > > > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > > > > > > > > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > > > > > > > > > #endif /* CONFIG_X86_64 */
> > > > > > > > > > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > > > > > > > > > {
> > > > > > > > > > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > > > > > > > > > + case HVMSG_UNMAPPED_GPA:
> > > > > > > > > > > + return mshv_handle_unmapped_gpa(vp);
> > > > > > > > > > > case HVMSG_GPA_INTERCEPT:
> > > > > > > > > > > return mshv_handle_gpa_intercept(vp);
> > > > > > > > > > > }
> > > > > > > > > > > --
> > > > > > > > > > > 2.51.2.vfs.0.1
> > > > > > > > > > >
Hi Mukesh,
kernel test robot noticed the following build warnings:
[auto build test WARNING on tip/x86/core]
[also build test WARNING on pci/next pci/for-linus arm64/for-next/core soc/for-next linus/master v6.19-rc6]
[cannot apply to clk/clk-next arnd-asm-generic/master next-20260119]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Mukesh-R/iommu-hyperv-rename-hyperv-iommu-c-to-hyperv-irq-c/20260120-145832
base: tip/x86/core
patch link: https://lore.kernel.org/r/20260120064230.3602565-16-mrathor%40linux.microsoft.com
patch subject: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
config: x86_64-randconfig-003-20260120 (https://download.01.org/0day-ci/archive/20260121/202601210255.2ZZOLtMV-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260121/202601210255.2ZZOLtMV-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601210255.2ZZOLtMV-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> drivers/hv/mshv_root_main.c:60:19: warning: 'setup_hv_full_mmio' defined but not used [-Wunused-function]
60 | static int __init setup_hv_full_mmio(char *str)
| ^~~~~~~~~~~~~~~~~~
vim +/setup_hv_full_mmio +60 drivers/hv/mshv_root_main.c
58
59 bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> 60 static int __init setup_hv_full_mmio(char *str)
61 {
62 hv_nofull_mmio = true;
63 return 0;
64 }
65 __setup("hv_nofull_mmio", setup_hv_full_mmio);
66
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
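[Editor's sketch] The warning is most likely seen when the driver is built as a module, where __setup() handlers are compiled out and the function is left unreferenced; that is an inference from the randconfig, not something the report states. A minimal sketch of one way to keep the boot parameter while avoiding the unused-function warning:

bool hv_nofull_mmio; /* don't map entire mmio region upon fault */

#ifndef MODULE
static int __init setup_hv_full_mmio(char *str)
{
	hv_nofull_mmio = true;
	return 1;	/* __setup() handlers conventionally return 1 once the option is consumed */
}
__setup("hv_nofull_mmio", setup_hv_full_mmio);
#endif

Marking the handler __maybe_unused, or switching to module_param() if a module parameter is acceptable instead of an early boot option, would be alternative ways to address it.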