[PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest

Douglas Freimuth posted 3 patches 1 month ago
There is a newer version of this series
[PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by Douglas Freimuth 1 month ago
Patch 1: This patch adds map/unmap ioctls which map the adapter set
indicator pages so the pages can be accessed when interrupts are
disabled. The mappings are cleaned up when the guest is removed.

Fencing of Fast Inject in Secure Execution environments is enabled in
this patch by not mapping adapter indicator pages. In Secure Execution
environments the path of execution available before this patch is followed.
Statistical counters to count map/unmap functions for adapter indicator
pages are added in this patch. The counters can be used to analyze
map/unmap functions in non-Secure Execution environments and similarly
can be used to analyze Secure Execution environments where the counters
should not be incremented as the adapter indicator pages are not mapped.

Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |   5 ++
 arch/s390/kvm/interrupt.c        | 143 +++++++++++++++++++++++++------
 arch/s390/kvm/kvm-s390.c         |   2 +
 3 files changed, 124 insertions(+), 26 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 64a50f0862aa..616be8ca4614 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -448,6 +448,8 @@ struct kvm_vcpu_arch {
 struct kvm_vm_stat {
 	struct kvm_vm_stat_generic generic;
 	u64 inject_io;
+	u64 io_390_adapter_map;
+	u64 io_390_adapter_unmap;
 	u64 inject_float_mchk;
 	u64 inject_pfault_done;
 	u64 inject_service_signal;
@@ -479,6 +481,9 @@ struct s390_io_adapter {
 	bool masked;
 	bool swap;
 	bool suppressible;
+	struct rw_semaphore maps_lock;
+	struct list_head maps;
+	unsigned int nr_maps;
 };
 
 #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 18932a65ca68..cafc03e20f8f 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2426,6 +2426,9 @@ static int register_io_adapter(struct kvm_device *dev,
 	if (!adapter)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&adapter->maps);
+	init_rwsem(&adapter->maps_lock);
+	adapter->nr_maps = 0;
 	adapter->id = adapter_info.id;
 	adapter->isc = adapter_info.isc;
 	adapter->maskable = adapter_info.maskable;
@@ -2450,12 +2453,103 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
 	return ret;
 }
 
+static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
+{
+	struct mm_struct *mm = kvm->mm;
+	struct page *page = NULL;
+	int locked = 1;
+
+	if (mmget_not_zero(mm)) {
+		mmap_read_lock(mm);
+		get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
+				      &page, &locked);
+		if (locked)
+			mmap_read_unlock(mm);
+		mmput(mm);
+	}
+
+	return page;
+}
+
+static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+	struct s390_map_info *map;
+	unsigned long flags;
+	int ret;
+
+	if (!adapter || !addr)
+		return -EINVAL;
+
+	map = kzalloc_obj(*map, GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	map->page = get_map_page(kvm, addr);
+	if (!map->page) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&map->list);
+	map->guest_addr = addr;
+	map->addr = addr;
+	down_write(&adapter->maps_lock);
+	if (adapter->nr_maps++ < MAX_S390_ADAPTER_MAPS) {
+		list_add_tail(&map->list, &adapter->maps);
+		ret = 0;
+	} else {
+		put_page(map->page);
+		ret = -EINVAL;
+	}
+	up_write(&adapter->maps_lock);
+out:
+	if (ret)
+		kfree(map);
+	return ret;
+}
+
+static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+	struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+	struct s390_map_info *map, *tmp;
+	int found = 0;
+
+	if (!adapter || !addr)
+		return -EINVAL;
+
+	down_write(&adapter->maps_lock);
+	list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
+		if (map->guest_addr == addr) {
+			found = 1;
+			adapter->nr_maps--;
+			list_del(&map->list);
+			put_page(map->page);
+			kfree(map);
+			break;
+		}
+	}
+	up_write(&adapter->maps_lock);
+
+	return found ? 0 : -ENOENT;
+}
+
 void kvm_s390_destroy_adapters(struct kvm *kvm)
 {
 	int i;
+	struct s390_map_info *map, *tmp;
 
-	for (i = 0; i < MAX_S390_IO_ADAPTERS; i++)
+	for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
+		if (!kvm->arch.adapters[i])
+			continue;
+		list_for_each_entry_safe(map, tmp,
+					 &kvm->arch.adapters[i]->maps, list) {
+			list_del(&map->list);
+			put_page(map->page);
+			kfree(map);
+		}
 		kfree(kvm->arch.adapters[i]);
+	}
 }
 
 static int modify_io_adapter(struct kvm_device *dev,
@@ -2463,7 +2557,8 @@ static int modify_io_adapter(struct kvm_device *dev,
 {
 	struct kvm_s390_io_adapter_req req;
 	struct s390_io_adapter *adapter;
-	int ret;
+	__u64 host_addr;
+	int ret, idx;
 
 	if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
 		return -EFAULT;
@@ -2477,14 +2572,28 @@ static int modify_io_adapter(struct kvm_device *dev,
 		if (ret > 0)
 			ret = 0;
 		break;
-	/*
-	 * The following operations are no longer needed and therefore no-ops.
-	 * The gpa to hva translation is done when an IRQ route is set up. The
-	 * set_irq code uses get_user_pages_remote() to do the actual write.
-	 */
 	case KVM_S390_IO_ADAPTER_MAP:
 	case KVM_S390_IO_ADAPTER_UNMAP:
-		ret = 0;
+		mutex_lock(&dev->kvm->lock);
+		if (kvm_s390_pv_is_protected(dev->kvm)) {
+			mutex_unlock(&dev->kvm->lock);
+			break;
+		}
+		mutex_unlock(&dev->kvm->lock);
+		idx = srcu_read_lock(&dev->kvm->srcu);
+		host_addr = gpa_to_hva(dev->kvm, req.addr);
+		if (kvm_is_error_hva(host_addr)) {
+			srcu_read_unlock(&dev->kvm->srcu, idx);
+			return -EFAULT;
+			}
+		srcu_read_unlock(&dev->kvm->srcu, idx);
+		if (req.type == KVM_S390_IO_ADAPTER_MAP) {
+			dev->kvm->stat.io_390_adapter_map++;
+			ret = kvm_s390_adapter_map(dev->kvm, req.id, host_addr);
+		} else {
+			dev->kvm->stat.io_390_adapter_unmap++;
+			ret = kvm_s390_adapter_unmap(dev->kvm, req.id, host_addr);
+		}
 		break;
 	default:
 		ret = -EINVAL;
@@ -2727,24 +2836,6 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
 	return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
 }
 
-static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
-{
-	struct mm_struct *mm = kvm->mm;
-	struct page *page = NULL;
-	int locked = 1;
-
-	if (mmget_not_zero(mm)) {
-		mmap_read_lock(mm);
-		get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
-				      &page, &locked);
-		if (locked)
-			mmap_read_unlock(mm);
-		mmput(mm);
-	}
-
-	return page;
-}
-
 static int adapter_indicators_set(struct kvm *kvm,
 				  struct s390_io_adapter *adapter,
 				  struct kvm_s390_adapter_int *adapter_int)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index bc7d6fa66eaf..8e6532f55a5a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -68,6 +68,8 @@
 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
 	KVM_GENERIC_VM_STATS(),
 	STATS_DESC_COUNTER(VM, inject_io),
+	STATS_DESC_COUNTER(VM, io_390_adapter_map),
+	STATS_DESC_COUNTER(VM, io_390_adapter_unmap),
 	STATS_DESC_COUNTER(VM, inject_float_mchk),
 	STATS_DESC_COUNTER(VM, inject_pfault_done),
 	STATS_DESC_COUNTER(VM, inject_service_signal),
-- 
2.52.0
Re: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by Matthew Rosato 4 weeks ago
On 3/7/26 10:04 PM, Douglas Freimuth wrote:
> Patch 1: This patch adds map/unmap ioctls which map the adapter set

This comment applies to all patches in the series:

Drop the Patch #: intro, it would otherwise end up in git history.

Also remove phrases like 'this patch' and switch to imperative phrasing
ref: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes

Also +1 to what Sean said, "KVM: s390: Add map..."

> indicator pages so the pages can be accessed when interrupts are
> disabled. The mappings are cleaned up when the guest is removed.
> 
> Fencing of Fast Inject in Secure Execution environments is enabled in
This is a bit misleading.  You haven't implemented kvm_ach_set_irq_inatomic at this point, so this patch can't be fencing it yet.

What you are doing is fencing the map/unmap ioctls so that you avoid the longterm pin in SE.  I'd focus on that part here.

And then in patch 3, when you enable the feature, you can point out that it is fenced in SE because the mappings will never be available.

> this patch by not mapping adapter indicator pages. In Secure Execution
> environments the path of execution available before this patch is followed.
> Statistical counters to count map/unmap functions for adapter indicator
> pages are added in this patch. The counters can be used to analyze
> map/unmap functions in non-Secure Execution environments and similarly
> can be used to analyze Secure Execution environments where the counters
> should not be incremented as the adapter indicator pages are not mapped.
> 
> Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>


> @@ -2477,14 +2572,28 @@ static int modify_io_adapter(struct kvm_device *dev,
>  		if (ret > 0)
>  			ret = 0;
>  		break;
> -	/*
> -	 * The following operations are no longer needed and therefore no-ops.
> -	 * The gpa to hva translation is done when an IRQ route is set up. The
> -	 * set_irq code uses get_user_pages_remote() to do the actual write.
> -	 */
>  	case KVM_S390_IO_ADAPTER_MAP:
>  	case KVM_S390_IO_ADAPTER_UNMAP:
> -		ret = 0;
> +		mutex_lock(&dev->kvm->lock);
> +		if (kvm_s390_pv_is_protected(dev->kvm)) {
> +			mutex_unlock(&dev->kvm->lock);
> +			break;

Was mentioned by the kernel bot also -- because you removed the initialization of ret above you will now return something uninitialized if you take this if statement.

Either initialize ret above or return the intended value (0?) here.

Also I think this is worth a comment block above this check stating why you are doing it (e.g. avoid long-term pin for secure exeuction guest)
Re: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by Sean Christopherson 1 month ago
Please update the shortlogs on the patches to set the scope to:

  KVM: s390:

I had a moment of confusion because I was like "but kvm_arch_set_irq_inatomic()
is already a thing!?!?" :-)
Re: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by Christian Borntraeger 1 month ago
Am 08.03.26 um 04:04 schrieb Douglas Freimuth:
> Fencing of Fast Inject in Secure Execution environments is enabled in
> this patch by not mapping adapter indicator pages. In Secure Execution
[...]

> @@ -2477,14 +2572,28 @@ static int modify_io_adapter(struct kvm_device *dev,
>   		if (ret > 0)
>   			ret = 0;
>   		break;
> -	/*
> -	 * The following operations are no longer needed and therefore no-ops.
> -	 * The gpa to hva translation is done when an IRQ route is set up. The
> -	 * set_irq code uses get_user_pages_remote() to do the actual write.
> -	 */
>   	case KVM_S390_IO_ADAPTER_MAP:
>   	case KVM_S390_IO_ADAPTER_UNMAP:
> -		ret = 0;
> +		mutex_lock(&dev->kvm->lock);
> +		if (kvm_s390_pv_is_protected(dev->kvm)) {
> +			mutex_unlock(&dev->kvm->lock);
> +			break;
> +		}


I guess this works for a well behaving userspaces, but a bad QEMU could in theory
not do the unmap on switch to secure.
Shall we maybe do -EINVAL on KVM_PV_ENABLE if there are still mapping left, or
to make it easier for userspace remove the old ADAPTER maps?
Re: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by Douglas Freimuth 1 month ago

On 3/9/26 5:27 AM, Christian Borntraeger wrote:
> Am 08.03.26 um 04:04 schrieb Douglas Freimuth:
>> Fencing of Fast Inject in Secure Execution environments is enabled in
>> this patch by not mapping adapter indicator pages. In Secure Execution
> [...]
> 
>> @@ -2477,14 +2572,28 @@ static int modify_io_adapter(struct kvm_device 
>> *dev,
>>           if (ret > 0)
>>               ret = 0;
>>           break;
>> -    /*
>> -     * The following operations are no longer needed and therefore 
>> no-ops.
>> -     * The gpa to hva translation is done when an IRQ route is set 
>> up. The
>> -     * set_irq code uses get_user_pages_remote() to do the actual write.
>> -     */
>>       case KVM_S390_IO_ADAPTER_MAP:
>>       case KVM_S390_IO_ADAPTER_UNMAP:
>> -        ret = 0;
>> +        mutex_lock(&dev->kvm->lock);
>> +        if (kvm_s390_pv_is_protected(dev->kvm)) {
>> +            mutex_unlock(&dev->kvm->lock);
>> +            break;
>> +        }
> 
> 
> I guess this works for a well behaving userspaces, but a bad QEMU could 
> in theory
> not do the unmap on switch to secure.
> Shall we maybe do -EINVAL on KVM_PV_ENABLE if there are still mapping 
> left, or
> to make it easier for userspace remove the old ADAPTER maps?
> 

Christian, thank you for your input. For this scenario, I will look into 
adding/testing removing the old adapter maps. I will start in 
kvm_s390_handle_pv() for CASE KVM_PV_ENABLE and I will essentially use 
most of the functionality in kvm_s390_destroy_adapters() where the maps 
are deleted if they exist.

Discussion: During development and test I realized it appears a guest 
can only change state between non-SE and SE during a reboot. Thus the 
unmap and map is called which hits the fencing in the current patch. 
Additionally, a more draconian fencing could possibly be done if needed, 
by checking for the existence of SE firmware in the CMDLINE and prevent 
any mapping from occurring on those systems that support SE.
Re: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by Christian Borntraeger 1 month ago

Am 09.03.26 um 18:12 schrieb Douglas Freimuth:
> 
> 
> On 3/9/26 5:27 AM, Christian Borntraeger wrote:
>> Am 08.03.26 um 04:04 schrieb Douglas Freimuth:
>>> Fencing of Fast Inject in Secure Execution environments is enabled in
>>> this patch by not mapping adapter indicator pages. In Secure Execution
>> [...]
>>
>>> @@ -2477,14 +2572,28 @@ static int modify_io_adapter(struct kvm_device *dev,
>>>           if (ret > 0)
>>>               ret = 0;
>>>           break;
>>> -    /*
>>> -     * The following operations are no longer needed and therefore no-ops.
>>> -     * The gpa to hva translation is done when an IRQ route is set up. The
>>> -     * set_irq code uses get_user_pages_remote() to do the actual write.
>>> -     */
>>>       case KVM_S390_IO_ADAPTER_MAP:
>>>       case KVM_S390_IO_ADAPTER_UNMAP:
>>> -        ret = 0;
>>> +        mutex_lock(&dev->kvm->lock);
>>> +        if (kvm_s390_pv_is_protected(dev->kvm)) {
>>> +            mutex_unlock(&dev->kvm->lock);
>>> +            break;
>>> +        }
>>
>>
>> I guess this works for a well behaving userspaces, but a bad QEMU could in theory
>> not do the unmap on switch to secure.
>> Shall we maybe do -EINVAL on KVM_PV_ENABLE if there are still mapping left, or
>> to make it easier for userspace remove the old ADAPTER maps?
>>
> 
> Christian, thank you for your input. For this scenario, I will look into adding/testing removing the old adapter maps. I will start in kvm_s390_handle_pv() for CASE KVM_PV_ENABLE and I will essentially use most of the functionality in kvm_s390_destroy_adapters() where the maps are deleted if they exist.

Right, maybe just add a unmap_all_adapters function and call that.

> 
> Discussion: During development and test I realized it appears a guest can only change state between non-SE and SE during a reboot. Thus the unmap and map is called which hits the fencing in the current patch. Additionally, a more draconian fencing could possibly be done if needed, by checking for the existence of SE firmware in the CMDLINE and prevent any mapping from occurring on those systems that support SE.

Yes, diagnose 308 code 10 switches in a reboot like style to secure and diagnose 308 code 3 or 4 switch back to non secure.

Re: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
Posted by kernel test robot 1 month ago
Hi Douglas,

kernel test robot noticed the following build warnings:

[auto build test WARNING on v7.0-rc2]
[also build test WARNING on linus/master next-20260306]
[cannot apply to kvms390/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Douglas-Freimuth/Add-map-unmap-ioctl-and-clean-mappings-post-guest/20260308-110653
base:   v7.0-rc2
patch link:    https://lore.kernel.org/r/20260308030438.88580-2-freimuth%40linux.ibm.com
patch subject: [PATCH v1 1/3] Add map/unmap ioctl and clean mappings post-guest
config: s390-allmodconfig (https://download.01.org/0day-ci/archive/20260308/202603082108.iY5mWhWR-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260308/202603082108.iY5mWhWR-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603082108.iY5mWhWR-lkp@intel.com/

All warnings (new ones prefixed by >>):

   arch/s390/kvm/interrupt.c:2478:16: warning: unused variable 'flags' [-Wunused-variable]
    2478 |         unsigned long flags;
         |                       ^~~~~
>> arch/s390/kvm/interrupt.c:2578:7: warning: variable 'ret' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
    2578 |                 if (kvm_s390_pv_is_protected(dev->kvm)) {
         |                     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/s390/kvm/interrupt.c:2602:9: note: uninitialized use occurs here
    2602 |         return ret;
         |                ^~~
   arch/s390/kvm/interrupt.c:2578:3: note: remove the 'if' if its condition is always false
    2578 |                 if (kvm_s390_pv_is_protected(dev->kvm)) {
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2579 |                         mutex_unlock(&dev->kvm->lock);
         |                         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2580 |                         break;
         |                         ~~~~~~
    2581 |                 }
         |                 ~
   arch/s390/kvm/interrupt.c:2561:9: note: initialize the variable 'ret' to silence this warning
    2561 |         int ret, idx;
         |                ^
         |                 = 0
   2 warnings generated.


vim +2578 arch/s390/kvm/interrupt.c

  2554	
  2555	static int modify_io_adapter(struct kvm_device *dev,
  2556				     struct kvm_device_attr *attr)
  2557	{
  2558		struct kvm_s390_io_adapter_req req;
  2559		struct s390_io_adapter *adapter;
  2560		__u64 host_addr;
  2561		int ret, idx;
  2562	
  2563		if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
  2564			return -EFAULT;
  2565	
  2566		adapter = get_io_adapter(dev->kvm, req.id);
  2567		if (!adapter)
  2568			return -EINVAL;
  2569		switch (req.type) {
  2570		case KVM_S390_IO_ADAPTER_MASK:
  2571			ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask);
  2572			if (ret > 0)
  2573				ret = 0;
  2574			break;
  2575		case KVM_S390_IO_ADAPTER_MAP:
  2576		case KVM_S390_IO_ADAPTER_UNMAP:
  2577			mutex_lock(&dev->kvm->lock);
> 2578			if (kvm_s390_pv_is_protected(dev->kvm)) {
  2579				mutex_unlock(&dev->kvm->lock);
  2580				break;
  2581			}
  2582			mutex_unlock(&dev->kvm->lock);
  2583			idx = srcu_read_lock(&dev->kvm->srcu);
  2584			host_addr = gpa_to_hva(dev->kvm, req.addr);
  2585			if (kvm_is_error_hva(host_addr)) {
  2586				srcu_read_unlock(&dev->kvm->srcu, idx);
  2587				return -EFAULT;
  2588				}
  2589			srcu_read_unlock(&dev->kvm->srcu, idx);
  2590			if (req.type == KVM_S390_IO_ADAPTER_MAP) {
  2591				dev->kvm->stat.io_390_adapter_map++;
  2592				ret = kvm_s390_adapter_map(dev->kvm, req.id, host_addr);
  2593			} else {
  2594				dev->kvm->stat.io_390_adapter_unmap++;
  2595				ret = kvm_s390_adapter_unmap(dev->kvm, req.id, host_addr);
  2596			}
  2597			break;
  2598		default:
  2599			ret = -EINVAL;
  2600		}
  2601	
  2602		return ret;
  2603	}
  2604	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki