Patch 3: This patch introduces kvm_arch_set_irq_inatomic, a fast path
for irq injection. Instead of placing all interrupts on the global work
queue as it does today, interrupts can now be delivered directly via
this fast path. Statistical counters have been added to enable analysis
of irq injection on the fast path and slow path: io_390_inatomic,
io_flic_inject_airq, io_set_adapter_int and io_390_inatomic_adapter_masked.
In order to use this kernel series with virtio-pci, a QEMU that includes
the 's390x/pci: set kvm_msi_via_irqfd_allowed' fix is needed.
Additionally, the guest xml needs a thread pool and threads explicitly
assigned per disk device using the common way of defining threads for
disks.
Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>
---
arch/s390/include/asm/kvm_host.h | 8 +-
arch/s390/kvm/interrupt.c | 169 ++++++++++++++++++++++++++-----
arch/s390/kvm/kvm-s390.c | 24 ++++-
arch/s390/kvm/kvm-s390.h | 3 +-
4 files changed, 170 insertions(+), 34 deletions(-)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 616be8ca4614..a194e9808ae3 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -359,7 +359,7 @@ struct kvm_s390_float_interrupt {
struct kvm_s390_mchk_info mchk;
struct kvm_s390_ext_info srv_signal;
int last_sleep_cpu;
- struct mutex ais_lock;
+ spinlock_t ais_lock;
u8 simm;
u8 nimm;
};
@@ -450,6 +450,10 @@ struct kvm_vm_stat {
u64 inject_io;
u64 io_390_adapter_map;
u64 io_390_adapter_unmap;
+ u64 io_390_inatomic;
+ u64 io_flic_inject_airq;
+ u64 io_set_adapter_int;
+ u64 io_390_inatomic_adapter_masked;
u64 inject_float_mchk;
u64 inject_pfault_done;
u64 inject_service_signal;
@@ -481,7 +485,7 @@ struct s390_io_adapter {
bool masked;
bool swap;
bool suppressible;
- struct rw_semaphore maps_lock;
+ spinlock_t maps_lock;
struct list_head maps;
unsigned int nr_maps;
};
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 350106f84763..0429a1bc4b7a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1963,15 +1963,10 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
}
int kvm_s390_inject_vm(struct kvm *kvm,
- struct kvm_s390_interrupt *s390int)
+ struct kvm_s390_interrupt *s390int, struct kvm_s390_interrupt_info *inti)
{
- struct kvm_s390_interrupt_info *inti;
int rc;
- inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT);
- if (!inti)
- return -ENOMEM;
-
inti->type = s390int->type;
switch (inti->type) {
case KVM_S390_INT_VIRTIO:
@@ -2284,6 +2279,7 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr)
{
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
struct kvm_s390_ais_all ais;
+ unsigned long flags;
if (attr->attr < sizeof(ais))
return -EINVAL;
@@ -2291,10 +2287,10 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr)
if (!test_kvm_facility(kvm, 72))
return -EOPNOTSUPP;
- mutex_lock(&fi->ais_lock);
+ spin_lock_irqsave(&fi->ais_lock, flags);
ais.simm = fi->simm;
ais.nimm = fi->nimm;
- mutex_unlock(&fi->ais_lock);
+ spin_unlock_irqrestore(&fi->ais_lock, flags);
if (copy_to_user((void __user *)attr->addr, &ais, sizeof(ais)))
return -EFAULT;
@@ -2427,7 +2423,7 @@ static int register_io_adapter(struct kvm_device *dev,
return -ENOMEM;
INIT_LIST_HEAD(&adapter->maps);
- init_rwsem(&adapter->maps_lock);
+ spin_lock_init(&adapter->maps_lock);
adapter->nr_maps = 0;
adapter->id = adapter_info.id;
adapter->isc = adapter_info.isc;
@@ -2494,7 +2490,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
INIT_LIST_HEAD(&map->list);
map->guest_addr = addr;
map->addr = addr;
- down_write(&adapter->maps_lock);
+ spin_lock_irqsave(&adapter->maps_lock, flags);
if (adapter->nr_maps++ < MAX_S390_ADAPTER_MAPS) {
list_add_tail(&map->list, &adapter->maps);
ret = 0;
@@ -2502,7 +2498,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
put_page(map->page);
ret = -EINVAL;
}
- up_write(&adapter->maps_lock);
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
out:
if (ret)
kfree(map);
@@ -2514,11 +2510,12 @@ static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
struct s390_map_info *map, *tmp;
int found = 0;
+ unsigned long flags;
if (!adapter || !addr)
return -EINVAL;
- down_write(&adapter->maps_lock);
+ spin_lock_irqsave(&adapter->maps_lock, flags);
list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
if (map->guest_addr == addr) {
found = 1;
@@ -2529,7 +2526,7 @@ static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
break;
}
}
- up_write(&adapter->maps_lock);
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
return found ? 0 : -ENOENT;
}
@@ -2630,6 +2627,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
struct kvm_s390_ais_req req;
int ret = 0;
+ unsigned long flags;
if (!test_kvm_facility(kvm, 72))
return -EOPNOTSUPP;
@@ -2646,7 +2644,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
2 : KVM_S390_AIS_MODE_SINGLE :
KVM_S390_AIS_MODE_ALL, req.mode);
- mutex_lock(&fi->ais_lock);
+ spin_lock_irqsave(&fi->ais_lock, flags);
switch (req.mode) {
case KVM_S390_AIS_MODE_ALL:
fi->simm &= ~AIS_MODE_MASK(req.isc);
@@ -2659,7 +2657,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr)
default:
ret = -EINVAL;
}
- mutex_unlock(&fi->ais_lock);
+ spin_unlock_irqrestore(&fi->ais_lock, flags);
return ret;
}
@@ -2673,25 +2671,33 @@ static int kvm_s390_inject_airq(struct kvm *kvm,
.parm = 0,
.parm64 = isc_to_int_word(adapter->isc),
};
+ struct kvm_s390_interrupt_info *inti;
+ unsigned long flags;
+
int ret = 0;
+ inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT);
+ if (!inti)
+ return -ENOMEM;
+
if (!test_kvm_facility(kvm, 72) || !adapter->suppressible)
- return kvm_s390_inject_vm(kvm, &s390int);
+ return kvm_s390_inject_vm(kvm, &s390int, inti);
- mutex_lock(&fi->ais_lock);
+ spin_lock_irqsave(&fi->ais_lock, flags);
if (fi->nimm & AIS_MODE_MASK(adapter->isc)) {
trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc);
+ kfree(inti);
goto out;
}
- ret = kvm_s390_inject_vm(kvm, &s390int);
+ ret = kvm_s390_inject_vm(kvm, &s390int, inti);
if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) {
fi->nimm |= AIS_MODE_MASK(adapter->isc);
trace_kvm_s390_modify_ais_mode(adapter->isc,
KVM_S390_AIS_MODE_SINGLE, 2);
}
out:
- mutex_unlock(&fi->ais_lock);
+ spin_unlock_irqrestore(&fi->ais_lock, flags);
return ret;
}
@@ -2700,6 +2706,8 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
unsigned int id = attr->attr;
struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+ kvm->stat.io_flic_inject_airq++;
+
if (!adapter)
return -EINVAL;
@@ -2710,6 +2718,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
{
struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
struct kvm_s390_ais_all ais;
+ unsigned long flags;
if (!test_kvm_facility(kvm, 72))
return -EOPNOTSUPP;
@@ -2717,10 +2726,10 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr)
if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais)))
return -EFAULT;
- mutex_lock(&fi->ais_lock);
+ spin_lock_irqsave(&fi->ais_lock, flags);
fi->simm = ais.simm;
fi->nimm = ais.nimm;
- mutex_unlock(&fi->ais_lock);
+ spin_unlock_irqrestore(&fi->ais_lock, flags);
return 0;
}
@@ -2852,17 +2861,20 @@ static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
}
static int adapter_indicators_set(struct kvm *kvm,
- struct s390_io_adapter *adapter,
- struct kvm_s390_adapter_int *adapter_int)
+ struct s390_io_adapter *adapter,
+ struct kvm_s390_adapter_int *adapter_int)
{
unsigned long bit;
int summary_set, idx;
struct s390_map_info *ind_info, *summary_info;
void *map;
struct page *ind_page, *summary_page;
+ unsigned long flags;
+ spin_lock_irqsave(&adapter->maps_lock, flags);
ind_info = get_map_info(adapter, adapter_int->ind_addr);
if (!ind_info) {
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
ind_page = get_map_page(kvm, adapter_int->ind_addr);
if (!ind_page)
return -1;
@@ -2878,8 +2890,13 @@ static int adapter_indicators_set(struct kvm *kvm,
map = page_address(ind_info->page);
bit = get_ind_bit(ind_info->addr, adapter_int->ind_offset, adapter->swap);
set_bit(bit, map);
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
}
+ spin_lock_irqsave(&adapter->maps_lock, flags);
+ summary_info = get_map_info(adapter, adapter_int->summary_addr);
+ if (!summary_info) {
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
summary_page = get_map_page(kvm, adapter_int->summary_addr);
if (!summary_page) {
put_page(ind_page);
@@ -2898,6 +2915,7 @@ static int adapter_indicators_set(struct kvm *kvm,
bit = get_ind_bit(summary_info->addr, adapter_int->summary_offset,
adapter->swap);
summary_set = test_and_set_bit(bit, map);
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
}
if (!ind_info)
@@ -2907,6 +2925,37 @@ static int adapter_indicators_set(struct kvm *kvm,
return summary_set ? 0 : 1;
}
+static int adapter_indicators_set_fast(struct kvm *kvm,
+ struct s390_io_adapter *adapter,
+ struct kvm_s390_adapter_int *adapter_int)
+{
+ unsigned long bit;
+ int summary_set;
+ struct s390_map_info *ind_info, *summary_info;
+ void *map;
+
+ spin_lock(&adapter->maps_lock);
+ ind_info = get_map_info(adapter, adapter_int->ind_addr);
+ if (!ind_info) {
+ spin_unlock(&adapter->maps_lock);
+ return -EWOULDBLOCK;
+ }
+ map = page_address(ind_info->page);
+ bit = get_ind_bit(ind_info->addr, adapter_int->ind_offset, adapter->swap);
+ set_bit(bit, map);
+ summary_info = get_map_info(adapter, adapter_int->summary_addr);
+ if (!summary_info) {
+ spin_unlock(&adapter->maps_lock);
+ return -EWOULDBLOCK;
+ }
+ map = page_address(summary_info->page);
+ bit = get_ind_bit(summary_info->addr, adapter_int->summary_offset,
+ adapter->swap);
+ summary_set = test_and_set_bit(bit, map);
+ spin_unlock(&adapter->maps_lock);
+ return summary_set ? 0 : 1;
+}
+
/*
* < 0 - not injected due to error
* = 0 - coalesced, summary indicator already active
@@ -2919,15 +2968,15 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
int ret;
struct s390_io_adapter *adapter;
+ kvm->stat.io_set_adapter_int++;
+
/* We're only interested in the 0->1 transition. */
if (!level)
return 0;
adapter = get_io_adapter(kvm, e->adapter.adapter_id);
if (!adapter)
return -1;
- down_read(&adapter->maps_lock);
ret = adapter_indicators_set(kvm, adapter, &e->adapter);
- up_read(&adapter->maps_lock);
if ((ret > 0) && !adapter->masked) {
ret = kvm_s390_inject_airq(kvm, adapter);
if (ret == 0)
@@ -2982,7 +3031,6 @@ int kvm_set_routing_entry(struct kvm *kvm,
int idx;
switch (ue->type) {
- /* we store the userspace addresses instead of the guest addresses */
case KVM_IRQ_ROUTING_S390_ADAPTER:
if (kvm_is_ucontrol(kvm))
return -EINVAL;
@@ -3565,3 +3613,72 @@ int __init kvm_s390_gib_init(u8 nisc)
out:
return rc;
}
+
+/*
+ * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
+ */
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ int ret;
+ struct s390_io_adapter *adapter;
+ struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
+ struct kvm_s390_interrupt_info *inti;
+ struct kvm_s390_interrupt s390int = {
+ .type = KVM_S390_INT_IO(1, 0, 0, 0),
+ .parm = 0,
+ };
+
+ kvm->stat.io_390_inatomic++;
+
+ /* We're only interested in the 0->1 transition. */
+ if (!level)
+ return -EWOULDBLOCK;
+ if (e->type != KVM_IRQ_ROUTING_S390_ADAPTER)
+ return -EWOULDBLOCK;
+
+ adapter = get_io_adapter(kvm, e->adapter.adapter_id);
+ if (!adapter)
+ return -EWOULDBLOCK;
+
+ s390int.parm64 = isc_to_int_word(adapter->isc);
+ ret = adapter_indicators_set_fast(kvm, adapter, &e->adapter);
+ if (ret < 0)
+ return -EWOULDBLOCK;
+ if (!ret || adapter->masked) {
+ kvm->stat.io_390_inatomic_adapter_masked++;
+ return 0;
+ }
+
+ inti = kzalloc_obj(*inti, GFP_ATOMIC);
+ if (!inti)
+ return -EWOULDBLOCK;
+
+ if (!test_kvm_facility(kvm, 72) || !adapter->suppressible) {
+ ret = kvm_s390_inject_vm(kvm, &s390int, inti);
+ if (ret == 0)
+ return ret;
+ else
+ return -EWOULDBLOCK;
+ }
+
+ spin_lock(&fi->ais_lock);
+ if (fi->nimm & AIS_MODE_MASK(adapter->isc)) {
+ trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc);
+ kfree(inti);
+ goto out;
+ }
+
+ ret = kvm_s390_inject_vm(kvm, &s390int, inti);
+ if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) {
+ fi->nimm |= AIS_MODE_MASK(adapter->isc);
+ trace_kvm_s390_modify_ais_mode(adapter->isc,
+ KVM_S390_AIS_MODE_SINGLE, 2);
+ }
+ goto out;
+
+out:
+ spin_unlock(&fi->ais_lock);
+ return 0;
+}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8e6532f55a5a..29386afe6b29 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -70,6 +70,10 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_COUNTER(VM, inject_io),
STATS_DESC_COUNTER(VM, io_390_adapter_map),
STATS_DESC_COUNTER(VM, io_390_adapter_unmap),
+ STATS_DESC_COUNTER(VM, io_390_inatomic),
+ STATS_DESC_COUNTER(VM, io_flic_inject_airq),
+ STATS_DESC_COUNTER(VM, io_set_adapter_int),
+ STATS_DESC_COUNTER(VM, io_390_inatomic_adapter_masked),
STATS_DESC_COUNTER(VM, inject_float_mchk),
STATS_DESC_COUNTER(VM, inject_pfault_done),
STATS_DESC_COUNTER(VM, inject_service_signal),
@@ -2844,6 +2848,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
void __user *argp = (void __user *)arg;
struct kvm_device_attr attr;
int r;
+ struct kvm_s390_interrupt_info *inti;
switch (ioctl) {
case KVM_S390_INTERRUPT: {
@@ -2852,7 +2857,10 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
r = -EFAULT;
if (copy_from_user(&s390int, argp, sizeof(s390int)))
break;
- r = kvm_s390_inject_vm(kvm, &s390int);
+ inti = kzalloc_obj(*inti, GFP_KERNEL_ACCOUNT);
+ if (!inti)
+ return -ENOMEM;
+ r = kvm_s390_inject_vm(kvm, &s390int, inti);
break;
}
case KVM_CREATE_IRQCHIP: {
@@ -3250,7 +3258,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_unlock(&kvm->lock);
}
- mutex_init(&kvm->arch.float_int.ais_lock);
+ spin_lock_init(&kvm->arch.float_int.ais_lock);
spin_lock_init(&kvm->arch.float_int.lock);
for (i = 0; i < FIRQ_LIST_COUNT; i++)
INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]);
@@ -4371,11 +4379,16 @@ int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clo
return 1;
}
-static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
- unsigned long token)
+static int __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
+ unsigned long token)
{
struct kvm_s390_interrupt inti;
struct kvm_s390_irq irq;
+ struct kvm_s390_interrupt_info *inti_mem;
+
+ inti_mem = kzalloc_obj(*inti_mem, GFP_KERNEL_ACCOUNT);
+ if (!inti_mem)
+ return -ENOMEM;
if (start_token) {
irq.u.ext.ext_params2 = token;
@@ -4384,8 +4397,9 @@ static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
} else {
inti.type = KVM_S390_INT_PFAULT_DONE;
inti.parm64 = token;
- WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti));
+ WARN_ON_ONCE(kvm_s390_inject_vm(vcpu->kvm, &inti, inti_mem));
}
+ return true;
}
bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index bf1d7798c1af..2f2da868a040 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -373,7 +373,8 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu);
void kvm_s390_clear_float_irqs(struct kvm *kvm);
int __must_check kvm_s390_inject_vm(struct kvm *kvm,
- struct kvm_s390_interrupt *s390int);
+ struct kvm_s390_interrupt *s390int,
+ struct kvm_s390_interrupt_info *inti);
int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
struct kvm_s390_irq *irq);
static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
--
2.52.0
On 3/7/26 10:04 PM, Douglas Freimuth wrote:
> Patch 3: This patch introduces kvm_arch_set_irq_inatomic, a fast path
See comments from patch 1 of this series.
> for irq injection. Instead of placing all interrupts on the global work
> queue as it does today, this patch provides a fast path for irq
Maybe add a bit that this depends on creating a path that cannot lose control, namely by pre-pinning the associated indicator pages, conditionally attempting allocations via GFP_ATOMIC and switching a mutex to a spinlock.
This would also be a good point to work in the fact that this is fenced for Secure Execution guests here rather than in patch 1, because the indicator pages will not be pre-pinned.
> injection. Statistical counters have been added to enable analysis of irq injection on
> the fast path and slow path including io_390_inatomic,
> io_flic_inject_airq, io_set_adapter_int and io_390_inatomic_adapter_masked.
> > In order to use this kernel series with virtio-pci, a QEMU that includes
> the 's390x/pci: set kvm_msi_via_irqfd_allowed' fix is needed.
> Additionally, the guest xml needs a thread pool and threads explicitly
> assigned per disk device using the common way of defining threads for
> disks.
This last paragraph, while relevant to testing you were doing, delves a bit too much into the specifics of when QEMU will/won't drive this code and doesn't need to be in kernel git history. I'd suggest either removing it or moving it to the cover-letter.
>
>
> Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>
> ---
> arch/s390/include/asm/kvm_host.h | 8 +-
> arch/s390/kvm/interrupt.c | 169 ++++++++++++++++++++++++++-----
> arch/s390/kvm/kvm-s390.c | 24 ++++-
> arch/s390/kvm/kvm-s390.h | 3 +-
> 4 files changed, 170 insertions(+), 34 deletions(-)
>
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index 616be8ca4614..a194e9808ae3 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -359,7 +359,7 @@ struct kvm_s390_float_interrupt {
> struct kvm_s390_mchk_info mchk;
> struct kvm_s390_ext_info srv_signal;
> int last_sleep_cpu;
> - struct mutex ais_lock;
> + spinlock_t ais_lock;
> u8 simm;
> u8 nimm;
> };
> @@ -450,6 +450,10 @@ struct kvm_vm_stat {
> u64 inject_io;
> u64 io_390_adapter_map;
> u64 io_390_adapter_unmap;
> + u64 io_390_inatomic;
> + u64 io_flic_inject_airq;
> + u64 io_set_adapter_int;
> + u64 io_390_inatomic_adapter_masked;
> u64 inject_float_mchk;
> u64 inject_pfault_done;
> u64 inject_service_signal;
> @@ -481,7 +485,7 @@ struct s390_io_adapter {
> bool masked;
> bool swap;
> bool suppressible;
> - struct rw_semaphore maps_lock;
> + spinlock_t maps_lock;
Patch 1 (re-)introduced the maps_lock, now you are converting it to a spinlock 2 patches later.
I realize that you were bringing back code that was previously removed by
f65470661f36 KVM: s390/interrupt: do not pin adapter interrupt pages
but for this series wouldn't it make sense to just start by introducing maps_lock as a spinlock from patch 1 vs re-introducing the semaphore for the span of 2 commits?
> @@ -2700,6 +2706,8 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
> unsigned int id = attr->attr;
> struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
>
> + kvm->stat.io_flic_inject_airq++;
> +
This just tracks how often the function is called, and includes instances where the adapter is NULL or the call to kvm_s390_inject_airq failed.
Do you want to actually track the number of successful injects only vs every time we call this routine?
> @@ -2919,15 +2968,15 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
> int ret;
> struct s390_io_adapter *adapter;
>
> + kvm->stat.io_set_adapter_int++;
> +
Same comment, would we rather track only the successful cases or only count times we enter the function including things like the 0->1 transition below?
Actually, the addition of this counter as well as io_flic_inject_airq seems like it should be a separate patch.
> /* We're only interested in the 0->1 transition. */
> if (!level)
> return 0;
...
> +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm *kvm, int irq_source_id, int level,
> + bool line_status)
> +{
> + int ret;
> + struct s390_io_adapter *adapter;
> + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
> + struct kvm_s390_interrupt_info *inti;
> + struct kvm_s390_interrupt s390int = {
> + .type = KVM_S390_INT_IO(1, 0, 0, 0),
> + .parm = 0,
> + };
> +
> + kvm->stat.io_390_inatomic++;
Is this the right time to increment this value? There are plenty of reasons we could -EWOULDBLOCK after this point and fall back to scheduling it.
So this will only count how often we call here from irqfd_wakeup() and not how often we actually successfully make it thru the inatomic operation.
> +
> + /* We're only interested in the 0->1 transition. */
> + if (!level)
> + return -EWOULDBLOCK;
> + if (e->type != KVM_IRQ_ROUTING_S390_ADAPTER)
> + return -EWOULDBLOCK;
> +
> + adapter = get_io_adapter(kvm, e->adapter.adapter_id);
> + if (!adapter)
> + return -EWOULDBLOCK;
> +
> + s390int.parm64 = isc_to_int_word(adapter->isc);
> + ret = adapter_indicators_set_fast(kvm, adapter, &e->adapter);
> + if (ret < 0)
> + return -EWOULDBLOCK;
We know for sure that a guest that is running in SE will always hit this because no mappings will be available.
Did you test if it would be more efficient to check the kvm for SE at the start of kvm_arch_set_irq_inatomic() and immediately return -EWOULDBLOCK there?
> + if (!ret || adapter->masked) {
> + kvm->stat.io_390_inatomic_adapter_masked++;
> + return 0;
> + }
> +
> + inti = kzalloc_obj(*inti, GFP_ATOMIC);
> + if (!inti)
> + return -EWOULDBLOCK;
> +
> + if (!test_kvm_facility(kvm, 72) || !adapter->suppressible) {
> + ret = kvm_s390_inject_vm(kvm, &s390int, inti);
> + if (ret == 0)
This to me seems like the right spot to kvm->stat.io_390_inatomic++ unless I misunderstand the intent of the counter.
On 3/11/26 4:24 PM, Matthew Rosato wrote:
> On 3/7/26 10:04 PM, Douglas Freimuth wrote:
>> Patch 3: This patch introduces kvm_arch_set_irq_inatomic, a fast path
>
> See comments from patch 1 of this series.
>
>> for irq injection. Instead of placing all interrupts on the global work
>> queue as it does today, this patch provides a fast path for irq
>
> Maybe add a bit that this depends on creating a path that cannot lose control, namely by pre-pinning the associated indicator pages, conditionally attempting allocations via GFP_ATOMIC and switching a mutex to a spinlock.
>
> This would also be a good point to work in the fact that this is fenced for Secure Execution guests here rather than in patch 1, because the indicator pages will not be pre-pinned.
>
>
>> injection. Statistical counters have been added to enable analysis of irq injection on
>> the fast path and slow path including io_390_inatomic,
>> io_flic_inject_airq, io_set_adapter_int and io_390_inatomic_adapter_masked.
>>> In order to use this kernel series with virtio-pci, a QEMU that includes
>> the 's390x/pci: set kvm_msi_via_irqfd_allowed' fix is needed.
>> Additionally, the guest xml needs a thread pool and threads explicitly
>> assigned per disk device using the common way of defining threads for
>> disks.
>
> This last paragraph, while relevant to testing you were doing, delves a bit too much into the specifics of when QEMU will/won't drive this code and doesn't need to be in kernel git history. I'd suggest either removing it or moving it to the cover-letter.
>
>>
>>
>> Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>
>> ---
>> arch/s390/include/asm/kvm_host.h | 8 +-
>> arch/s390/kvm/interrupt.c | 169 ++++++++++++++++++++++++++-----
>> arch/s390/kvm/kvm-s390.c | 24 ++++-
>> arch/s390/kvm/kvm-s390.h | 3 +-
>> 4 files changed, 170 insertions(+), 34 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
>> index 616be8ca4614..a194e9808ae3 100644
>> --- a/arch/s390/include/asm/kvm_host.h
>> +++ b/arch/s390/include/asm/kvm_host.h
>> @@ -359,7 +359,7 @@ struct kvm_s390_float_interrupt {
>> struct kvm_s390_mchk_info mchk;
>> struct kvm_s390_ext_info srv_signal;
>> int last_sleep_cpu;
>> - struct mutex ais_lock;
>> + spinlock_t ais_lock;
>> u8 simm;
>> u8 nimm;
>> };
>> @@ -450,6 +450,10 @@ struct kvm_vm_stat {
>> u64 inject_io;
>> u64 io_390_adapter_map;
>> u64 io_390_adapter_unmap;
>> + u64 io_390_inatomic;
>> + u64 io_flic_inject_airq;
>> + u64 io_set_adapter_int;
>> + u64 io_390_inatomic_adapter_masked;
>> u64 inject_float_mchk;
>> u64 inject_pfault_done;
>> u64 inject_service_signal;
>> @@ -481,7 +485,7 @@ struct s390_io_adapter {
>> bool masked;
>> bool swap;
>> bool suppressible;
>> - struct rw_semaphore maps_lock;
>> + spinlock_t maps_lock;
>
> Patch 1 (re-)introduced the maps_lock, now you are converting it to a spinlock 2 patches later.
>
> I realize that you were bringing back code that was previously removed by
>
> f65470661f36 KVM: s390/interrupt: do not pin adapter interrupt pages
>
> but for this series wouldn't it make sense to just start by introducing maps_lock as a spinlock from patch 1 vs re-introducing the semaphore for the span of 2 commits?
>
Matt, thank you for your input. The policy of individual patches not
only compiling but having individual utility and standing on their own
applies here. In patches 1 and 2 the maps_lock is more flexible as a
semaphore providing utility for the patch. While in patch 3 the
maps_lock has to be a spin_lock so inatomic can use it with interrupts
disabled.
>
>
>> @@ -2700,6 +2706,8 @@ static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr)
>> unsigned int id = attr->attr;
>> struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
>>
>> + kvm->stat.io_flic_inject_airq++;
>> +
>
> This just tracks how often the function is called, and includes instances where the adapter is NULL or the call to kvm_s390_inject_airq failed.
>
> Do you want to actually track the number of successful injects only vs every time we call this routine?
>
This response applies to all of the following comments about counters
including this one. At this stage I do want to focus on the statistics
for the number of times we enter the top of the relevant functions. For
example, if we put the counter at the bottom of flic_inject_airq or
kvm_arch_set_irq_inatomic, it will be silent in cases of a failed
conditional. On failure of inatomic we would only see the flic counter
going up, which could be misleading. The current counter placement
confirms that the correct path of execution is being taken and shows the
volume, which is valuable for initial deployments so we know the system
is being configured correctly, and it provides performance insights.
a performance improvement with inatomic fast inject. If the system
performance doesn't improve when it is deployed the first thing to check
is the volume of each of these counters. As we observe steady state
usage behavior in the field we could then consider a different focus of
the counters.
>
>
>> @@ -2919,15 +2968,15 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
>> int ret;
>> struct s390_io_adapter *adapter;
>>
>> + kvm->stat.io_set_adapter_int++;
>> +
>
> Same comment, would we rather track only the successful cases or only count times we enter the function including things like the 0->1 transition below?
>
> Actually, the addition of this counter as well as io_flic_inject_airq seems like it should be a separate patch.
>
>> /* We're only interested in the 0->1 transition. */
>> if (!level)
>> return 0;
>
> ...
>
>> +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
>> + struct kvm *kvm, int irq_source_id, int level,
>> + bool line_status)
>> +{
>> + int ret;
>> + struct s390_io_adapter *adapter;
>> + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
>> + struct kvm_s390_interrupt_info *inti;
>> + struct kvm_s390_interrupt s390int = {
>> + .type = KVM_S390_INT_IO(1, 0, 0, 0),
>> + .parm = 0,
>> + };
>> +
>> + kvm->stat.io_390_inatomic++;
>
> Is this the right time to increment this value? There are plenty of reasons we could -EWOULDBLOCK after this point and fall back to scheduling it.
>
> So this will only count how often we call here from irqfd_wakeup() and not how often we actually successfully make it thru the inatomic operation.
>
>> +
>> + /* We're only interested in the 0->1 transition. */
>> + if (!level)
>> + return -EWOULDBLOCK;
>> + if (e->type != KVM_IRQ_ROUTING_S390_ADAPTER)
>> + return -EWOULDBLOCK;
>> +
>> + adapter = get_io_adapter(kvm, e->adapter.adapter_id);
>> + if (!adapter)
>> + return -EWOULDBLOCK;
>> +
>> + s390int.parm64 = isc_to_int_word(adapter->isc);
>> + ret = adapter_indicators_set_fast(kvm, adapter, &e->adapter);
>> + if (ret < 0)
>> + return -EWOULDBLOCK;
>
> We know for sure that a guest that is running in SE will always hit this because no mappings will be available.
> Did you test if it would be more efficient to check the kvm for SE at the start of kvm_arch_set_irq_inatomic() and immediately return -EWOULDBLOCK there?
>
It might be slightly more efficient in SE environments to immediately
return -EWOULDBLOCK at the start of kvm_arch_set_irq_inatomic. But the
change would be fairly broad since it would require changing the mutex
for kvm->lock to a spin_lock to allow checking if pv is present with
interrupts disabled. I would recommend this for a follow-on if behavior
in the field requires it.
>> + if (!ret || adapter->masked) {
>> + kvm->stat.io_390_inatomic_adapter_masked++;
>> + return 0;
>> + }
>> +
>> + inti = kzalloc_obj(*inti, GFP_ATOMIC);
>> + if (!inti)
>> + return -EWOULDBLOCK;
>> +
>> + if (!test_kvm_facility(kvm, 72) || !adapter->suppressible) {
>> + ret = kvm_s390_inject_vm(kvm, &s390int, inti);
>> + if (ret == 0)
>
> This to me seems like the right spot to kvm->stat.io_390_inatomic++ unless I misunderstand the intent of the counter.
>
>
>
On 3/13/26 10:01 AM, Douglas Freimuth wrote:
>>> @@ -450,6 +450,10 @@ struct kvm_vm_stat {
>>> u64 inject_io;
>>> u64 io_390_adapter_map;
>>> u64 io_390_adapter_unmap;
>>> + u64 io_390_inatomic;
>>> + u64 io_flic_inject_airq;
>>> + u64 io_set_adapter_int;
>>> + u64 io_390_inatomic_adapter_masked;
>>> u64 inject_float_mchk;
>>> u64 inject_pfault_done;
>>> u64 inject_service_signal;
>>> @@ -481,7 +485,7 @@ struct s390_io_adapter {
>>> bool masked;
>>> bool swap;
>>> bool suppressible;
>>> - struct rw_semaphore maps_lock;
>>> + spinlock_t maps_lock;
>>
>> Patch 1 (re-)introduced the maps_lock, now you are converting it to a spinlock 2 patches later.
>>
>> I realize that you were bringing back code that was previously removed by
>>
>> f65470661f36 KVM: s390/interrupt: do not pin adapter interrupt pages
>>
>> but for this series wouldn't it make sense to just start by introducing maps_lock as a spinlock from patch 1 vs re-introducing the semaphore for the span of 2 commits?
>>
>
> Matt, thank you for your input. The policy of individual patches not only compiling but having individual utility and standing on their own applies here. In patches 1 and 2 the maps_lock is more flexible as a semaphore providing utility for the patch. While in patch 3 the maps_lock has to be a spin_lock so inatomic can use it with interrupts disabled.
I would agree completely if these were 2 separate series, with some period of time in between when the semaphore is implemented and a subsequent switch to a spinlock.
But here you are introducing a semaphore and immediately replacing it with a spinlock _all within a single series_, such that the semaphore implementation will never be used in practice.
In the end, that just creates a bunch of extra insertions and subsequent deletions of those insertions all within this series.
Yes, the semaphore would be the preferred implementation if allowed to sleep, but since the goal of the series is to implement kvm_arch_set_irq_inatomic you can just indicate in patch 1 why you are already switching to a spinlock when you re-introduce these ioctl functions (e.g. in preparation for kvm_arch_set_irq_inatomic support, which requires a thread of execution that will not sleep).
>>> +
>>> + /* We're only interested in the 0->1 transition. */
>>> + if (!level)
>>> + return -EWOULDBLOCK;
>>> + if (e->type != KVM_IRQ_ROUTING_S390_ADAPTER)
>>> + return -EWOULDBLOCK;
>>> +
>>> + adapter = get_io_adapter(kvm, e->adapter.adapter_id);
>>> + if (!adapter)
>>> + return -EWOULDBLOCK;
>>> +
>>> + s390int.parm64 = isc_to_int_word(adapter->isc);
>>> + ret = adapter_indicators_set_fast(kvm, adapter, &e->adapter);
>>> + if (ret < 0)
>>> + return -EWOULDBLOCK;
>>
>> We know for sure that a guest that is running in SE will always hit this because no mappings will be available.
>> Did you test if it would be more efficient to check the kvm for SE at the start of kvm_arch_set_irq_inatomic() and immediately return -EWOULDBLOCK there?
>>
>
> It might be slightly more efficient in SE environments to immediately return -EWOULDBLOCK at the start of kvm_arch_set_irq_inatomic. But the change would be fairly broad since it would require changing the mutex for kvm->lock to a spin_lock to allow checking if pv is present with interrupts disabled. I would recommend this for a follow-on if behavior in the field requires it.
Right, I forgot about the mutex.
OK, then I agree let's leave this for follow-on after this series lands.
I suspect that you could get away with peeking the value without the mutex held as a heuristic. If we get it wrong it would be during a transition into/out of SE and either...
1) we -EWOULDBLOCK and fallback to irqfd_inject when we could have continued down the inatomic path -- irqfd_inject should always work.
or
2) we would proceed into the inatomic path and then get kicked out as we do with this current implementation: when we attempt to find the pre-pinned mapping (which is protected by a spinlock).
The questions to answer would be whether it's permissible to peek the value and whether it buys us enough to be worth it.
On 3/16/26 11:41 AM, Matthew Rosato wrote:
> On 3/13/26 10:01 AM, Douglas Freimuth wrote:
>
>>>> @@ -450,6 +450,10 @@ struct kvm_vm_stat {
>>>> u64 inject_io;
>>>> u64 io_390_adapter_map;
>>>> u64 io_390_adapter_unmap;
>>>> + u64 io_390_inatomic;
>>>> + u64 io_flic_inject_airq;
>>>> + u64 io_set_adapter_int;
>>>> + u64 io_390_inatomic_adapter_masked;
>>>> u64 inject_float_mchk;
>>>> u64 inject_pfault_done;
>>>> u64 inject_service_signal;
>>>> @@ -481,7 +485,7 @@ struct s390_io_adapter {
>>>> bool masked;
>>>> bool swap;
>>>> bool suppressible;
>>>> - struct rw_semaphore maps_lock;
>>>> + spinlock_t maps_lock;
>>>
>>> Patch 1 (re-)introduced the maps_lock, now you are converting it to a spinlock 2 patches later.
>>>
>>> I realize that you were bringing back code that was previously removed by
>>>
>>> f65470661f36 KVM: s390/interrupt: do not pin adapter interrupt pages
>>>
>>> but for this series wouldn't it make sense to just start by introducing maps_lock as a spinlock from patch 1 vs re-introducing the semaphore for the span of 2 commits?
>>>
>>
>> Matt, thank you for your input. The policy of individual patches not only compiling but having individual utility and standing on their own applies here. In patches 1 and 2 the maps_lock is more flexible as a semaphore providing utility for the patch. While in patch 3 the maps_lock has to be a spin_lock so inatomic can use it with interrupts disabled.
>
> I would agree completely if these were 2 separate series, with some period of time in between when the semaphore is implemented and a subsequent switch to a spinlock.
>
> But here you are introducing a semaphore and immediately replacing it with a spinlock _all within a single series_, such that the semaphore implementation will never be used in practice.
>
> In the end, that just creates a bunch of extra insertions and subsequent deletions of those insertions all within this series.
>
> Yes, the semaphore would be the preferred implementation if allowed to sleep but since the goal of the series is to implement kvm_arch_set_irq_inatomic then you can just indicate in patch 1 why you are already switching to a spinlock when you re-introduce these ioctl functions (e.g. in preparation for kvm_arch_set_irq_inatomic support which requires a thread of execution that will not sleep).
>
I will make the change and note the reasoning in Patch 1.
>>>> +
>>>> + /* We're only interested in the 0->1 transition. */
>>>> + if (!level)
>>>> + return -EWOULDBLOCK;
>>>> + if (e->type != KVM_IRQ_ROUTING_S390_ADAPTER)
>>>> + return -EWOULDBLOCK;
>>>> +
>>>> + adapter = get_io_adapter(kvm, e->adapter.adapter_id);
>>>> + if (!adapter)
>>>> + return -EWOULDBLOCK;
>>>> +
>>>> + s390int.parm64 = isc_to_int_word(adapter->isc);
>>>> + ret = adapter_indicators_set_fast(kvm, adapter, &e->adapter);
>>>> + if (ret < 0)
>>>> + return -EWOULDBLOCK;
>>>
>>> We know for sure that a guest that is running in SE will always hit this because no mappings will be available.
>>> Did you test if it would be more efficient to check the kvm for SE at the start of kvm_arch_set_irq_inatomic() and immediately return -EWOULDBLOCK there?
>>>
>>
>> It might be slightly more efficient in SE environments to immediately return -EWOULDBLOCK at the start of kvm_arch_set_irq_inatomic. But the change would be fairly broad since it would require changing the mutex for kvm->lock to a spin_lock to allow checking if pv is present with interrupts disabled. I would recommend this for a follow-on if behavior in the field requires it.
>
> Right, I forgot about the mutex.
>
> OK, then I agree let's leave this for follow-on after this series lands.
>
> I suspect that you could get away with peeking the value without the mutex held as a heuristic. If we get it wrong it would be during a transition into/out of SE and either...
>
> 1) we -EWOULDBLOCK and fallback to irqfd_inject when we could have continued down the inatomic path -- irqfd_inject should always work.
>
> or
>
> 2) we would proceed into the inatomic path and then get kicked out as we do with this current implementation: when we attempt to find the pre-pinned mapping (which is protected by a spinlock).
>
> The questions to answer would be whether it's permissible to peek the value and whether it buys us enough to be worth it.
>
>
© 2016 - 2026 Red Hat, Inc.