S390 needs map/unmap ioctls, which map the adapter set
indicator pages, so the pages can be accessed when interrupts are
disabled. The mappings are cleaned up when the guest is removed.
Map/Unmap ioctls are fenced in order to avoid the longterm pinning
in Secure Execution environments. In Secure Execution
environments the path of execution available before this patch is followed.
Statistical counters to count map/unmap functions for adapter indicator
pages are added. The counters can be used to analyze
map/unmap functions in non-Secure Execution environments and similarly
can be used to analyze Secure Execution environments where the counters
will not be incremented as the adapter indicator pages are not mapped.
Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>
---
arch/s390/include/asm/kvm_host.h | 5 +
arch/s390/kvm/interrupt.c | 151 +++++++++++++++++++++++++------
arch/s390/kvm/kvm-s390.c | 27 ++++++
3 files changed, 157 insertions(+), 26 deletions(-)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 3039c88daa63..a078420751a1 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -448,6 +448,8 @@ struct kvm_vcpu_arch {
struct kvm_vm_stat {
struct kvm_vm_stat_generic generic;
u64 inject_io;
+ u64 io_390_adapter_map;
+ u64 io_390_adapter_unmap;
u64 inject_float_mchk;
u64 inject_pfault_done;
u64 inject_service_signal;
@@ -479,6 +481,9 @@ struct s390_io_adapter {
bool masked;
bool swap;
bool suppressible;
+ spinlock_t maps_lock;
+ struct list_head maps;
+ unsigned int nr_maps;
};
#define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 7cb8ce833b62..47bd6361c849 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2426,6 +2426,9 @@ static int register_io_adapter(struct kvm_device *dev,
if (!adapter)
return -ENOMEM;
+ INIT_LIST_HEAD(&adapter->maps);
+ spin_lock_init(&adapter->maps_lock);
+ adapter->nr_maps = 0;
adapter->id = adapter_info.id;
adapter->isc = adapter_info.isc;
adapter->maskable = adapter_info.maskable;
@@ -2450,12 +2453,109 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
return ret;
}
+static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
+{
+ struct mm_struct *mm = kvm->mm;
+ struct page *page = NULL;
+ int locked = 1;
+
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+ get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
+ &page, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+
+ return page;
+}
+
+static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+ struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+ struct s390_map_info *map;
+ unsigned long flags;
+ int ret;
+
+ if (!adapter || !addr)
+ return -EINVAL;
+
+ map = kzalloc_obj(*map, GFP_KERNEL_ACCOUNT);
+ if (!map)
+ return -ENOMEM;
+
+ map->page = get_map_page(kvm, addr);
+ if (!map->page) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&map->list);
+ map->guest_addr = addr;
+ map->addr = addr;
+ spin_lock_irqsave(&adapter->maps_lock, flags);
+ if (adapter->nr_maps < MAX_S390_ADAPTER_MAPS) {
+ list_add_tail(&map->list, &adapter->maps);
+ adapter->nr_maps++;
+ ret = 0;
+ } else {
+ put_page(map->page);
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
+out:
+ if (ret)
+ kfree(map);
+ return ret;
+}
+
+static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
+{
+ struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
+ struct s390_map_info *map, *tmp;
+ unsigned long flags;
+ int found = 0, idx;
+
+ if (!adapter || !addr)
+ return -EINVAL;
+
+ list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
+ if (map->guest_addr == addr) {
+ spin_lock_irqsave(&adapter->maps_lock, flags);
+ found = 1;
+ adapter->nr_maps--;
+ list_del(&map->list);
+ spin_unlock_irqrestore(&adapter->maps_lock, flags);
+ idx = srcu_read_lock(&kvm->srcu);
+ mark_page_dirty(kvm, map->addr >> PAGE_SHIFT);
+ set_page_dirty_lock(map->page);
+ srcu_read_unlock(&kvm->srcu, idx);
+ put_page(map->page);
+ kfree(map);
+ break;
+ }
+ }
+
+ return found ? 0 : -ENOENT;
+}
+
void kvm_s390_destroy_adapters(struct kvm *kvm)
{
int i;
+ struct s390_map_info *map, *tmp;
- for (i = 0; i < MAX_S390_IO_ADAPTERS; i++)
+ for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
+ if (!kvm->arch.adapters[i])
+ continue;
+ list_for_each_entry_safe(map, tmp,
+ &kvm->arch.adapters[i]->maps, list) {
+ list_del(&map->list);
+ put_page(map->page);
+ kfree(map);
+ }
kfree(kvm->arch.adapters[i]);
+ }
}
static int modify_io_adapter(struct kvm_device *dev,
@@ -2463,7 +2563,8 @@ static int modify_io_adapter(struct kvm_device *dev,
{
struct kvm_s390_io_adapter_req req;
struct s390_io_adapter *adapter;
- int ret;
+ __u64 host_addr;
+ int ret, idx;
if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
return -EFAULT;
@@ -2477,14 +2578,30 @@ static int modify_io_adapter(struct kvm_device *dev,
if (ret > 0)
ret = 0;
break;
- /*
- * The following operations are no longer needed and therefore no-ops.
- * The gpa to hva translation is done when an IRQ route is set up. The
- * set_irq code uses get_user_pages_remote() to do the actual write.
- */
case KVM_S390_IO_ADAPTER_MAP:
case KVM_S390_IO_ADAPTER_UNMAP:
- ret = 0;
+ /* If in Secure Execution mode do not long term pin. */
+ mutex_lock(&dev->kvm->lock);
+ if (kvm_s390_pv_is_protected(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return 0;
+ }
+ idx = srcu_read_lock(&dev->kvm->srcu);
+ host_addr = gpa_to_hva(dev->kvm, req.addr);
+ if (kvm_is_error_hva(host_addr)) {
+ srcu_read_unlock(&dev->kvm->srcu, idx);
+ return -EFAULT;
+ }
+ srcu_read_unlock(&dev->kvm->srcu, idx);
+ if (req.type == KVM_S390_IO_ADAPTER_MAP) {
+ dev->kvm->stat.io_390_adapter_map++;
+ ret = kvm_s390_adapter_map(dev->kvm, req.id, host_addr);
+ mutex_unlock(&dev->kvm->lock);
+ } else {
+ dev->kvm->stat.io_390_adapter_unmap++;
+ ret = kvm_s390_adapter_unmap(dev->kvm, req.id, host_addr);
+ mutex_unlock(&dev->kvm->lock);
+ }
break;
default:
ret = -EINVAL;
@@ -2730,24 +2847,6 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
}
-static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
-{
- struct mm_struct *mm = kvm->mm;
- struct page *page = NULL;
- int locked = 1;
-
- if (mmget_not_zero(mm)) {
- mmap_read_lock(mm);
- get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
- &page, &locked);
- if (locked)
- mmap_read_unlock(mm);
- mmput(mm);
- }
-
- return page;
-}
-
static int adapter_indicators_set(struct kvm *kvm,
struct s390_io_adapter *adapter,
struct kvm_s390_adapter_int *adapter_int)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d7838334a338..4eada48c6e27 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -68,6 +68,8 @@
const struct kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, inject_io),
+ STATS_DESC_COUNTER(VM, io_390_adapter_map),
+ STATS_DESC_COUNTER(VM, io_390_adapter_unmap),
STATS_DESC_COUNTER(VM, inject_float_mchk),
STATS_DESC_COUNTER(VM, inject_pfault_done),
STATS_DESC_COUNTER(VM, inject_service_signal),
@@ -2491,6 +2493,30 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
return r;
}
+static void kvm_s390_unmap_all_adapters_pv(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct s390_map_info *map, *tmp;
+ int i, idx;
+
+ for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
+ if (!kvm->arch.adapters[i])
+ continue;
+ list_for_each_entry_safe(map, tmp,
+ &kvm->arch.adapters[i]->maps, list) {
+ spin_lock_irqsave(&kvm->arch.adapters[i]->maps_lock, flags);
+ list_del(&map->list);
+ spin_unlock_irqrestore(&kvm->arch.adapters[i]->maps_lock, flags);
+ idx = srcu_read_lock(&kvm->srcu);
+ mark_page_dirty(kvm, map->addr >> PAGE_SHIFT);
+ set_page_dirty_lock(map->page);
+ srcu_read_unlock(&kvm->srcu, idx);
+ put_page(map->page);
+ kfree(map);
+ }
+ }
+}
+
static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
{
const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM);
@@ -2507,6 +2533,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (kvm_s390_pv_is_protected(kvm))
break;
+ kvm_s390_unmap_all_adapters_pv(kvm);
mmap_write_lock(kvm->mm);
/*
* Disable creation of new THPs. Existing THPs can stay, they
--
2.52.0
On 4/6/26 2:44 AM, Douglas Freimuth wrote:
> S390 needs map/unmap ioctls, which map the adapter set
> indicator pages, so the pages can be accessed when interrupts are
> disabled. The mappings are cleaned up when the guest is removed.
>
> Map/Unmap ioctls are fenced in order to avoid the longterm pinning
> in Secure Execution environments. In Secure Execution
> environments the path of execution available before this patch is followed.
>
> Statistical counters to count map/unmap functions for adapter indicator
> pages are added. The counters can be used to analyze
> map/unmap functions in non-Secure Execution environments and similarly
> can be used to analyze Secure Execution environments where the counters
> will not be incremented as the adapter indicator pages are not mapped.
>
> Signed-off-by: Douglas Freimuth <freimuth@linux.ibm.com>
[...]
> +static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
> +{
> + struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
> + struct s390_map_info *map, *tmp;
> + unsigned long flags;
> + int found = 0, idx;
> +
> + if (!adapter || !addr)
> + return -EINVAL;
> +
> + list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
> + if (map->guest_addr == addr) {
> + spin_lock_irqsave(&adapter->maps_lock, flags);
This lock needs to be acquired before the list_for_each_entry_safe call,
that way it protects the list from changing until after you delete the
specified entry.
> + found = 1;
> + adapter->nr_maps--;
> + list_del(&map->list);
> + spin_unlock_irqrestore(&adapter->maps_lock, flags);
> + idx = srcu_read_lock(&kvm->srcu);
> + mark_page_dirty(kvm, map->addr >> PAGE_SHIFT);
This isn't the right address to mark dirty. Per
898885477e0f KVM: s390: Use guest address to mark guest page dirty
You need to keep track of the gaddr and use that to mark the page dirty.
> + set_page_dirty_lock(map->page);
> + srcu_read_unlock(&kvm->srcu, idx);
> + put_page(map->page);
> + kfree(map);
> + break;
> + }
> + }
> +
> + return found ? 0 : -ENOENT;
> +}
> +
> void kvm_s390_destroy_adapters(struct kvm *kvm)
> {
> int i;
> + struct s390_map_info *map, *tmp;
>
> - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++)
> + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
> + if (!kvm->arch.adapters[i])
> + continue;
Should also be holding the maps_lock over this until the list is emptied
(i.e. right up until the kfree(kvm->arch.adapters[i]) call).
> + list_for_each_entry_safe(map, tmp,
> + &kvm->arch.adapters[i]->maps, list) {
> + list_del(&map->list);
> + put_page(map->page);
> + kfree(map);
> + }
> kfree(kvm->arch.adapters[i]);
> + }
> }
>
> static int modify_io_adapter(struct kvm_device *dev,
> @@ -2463,7 +2563,8 @@ static int modify_io_adapter(struct kvm_device *dev,
> {
> struct kvm_s390_io_adapter_req req;
> struct s390_io_adapter *adapter;
> - int ret;
> + __u64 host_addr;
> + int ret, idx;
>
> if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
> return -EFAULT;
> @@ -2477,14 +2578,30 @@ static int modify_io_adapter(struct kvm_device *dev,
> if (ret > 0)
> ret = 0;
> break;
> - /*
> - * The following operations are no longer needed and therefore no-ops.
> - * The gpa to hva translation is done when an IRQ route is set up. The
> - * set_irq code uses get_user_pages_remote() to do the actual write.
> - */
> case KVM_S390_IO_ADAPTER_MAP:
> case KVM_S390_IO_ADAPTER_UNMAP:
> - ret = 0;
> + /* If in Secure Execution mode do not long term pin. */
> + mutex_lock(&dev->kvm->lock);
> + if (kvm_s390_pv_is_protected(dev->kvm)) {
> + mutex_unlock(&dev->kvm->lock);
> + return 0;
> + }
> + idx = srcu_read_lock(&dev->kvm->srcu);
> + host_addr = gpa_to_hva(dev->kvm, req.addr);
> + if (kvm_is_error_hva(host_addr)) {
> + srcu_read_unlock(&dev->kvm->srcu, idx);
dev->kvm->lock also needs to be dropped here.
> + return -EFAULT;
> + }
> + srcu_read_unlock(&dev->kvm->srcu, idx);
> + if (req.type == KVM_S390_IO_ADAPTER_MAP) {
> + dev->kvm->stat.io_390_adapter_map++;
> + ret = kvm_s390_adapter_map(dev->kvm, req.id, host_addr);
> + mutex_unlock(&dev->kvm->lock);
This unlock...
> + } else {
> + dev->kvm->stat.io_390_adapter_unmap++;
> + ret = kvm_s390_adapter_unmap(dev->kvm, req.id, host_addr);
> + mutex_unlock(&dev->kvm->lock);
and this unlock...
> + }
Could be combined and moved here
> break;
> default:
> ret = -EINVAL;
> @@ -2730,24 +2847,6 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
> return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
> }
>
> -static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
> -{
> - struct mm_struct *mm = kvm->mm;
> - struct page *page = NULL;
> - int locked = 1;
> -
> - if (mmget_not_zero(mm)) {
> - mmap_read_lock(mm);
> - get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
> - &page, &locked);
> - if (locked)
> - mmap_read_unlock(mm);
> - mmput(mm);
> - }
> -
> - return page;
> -}
> -
> static int adapter_indicators_set(struct kvm *kvm,
> struct s390_io_adapter *adapter,
> struct kvm_s390_adapter_int *adapter_int)
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index d7838334a338..4eada48c6e27 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -68,6 +68,8 @@
> const struct kvm_stats_desc kvm_vm_stats_desc[] = {
> KVM_GENERIC_VM_STATS(),
> STATS_DESC_COUNTER(VM, inject_io),
> + STATS_DESC_COUNTER(VM, io_390_adapter_map),
> + STATS_DESC_COUNTER(VM, io_390_adapter_unmap),
> STATS_DESC_COUNTER(VM, inject_float_mchk),
> STATS_DESC_COUNTER(VM, inject_pfault_done),
> STATS_DESC_COUNTER(VM, inject_service_signal),
> @@ -2491,6 +2493,30 @@ static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd,
> return r;
> }
>
> +static void kvm_s390_unmap_all_adapters_pv(struct kvm *kvm)
> +{
> + unsigned long flags;
> + struct s390_map_info *map, *tmp;
> + int i, idx;
> +
> + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
> + if (!kvm->arch.adapters[i])
> + continue;
> + list_for_each_entry_safe(map, tmp,
> + &kvm->arch.adapters[i]->maps, list) {
> + spin_lock_irqsave(&kvm->arch.adapters[i]->maps_lock, flags);
Same comment as kvm_s390_adapter_unmap, need to acquire this before the
list_for_each_entry_safe call.
Actually this one will need to be a bit more creative since you need to
drop the spinlock before each call to set_page_dirty_lock(), but you'll
need to re-acquire it again each time you go back to the list. That
makes list_for_each_entry_safe a bad choice.
Maybe using list_first_entry_or_null() each time you re-acquire the
spinlock, until you get a null (meaning the list is empty)?
> + list_del(&map->list);
You need to be decrementing nr_maps here before dropping the lock;
unlike kvm_s390_destroy_adapters we are not freeing the structure and if
we leave SE mode we could get more mappings later so the nr_maps value
has to be kept up-to-date.
> + spin_unlock_irqrestore(&kvm->arch.adapters[i]->maps_lock, flags);
> + idx = srcu_read_lock(&kvm->srcu);
> + mark_page_dirty(kvm, map->addr >> PAGE_SHIFT);
Same comment as above, need to use the gaddr
> + set_page_dirty_lock(map->page);
> + srcu_read_unlock(&kvm->srcu, idx);
> + put_page(map->page);
> + kfree(map);
> + }
> + }
> +}
> +
> static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
> {
> const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM);
> @@ -2507,6 +2533,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
> if (kvm_s390_pv_is_protected(kvm))
> break;
>
> + kvm_s390_unmap_all_adapters_pv(kvm);
> mmap_write_lock(kvm->mm);
> /*
> * Disable creation of new THPs. Existing THPs can stay, they
© 2016 - 2026 Red Hat, Inc.