On s390 systems, which use a machine level hypervisor, PCI devices are
always accessed through a form of PCI pass-through which fundamentally
operates on a per PCI function granularity. This is also reflected in the
s390 PCI hotplug driver which creates hotplug slots for individual PCI
functions. Its reset_slot() function, which is a wrapper for
zpci_hot_reset_device(), thus also resets individual functions.
Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object
to multifunction devices. This approach worked fine on s390 systems that
only exposed virtual functions as individual PCI domains to the operating
system. Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions")
s390 supports exposing the topology of multifunction PCI devices by
grouping them in a shared PCI domain. When attempting to reset a function
through the hotplug driver, the shared slot assignment causes the wrong
function to be reset instead of the intended one. It also leaks memory as
we do create a pci_slot object for the function, but don't correctly free
it in pci_slot_release().
Add a flag for struct pci_slot to allow per function PCI slots for
functions managed through a hypervisor, which exposes individual PCI
functions while retaining the topology. Since we can use all 8 bits
for slot 'number' (for ARI devices), change slot 'number' to u16 to
account for special values -1 and PCI_SLOT_ALL_DEVICES.
Fixes: 44510d6fa0c0 ("s390/pci: Handling multifunctions")
Cc: stable@vger.kernel.org
Suggested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
drivers/pci/pci.c | 5 +++--
drivers/pci/slot.c | 31 ++++++++++++++++++++++++-------
include/linux/pci.h | 5 +++--
3 files changed, 30 insertions(+), 11 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 115e5c11bab3..a93084053537 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4867,8 +4867,9 @@ static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, bool probe)
static int pci_dev_reset_slot_function(struct pci_dev *dev, bool probe)
{
- if (dev->multifunction || dev->subordinate || !dev->slot ||
- dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET)
+ if (dev->subordinate || !dev->slot ||
+ dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET ||
+ (dev->multifunction && !dev->slot->per_func_slot))
return -ENOTTY;
return pci_reset_hotplug_slot(dev->slot->hotplug, probe);
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index e0b7fb43423c..8842fa069392 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -37,7 +37,7 @@ static const struct sysfs_ops pci_slot_sysfs_ops = {
static ssize_t address_read_file(struct pci_slot *slot, char *buf)
{
- if (slot->number == 0xff)
+ if (slot->number == (u16)-1)
return sysfs_emit(buf, "%04x:%02x\n",
pci_domain_nr(slot->bus),
slot->bus->number);
@@ -72,6 +72,23 @@ static ssize_t cur_speed_read_file(struct pci_slot *slot, char *buf)
return bus_speed_read(slot->bus->cur_bus_speed, buf);
}
+static bool pci_dev_matches_slot(struct pci_dev *dev, struct pci_slot *slot)
+{
+ if (slot->per_func_slot)
+ return dev->devfn == slot->number;
+
+ return slot->number == PCI_SLOT_ALL_DEVICES ||
+ PCI_SLOT(dev->devfn) == slot->number;
+}
+
+static bool pci_slot_enabled_per_func(void)
+{
+ if (IS_ENABLED(CONFIG_S390))
+ return true;
+
+ return false;
+}
+
static void pci_slot_release(struct kobject *kobj)
{
struct pci_dev *dev;
@@ -82,8 +99,7 @@ static void pci_slot_release(struct kobject *kobj)
down_read(&pci_bus_sem);
list_for_each_entry(dev, &slot->bus->devices, bus_list)
- if (slot->number == PCI_SLOT_ALL_DEVICES ||
- PCI_SLOT(dev->devfn) == slot->number)
+ if (pci_dev_matches_slot(dev, slot))
dev->slot = NULL;
up_read(&pci_bus_sem);
@@ -176,8 +192,7 @@ void pci_dev_assign_slot(struct pci_dev *dev)
mutex_lock(&pci_slot_mutex);
list_for_each_entry(slot, &dev->bus->slots, list)
- if (slot->number == PCI_SLOT_ALL_DEVICES ||
- PCI_SLOT(dev->devfn) == slot->number)
+ if (pci_dev_matches_slot(dev, slot))
dev->slot = slot;
mutex_unlock(&pci_slot_mutex);
}
@@ -287,6 +302,9 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
slot->bus = pci_bus_get(parent);
slot->number = slot_nr;
+ if (pci_slot_enabled_per_func())
+ slot->per_func_slot = 1;
+
slot->kobj.kset = pci_slots_kset;
slot_name = make_slot_name(name);
@@ -307,8 +325,7 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
down_read(&pci_bus_sem);
list_for_each_entry(dev, &parent->devices, bus_list)
- if (slot_nr == PCI_SLOT_ALL_DEVICES ||
- PCI_SLOT(dev->devfn) == slot_nr)
+ if (pci_dev_matches_slot(dev, slot))
dev->slot = slot;
up_read(&pci_bus_sem);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8861eeb4381d..50a84bba3c91 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -78,14 +78,15 @@
* and, if ARI Forwarding is enabled, functions may appear to be on multiple
* devices.
*/
-#define PCI_SLOT_ALL_DEVICES 0xfe
+#define PCI_SLOT_ALL_DEVICES 0xfeff
/* pci_slot represents a physical slot */
struct pci_slot {
struct pci_bus *bus; /* Bus this slot is on */
struct list_head list; /* Node in list of slots */
struct hotplug_slot *hotplug; /* Hotplug info (move here) */
- unsigned char number; /* Device nr, or PCI_SLOT_ALL_DEVICES */
+ u16 number; /* Device nr, or PCI_SLOT_ALL_DEVICES */
+ unsigned int per_func_slot:1; /* Allow per function slot */
struct kobject kobj;
};
--
2.43.0
On Mon, Mar 16, 2026 at 12:15:36PM -0700, Farhan Ali wrote:
> On s390 systems, which use a machine level hypervisor, PCI devices are
> always accessed through a form of PCI pass-through which fundamentally
> operates on a per PCI function granularity. This is also reflected in the
> s390 PCI hotplug driver which creates hotplug slots for individual PCI
> functions. Its reset_slot() function, which is a wrapper for
> zpci_hot_reset_device(), thus also resets individual functions.
I think this "pass-through" is from the hypervisor to Linux, i.e.,
what we think of as the host kernel, right?
> Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object
> to multifunction devices. This approach worked fine on s390 systems that
> only exposed virtual functions as individual PCI domains to the operating
> system. Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions")
> s390 supports exposing the topology of multifunction PCI devices by
> grouping them in a shared PCI domain. When attempting to reset a function
> through the hotplug driver, the shared slot assignment causes the wrong
> function to be reset instead of the intended one. It also leaks memory as
> we do create a pci_slot object for the function, but don't correctly free
> it in pci_slot_release().
This alludes to the patch fixing a reset issue, but I think it should
be more prominent, e.g., the reset and leak fixes could be a separate
paragraph. The subject line should also mention at least the reset
fix.
> Add a flag for struct pci_slot to allow per function PCI slots for
> functions managed through a hypervisor, which exposes individual PCI
> functions while retaining the topology. Since we can use all 8 bits
> for slot 'number' (for ARI devices), change slot 'number' u16 to
> account for special values -1 and PCI_SLOT_ALL_DEVICES.
> ...
> static ssize_t address_read_file(struct pci_slot *slot, char *buf)
> {
> - if (slot->number == 0xff)
> + if (slot->number == (u16)-1)
This "-1" is mentioned in the commit log, but I don't know where it
came from. I guess we must assign -1 as a default somewhere? Could
this be a #define to connect that assignment with this test?
> return sysfs_emit(buf, "%04x:%02x\n",
> pci_domain_nr(slot->bus),
> slot->bus->number);
On 3/24/2026 2:55 PM, Bjorn Helgaas wrote:
> On Mon, Mar 16, 2026 at 12:15:36PM -0700, Farhan Ali wrote:
>> On s390 systems, which use a machine level hypervisor, PCI devices are
>> always accessed through a form of PCI pass-through which fundamentally
>> operates on a per PCI function granularity. This is also reflected in the
>> s390 PCI hotplug driver which creates hotplug slots for individual PCI
>> functions. Its reset_slot() function, which is a wrapper for
>> zpci_hot_reset_device(), thus also resets individual functions.
> I think this "pass-through" is from the hypervisor to Linux, i.e.,
> what we think of as the host kernel, right?
Yes, on s390x we have the PR/SM hypervisor, which would do this
pass-through to Linux. Linux would be running in an LPAR (Logical
Partition) created by the PR/SM hypervisor. For end users an LPAR is
the 'host' for all practical purposes.
>
>> Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object
>> to multifunction devices. This approach worked fine on s390 systems that
>> only exposed virtual functions as individual PCI domains to the operating
>> system. Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions")
>> s390 supports exposing the topology of multifunction PCI devices by
>> grouping them in a shared PCI domain. When attempting to reset a function
>> through the hotplug driver, the shared slot assignment causes the wrong
>> function to be reset instead of the intended one. It also leaks memory as
>> we do create a pci_slot object for the function, but don't correctly free
>> it in pci_slot_release().
> This alludes to the patch fixing a reset issue, but I think it should
> be more prominent, e.g., the reset and leak fixes could be a separate
> paragraph. The subject line should also mention at least the reset
> fix.
Will fix this. I will move what we fix into a separate paragraph.
>
>> Add a flag for struct pci_slot to allow per function PCI slots for
>> functions managed through a hypervisor, which exposes individual PCI
>> functions while retaining the topology. Since we can use all 8 bits
>> for slot 'number' (for ARI devices), change slot 'number' u16 to
>> account for special values -1 and PCI_SLOT_ALL_DEVICES.
>> ...
>> static ssize_t address_read_file(struct pci_slot *slot, char *buf)
>> {
>> - if (slot->number == 0xff)
>> + if (slot->number == (u16)-1)
> This "-1" is mentioned in the commit log, but I don't know where it
> came from. I guess we must assign -1 as a default somewhere? Could
> this be a #define to connect that assignment with this test?
The -1 is used as a placeholder and from what I could tell
rpaphp_register_slot() would be the only one to use this. Would you
prefer this to be a #define?
Thanks
Farhan
>> return sysfs_emit(buf, "%04x:%02x\n",
>> pci_domain_nr(slot->bus),
>> slot->bus->number);
On Tue, Mar 24, 2026 at 04:08:28PM -0700, Farhan Ali wrote:
> On 3/24/2026 2:55 PM, Bjorn Helgaas wrote:
> > On Mon, Mar 16, 2026 at 12:15:36PM -0700, Farhan Ali wrote:
> > > On s390 systems, which use a machine level hypervisor, PCI devices are
> > > always accessed through a form of PCI pass-through which fundamentally
> > > operates on a per PCI function granularity. This is also reflected in the
> > > s390 PCI hotplug driver which creates hotplug slots for individual PCI
> > > functions. Its reset_slot() function, which is a wrapper for
> > > zpci_hot_reset_device(), thus also resets individual functions.
> ...
> > > static ssize_t address_read_file(struct pci_slot *slot, char *buf)
> > > {
> > > - if (slot->number == 0xff)
> > > + if (slot->number == (u16)-1)
> > This "-1" is mentioned in the commit log, but I don't know where it
> > came from. I guess we must assign -1 as a default somewhere? Could
> > this be a #define to connect that assignment with this test?
>
> The -1 is used a placeholder and from what I could tell
> rpaphp_register_slot() would be the only one to use this. Would you prefer
> this to be a #define?
If that's the only place that uses it, I think a #define would help to
match that use with this one.
I hate having to put it in include/linux/pci.h, but it seems like it
would belong next to PCI_SLOT_ALL_DEVICES (which also doesn't need to
be exposed and is only there because it's next to the struct
pci_slot).
© 2016 - 2026 Red Hat, Inc.