[PATCH v11 1/9] PCI: Allow per function PCI slots

Farhan Ali posted 9 patches 3 weeks ago
There is a newer version of this series
[PATCH v11 1/9] PCI: Allow per function PCI slots
Posted by Farhan Ali 3 weeks ago
On s390 systems, which use a machine level hypervisor, PCI devices are
always accessed through a form of PCI pass-through which fundamentally
operates on a per PCI function granularity. This is also reflected in the
s390 PCI hotplug driver which creates hotplug slots for individual PCI
functions. Its reset_slot() function, which is a wrapper for
zpci_hot_reset_device(), thus also resets individual functions.

Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object
to multifunction devices. This approach worked fine on s390 systems that
only exposed virtual functions as individual PCI domains to the operating
system.  Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions")
s390 supports exposing the topology of multifunction PCI devices by
grouping them in a shared PCI domain. When attempting to reset a function
through the hotplug driver, the shared slot assignment causes the wrong
function to be reset instead of the intended one. It also leaks memory as
we do create a pci_slot object for the function, but don't correctly free
it in pci_slot_release().

Add a flag for struct pci_slot to allow per function PCI slots for
functions managed through a hypervisor, which exposes individual PCI
functions while retaining the topology. Since we can use all 8 bits
for slot 'number' (for ARI devices), change slot 'number' to u16 to
account for special values -1 and PCI_SLOT_ALL_DEVICES.

Fixes: 44510d6fa0c0 ("s390/pci: Handling multifunctions")
Cc: stable@vger.kernel.org
Suggested-by: Niklas Schnelle <schnelle@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Farhan Ali <alifm@linux.ibm.com>
---
 drivers/pci/pci.c   |  5 +++--
 drivers/pci/slot.c  | 31 ++++++++++++++++++++++++-------
 include/linux/pci.h |  5 +++--
 3 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 115e5c11bab3..a93084053537 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4867,8 +4867,9 @@ static int pci_reset_hotplug_slot(struct hotplug_slot *hotplug, bool probe)
 
 static int pci_dev_reset_slot_function(struct pci_dev *dev, bool probe)
 {
-	if (dev->multifunction || dev->subordinate || !dev->slot ||
-	    dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET)
+	if (dev->subordinate || !dev->slot ||
+	    dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET ||
+	    (dev->multifunction && !dev->slot->per_func_slot))
 		return -ENOTTY;
 
 	return pci_reset_hotplug_slot(dev->slot->hotplug, probe);
diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c
index e0b7fb43423c..8842fa069392 100644
--- a/drivers/pci/slot.c
+++ b/drivers/pci/slot.c
@@ -37,7 +37,7 @@ static const struct sysfs_ops pci_slot_sysfs_ops = {
 
 static ssize_t address_read_file(struct pci_slot *slot, char *buf)
 {
-	if (slot->number == 0xff)
+	if (slot->number == (u16)-1)
 		return sysfs_emit(buf, "%04x:%02x\n",
 				  pci_domain_nr(slot->bus),
 				  slot->bus->number);
@@ -72,6 +72,23 @@ static ssize_t cur_speed_read_file(struct pci_slot *slot, char *buf)
 	return bus_speed_read(slot->bus->cur_bus_speed, buf);
 }
 
+static bool pci_dev_matches_slot(struct pci_dev *dev, struct pci_slot *slot)
+{
+	if (slot->per_func_slot)
+		return dev->devfn == slot->number;
+
+	return slot->number == PCI_SLOT_ALL_DEVICES ||
+		PCI_SLOT(dev->devfn) == slot->number;
+}
+
+static bool pci_slot_enabled_per_func(void)
+{
+	if (IS_ENABLED(CONFIG_S390))
+		return true;
+
+	return false;
+}
+
 static void pci_slot_release(struct kobject *kobj)
 {
 	struct pci_dev *dev;
@@ -82,8 +99,7 @@ static void pci_slot_release(struct kobject *kobj)
 
 	down_read(&pci_bus_sem);
 	list_for_each_entry(dev, &slot->bus->devices, bus_list)
-		if (slot->number == PCI_SLOT_ALL_DEVICES ||
-		    PCI_SLOT(dev->devfn) == slot->number)
+		if (pci_dev_matches_slot(dev, slot))
 			dev->slot = NULL;
 	up_read(&pci_bus_sem);
 
@@ -176,8 +192,7 @@ void pci_dev_assign_slot(struct pci_dev *dev)
 
 	mutex_lock(&pci_slot_mutex);
 	list_for_each_entry(slot, &dev->bus->slots, list)
-		if (slot->number == PCI_SLOT_ALL_DEVICES ||
-		    PCI_SLOT(dev->devfn) == slot->number)
+		if (pci_dev_matches_slot(dev, slot))
 			dev->slot = slot;
 	mutex_unlock(&pci_slot_mutex);
 }
@@ -287,6 +302,9 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
 	slot->bus = pci_bus_get(parent);
 	slot->number = slot_nr;
 
+	if (pci_slot_enabled_per_func())
+		slot->per_func_slot = 1;
+
 	slot->kobj.kset = pci_slots_kset;
 
 	slot_name = make_slot_name(name);
@@ -307,8 +325,7 @@ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr,
 
 	down_read(&pci_bus_sem);
 	list_for_each_entry(dev, &parent->devices, bus_list)
-		if (slot_nr == PCI_SLOT_ALL_DEVICES ||
-		    PCI_SLOT(dev->devfn) == slot_nr)
+		if (pci_dev_matches_slot(dev, slot))
 			dev->slot = slot;
 	up_read(&pci_bus_sem);
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8861eeb4381d..50a84bba3c91 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -78,14 +78,15 @@
  * and, if ARI Forwarding is enabled, functions may appear to be on multiple
  * devices.
  */
-#define PCI_SLOT_ALL_DEVICES	0xfe
+#define PCI_SLOT_ALL_DEVICES	0xfeff
 
 /* pci_slot represents a physical slot */
 struct pci_slot {
 	struct pci_bus		*bus;		/* Bus this slot is on */
 	struct list_head	list;		/* Node in list of slots */
 	struct hotplug_slot	*hotplug;	/* Hotplug info (move here) */
-	unsigned char		number;		/* Device nr, or PCI_SLOT_ALL_DEVICES */
+	u16			number;		/* Device nr, or PCI_SLOT_ALL_DEVICES */
+	unsigned int		per_func_slot:1; /* Allow per function slot */
 	struct kobject		kobj;
 };
 
-- 
2.43.0
Re: [PATCH v11 1/9] PCI: Allow per function PCI slots
Posted by Bjorn Helgaas 1 week, 6 days ago
On Mon, Mar 16, 2026 at 12:15:36PM -0700, Farhan Ali wrote:
> On s390 systems, which use a machine level hypervisor, PCI devices are
> always accessed through a form of PCI pass-through which fundamentally
> operates on a per PCI function granularity. This is also reflected in the
> s390 PCI hotplug driver which creates hotplug slots for individual PCI
> functions. Its reset_slot() function, which is a wrapper for
> zpci_hot_reset_device(), thus also resets individual functions.

I think this "pass-through" is from the hypervisor to Linux, i.e.,
what we think of as the host kernel, right?

> Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object
> to multifunction devices. This approach worked fine on s390 systems that
> only exposed virtual functions as individual PCI domains to the operating
> system.  Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions")
> s390 supports exposing the topology of multifunction PCI devices by
> grouping them in a shared PCI domain. When attempting to reset a function
> through the hotplug driver, the shared slot assignment causes the wrong
> function to be reset instead of the intended one. It also leaks memory as
> we do create a pci_slot object for the function, but don't correctly free
> it in pci_slot_release().

This alludes to the patch fixing a reset issue, but I think it should
be more prominent, e.g., the reset and leak fixes could be a separate
paragraph.  The subject line should also mention at least the reset
fix.

> Add a flag for struct pci_slot to allow per function PCI slots for
> functions managed through a hypervisor, which exposes individual PCI
> functions while retaining the topology. Since we can use all 8 bits
> for slot 'number' (for ARI devices), change slot 'number' u16 to
> account for special values -1 and PCI_SLOT_ALL_DEVICES.
> ...

>  static ssize_t address_read_file(struct pci_slot *slot, char *buf)
>  {
> -	if (slot->number == 0xff)
> +	if (slot->number == (u16)-1)

This "-1" is mentioned in the commit log, but I don't know where it
came from.  I guess we must assign -1 as a default somewhere?  Could
this be a #define to connect that assignment with this test?

>  		return sysfs_emit(buf, "%04x:%02x\n",
>  				  pci_domain_nr(slot->bus),
>  				  slot->bus->number);
Re: [PATCH v11 1/9] PCI: Allow per function PCI slots
Posted by Farhan Ali 1 week, 6 days ago
On 3/24/2026 2:55 PM, Bjorn Helgaas wrote:
> On Mon, Mar 16, 2026 at 12:15:36PM -0700, Farhan Ali wrote:
>> On s390 systems, which use a machine level hypervisor, PCI devices are
>> always accessed through a form of PCI pass-through which fundamentally
>> operates on a per PCI function granularity. This is also reflected in the
>> s390 PCI hotplug driver which creates hotplug slots for individual PCI
>> functions. Its reset_slot() function, which is a wrapper for
>> zpci_hot_reset_device(), thus also resets individual functions.
> I think this "pass-through" is from the hypervisor to Linux, i.e.,
> what we think of as the host kernel, right?

Yes, on s390x we have the PR/SM hypervisor, which would do this passthrough
to Linux. Linux would be running in an LPAR (Logical Partition) created 
by the PR/SM hypervisor. For end users an LPAR is the 'host' for all 
practical purposes.


>
>> Currently, the kernel's PCI_SLOT() macro assigns the same pci_slot object
>> to multifunction devices. This approach worked fine on s390 systems that
>> only exposed virtual functions as individual PCI domains to the operating
>> system.  Since commit 44510d6fa0c0 ("s390/pci: Handling multifunctions")
>> s390 supports exposing the topology of multifunction PCI devices by
>> grouping them in a shared PCI domain. When attempting to reset a function
>> through the hotplug driver, the shared slot assignment causes the wrong
>> function to be reset instead of the intended one. It also leaks memory as
>> we do create a pci_slot object for the function, but don't correctly free
>> it in pci_slot_release().
> This alludes to the patch fixing a reset issue, but I think it should
> be more prominent, e.g., the reset and leak fixes could be a separate
> paragraph.  The subject line should also mention at least the reset
> fix.

Will fix this. I will move what we fix into a separate paragraph.


>
>> Add a flag for struct pci_slot to allow per function PCI slots for
>> functions managed through a hypervisor, which exposes individual PCI
>> functions while retaining the topology. Since we can use all 8 bits
>> for slot 'number' (for ARI devices), change slot 'number' u16 to
>> account for special values -1 and PCI_SLOT_ALL_DEVICES.
>> ...
>>   static ssize_t address_read_file(struct pci_slot *slot, char *buf)
>>   {
>> -	if (slot->number == 0xff)
>> +	if (slot->number == (u16)-1)
> This "-1" is mentioned in the commit log, but I don't know where it
> came from.  I guess we must assign -1 as a default somewhere?  Could
> this be a #define to connect that assignment with this test?

The -1 is used as a placeholder, and from what I could tell 
rpaphp_register_slot() would be the only one to use this. Would you 
prefer this to be a #define?

Thanks

Farhan

>>   		return sysfs_emit(buf, "%04x:%02x\n",
>>   				  pci_domain_nr(slot->bus),
>>   				  slot->bus->number);
Re: [PATCH v11 1/9] PCI: Allow per function PCI slots
Posted by Bjorn Helgaas 1 week, 6 days ago
On Tue, Mar 24, 2026 at 04:08:28PM -0700, Farhan Ali wrote:
> On 3/24/2026 2:55 PM, Bjorn Helgaas wrote:
> > On Mon, Mar 16, 2026 at 12:15:36PM -0700, Farhan Ali wrote:
> > > On s390 systems, which use a machine level hypervisor, PCI devices are
> > > always accessed through a form of PCI pass-through which fundamentally
> > > operates on a per PCI function granularity. This is also reflected in the
> > > s390 PCI hotplug driver which creates hotplug slots for individual PCI
> > > functions. Its reset_slot() function, which is a wrapper for
> > > zpci_hot_reset_device(), thus also resets individual functions.
> ...

> > >   static ssize_t address_read_file(struct pci_slot *slot, char *buf)
> > >   {
> > > -	if (slot->number == 0xff)
> > > +	if (slot->number == (u16)-1)
> > This "-1" is mentioned in the commit log, but I don't know where it
> > came from.  I guess we must assign -1 as a default somewhere?  Could
> > this be a #define to connect that assignment with this test?
> 
> The -1 is used a placeholder and from what I could tell
> rpaphp_register_slot() would be the only one to use this. Would you prefer
> this to be a #define?

If that's the only place that uses it, I think a #define would help to
match that use with this one.

I hate having to put it in include/linux/pci.h, but it seems like it
would belong next to PCI_SLOT_ALL_DEVICES (which also doesn't need to
be exposed and is only there because it's next to the struct
pci_slot).