[PATCH v2] PCI: Batch BAR sizing operations

Alex Williamson posted 1 patch 2 months, 1 week ago
drivers/pci/iov.c   |  8 +++-
drivers/pci/pci.h   |  4 +-
drivers/pci/probe.c | 93 +++++++++++++++++++++++++++++++++------------
3 files changed, 78 insertions(+), 27 deletions(-)
[PATCH v2] PCI: Batch BAR sizing operations
Posted by Alex Williamson 2 months, 1 week ago
Toggling memory enable is free on bare metal, but potentially expensive
in virtualized environments as the device MMIO spaces are added and
removed from the VM address space, including DMA mapping of those spaces
through the IOMMU where peer-to-peer is supported.  Currently memory
decode is disabled around sizing each individual BAR, even for SR-IOV
BARs while VF Enable is cleared.

This can be better optimized for virtual environments by sizing a set
of BARs at once, stashing the resulting mask into an array, while only
toggling memory enable once.  This also naturally improves the SR-IOV
path as the caller becomes responsible for any necessary decode disables
while sizing BARs, therefore SR-IOV BARs are sized relying only on the
VF Enable rather than toggling the PF memory enable in the command
register.

Reported-by: Mitchell Augustin <mitchell.augustin@canonical.com>
Link: https://lore.kernel.org/all/CAHTA-uYp07FgM6T1OZQKqAdSA5JrZo0ReNEyZgQZub4mDRrV5w@mail.gmail.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

v2:
 - Move PCI_POSSIBLE_ERROR() test back to original location such that it
   only tests the lower half of 64-bit BARs as noted by Ilpo Järvinen.
 - Reduce delta from original code by retaining the local @sz variable
   filled from the @sizes array and keep location of parsing upper half
   of 64-bit BARs.

 drivers/pci/iov.c   |  8 +++-
 drivers/pci/pci.h   |  4 +-
 drivers/pci/probe.c | 93 +++++++++++++++++++++++++++++++++------------
 3 files changed, 78 insertions(+), 27 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 4be402fe9ab9..9e4770cdd4d5 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -747,6 +747,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
 	struct resource *res;
 	const char *res_name;
 	struct pci_dev *pdev;
+	u32 sriovbars[PCI_SRIOV_NUM_BARS];
 
 	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
 	if (ctrl & PCI_SRIOV_CTRL_VFE) {
@@ -783,6 +784,10 @@ static int sriov_init(struct pci_dev *dev, int pos)
 	if (!iov)
 		return -ENOMEM;
 
+	/* Sizing SR-IOV BARs with VF Enable cleared - no decode */
+	__pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS,
+			   pos + PCI_SRIOV_BAR, sriovbars);
+
 	nres = 0;
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &dev->resource[i + PCI_IOV_RESOURCES];
@@ -796,7 +801,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
 			bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
 		else
 			bar64 = __pci_read_base(dev, pci_bar_unknown, res,
-						pos + PCI_SRIOV_BAR + i * 4);
+						pos + PCI_SRIOV_BAR + i * 4,
+						&sriovbars[i]);
 		if (!res->flags)
 			continue;
 		if (resource_size(res) & (PAGE_SIZE - 1)) {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2e40fc63ba31..6f27651c2a70 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -315,8 +315,10 @@ bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
 int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout);
 
 int pci_setup_device(struct pci_dev *dev);
+void __pci_size_stdbars(struct pci_dev *dev, int count,
+			unsigned int pos, u32 *sizes);
 int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
-		    struct resource *res, unsigned int reg);
+		    struct resource *res, unsigned int reg, u32 *sizes);
 void pci_configure_ari(struct pci_dev *dev);
 void __pci_bus_size_bridges(struct pci_bus *bus,
 			struct list_head *realloc_head);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 2e81ab0f5a25..bf6aec555044 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -164,41 +164,67 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
 
 #define PCI_COMMAND_DECODE_ENABLE	(PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
 
+/**
+ * __pci_size_bars - Read the raw BAR mask for a range of PCI BARs
+ * @dev: the PCI device
+ * @count: number of BARs to size
+ * @pos: starting config space position
+ * @sizes: array to store mask values
+ * @rom: indicate whether to use ROM mask, which avoids enabling ROM BARs
+ *
+ * Provided @sizes array must be sufficiently sized to store results for
+ * @count u32 BARs.  Caller is responsible for disabling decode to specified
+ * BAR range around calling this function.  This function is intended to avoid
+ * disabling decode around sizing each BAR individually, which can result in
+ * non-trivial overhead in virtualized environments with very large PCI BARs.
+ */
+static void __pci_size_bars(struct pci_dev *dev, int count,
+			    unsigned int pos, u32 *sizes, bool rom)
+{
+	u32 orig, mask = rom ? PCI_ROM_ADDRESS_MASK : ~0;
+	int i;
+
+	for (i = 0; i < count; i++, pos += 4, sizes++) {
+		pci_read_config_dword(dev, pos, &orig);
+		pci_write_config_dword(dev, pos, mask);
+		pci_read_config_dword(dev, pos, sizes);
+		pci_write_config_dword(dev, pos, orig);
+	}
+}
+
+void __pci_size_stdbars(struct pci_dev *dev, int count,
+			unsigned int pos, u32 *sizes)
+{
+	__pci_size_bars(dev, count, pos, sizes, false);
+}
+
+static void __pci_size_rom(struct pci_dev *dev, unsigned int pos, u32 *sizes)
+{
+	__pci_size_bars(dev, 1, pos, sizes, true);
+}
+
 /**
  * __pci_read_base - Read a PCI BAR
  * @dev: the PCI device
  * @type: type of the BAR
  * @res: resource buffer to be filled in
  * @pos: BAR position in the config space
+ * @sizes: array of one or more pre-read BAR masks
  *
  * Returns 1 if the BAR is 64-bit, or 0 if 32-bit.
  */
 int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
-		    struct resource *res, unsigned int pos)
+		    struct resource *res, unsigned int pos, u32 *sizes)
 {
-	u32 l = 0, sz = 0, mask;
+	u32 l = 0, sz;
 	u64 l64, sz64, mask64;
-	u16 orig_cmd;
 	struct pci_bus_region region, inverted_region;
 	const char *res_name = pci_resource_name(dev, res - dev->resource);
 
-	mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
-
-	/* No printks while decoding is disabled! */
-	if (!dev->mmio_always_on) {
-		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
-		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
-			pci_write_config_word(dev, PCI_COMMAND,
-				orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
-		}
-	}
-
 	res->name = pci_name(dev);
 
 	pci_read_config_dword(dev, pos, &l);
-	pci_write_config_dword(dev, pos, l | mask);
-	pci_read_config_dword(dev, pos, &sz);
-	pci_write_config_dword(dev, pos, l);
+	sz = sizes[0];
 
 	/*
 	 * All bits set in sz means the device isn't working properly.
@@ -238,18 +264,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 
 	if (res->flags & IORESOURCE_MEM_64) {
 		pci_read_config_dword(dev, pos + 4, &l);
-		pci_write_config_dword(dev, pos + 4, ~0);
-		pci_read_config_dword(dev, pos + 4, &sz);
-		pci_write_config_dword(dev, pos + 4, l);
+		sz = sizes[1];
 
 		l64 |= ((u64)l << 32);
 		sz64 |= ((u64)sz << 32);
 		mask64 |= ((u64)~0 << 32);
 	}
 
-	if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
-		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
-
 	if (!sz64)
 		goto fail;
 
@@ -320,7 +341,11 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 
 static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
 {
+	u32 rombar, stdbars[PCI_STD_NUM_BARS];
 	unsigned int pos, reg;
+	u16 orig_cmd;
+
+	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
 
 	if (dev->non_compliant_bars)
 		return;
@@ -329,10 +354,28 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
 	if (dev->is_virtfn)
 		return;
 
+	/* No printks while decoding is disabled! */
+	if (!dev->mmio_always_on) {
+		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
+		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
+			pci_write_config_word(dev, PCI_COMMAND,
+				orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
+		}
+	}
+
+	__pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
+	if (rom)
+		__pci_size_rom(dev, rom, &rombar);
+
+	if (!dev->mmio_always_on &&
+	    (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
+		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
+
 	for (pos = 0; pos < howmany; pos++) {
 		struct resource *res = &dev->resource[pos];
 		reg = PCI_BASE_ADDRESS_0 + (pos << 2);
-		pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
+		pos += __pci_read_base(dev, pci_bar_unknown,
+				       res, reg, &stdbars[pos]);
 	}
 
 	if (rom) {
@@ -340,7 +383,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
 		dev->rom_base_reg = rom;
 		res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
 				IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
-		__pci_read_base(dev, pci_bar_mem32, res, rom);
+		__pci_read_base(dev, pci_bar_mem32, res, rom, &rombar);
 	}
 }
 
-- 
2.47.1

Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Oleg Nesterov 1 month, 3 weeks ago
On 01/20, Alex Williamson wrote:
>
>  static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  {
> +	u32 rombar, stdbars[PCI_STD_NUM_BARS];
>  	unsigned int pos, reg;
> +	u16 orig_cmd;
> +
> +	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);

FYI, I can't build the kernel after this patch:

	$ make drivers/pci/probe.o
	  UPD     include/config/kernel.release
	  UPD     include/generated/utsrelease.h
	  CALL    scripts/checksyscalls.sh
	  DESCEND objtool
	  INSTALL libsubcmd_headers
	  CC      drivers/pci/probe.o
	In file included from <command-line>:0:0:
	drivers/pci/probe.c: In function ‘pci_read_bases’:
	././include/linux/compiler_types.h:542:38: error: call to ‘__compiletime_assert_338’ declared with attribute error: BUILD_BUG_ON failed: howmany > PCI_STD_NUM_BARS
	  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
					      ^
	././include/linux/compiler_types.h:523:4: note: in definition of macro ‘__compiletime_assert’
	    prefix ## suffix();    \
	    ^
	././include/linux/compiler_types.h:542:2: note: in expansion of macro ‘_compiletime_assert’
	  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
	  ^
	./include/linux/build_bug.h:39:37: note: in expansion of macro ‘compiletime_assert’
	 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
					     ^
	./include/linux/build_bug.h:50:2: note: in expansion of macro ‘BUILD_BUG_ON_MSG’
	  BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
	  ^
	drivers/pci/probe.c:348:2: note: in expansion of macro ‘BUILD_BUG_ON’
	  BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);

Yes, my gcc version 5.3.1 is very old, but according to Documentation/process/changes.rst
it should still be supported, the minimal version is 5.1.

Oleg.

Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Alex Williamson 1 month, 3 weeks ago
On Sun, 9 Feb 2025 16:45:12 +0100
Oleg Nesterov <oleg@redhat.com> wrote:

> On 01/20, Alex Williamson wrote:
> >
> >  static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> >  {
> > +	u32 rombar, stdbars[PCI_STD_NUM_BARS];
> >  	unsigned int pos, reg;
> > +	u16 orig_cmd;
> > +
> > +	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);  
> 
> FYI, I can't build the kernel after this patch:
> 
> 	$ make drivers/pci/probe.o
> 	  UPD     include/config/kernel.release
> 	  UPD     include/generated/utsrelease.h
> 	  CALL    scripts/checksyscalls.sh
> 	  DESCEND objtool
> 	  INSTALL libsubcmd_headers
> 	  CC      drivers/pci/probe.o
> 	In file included from <command-line>:0:0:
> 	drivers/pci/probe.c: In function ‘pci_read_bases’:
> 	././include/linux/compiler_types.h:542:38: error: call to ‘__compiletime_assert_338’ declared with attribute error: BUILD_BUG_ON failed: howmany > PCI_STD_NUM_BARS
> 	  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
> 					      ^
> 	././include/linux/compiler_types.h:523:4: note: in definition of macro ‘__compiletime_assert’
> 	    prefix ## suffix();    \
> 	    ^
> 	././include/linux/compiler_types.h:542:2: note: in expansion of macro ‘_compiletime_assert’
> 	  _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
> 	  ^
> 	./include/linux/build_bug.h:39:37: note: in expansion of macro ‘compiletime_assert’
> 	 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
> 					     ^
> 	./include/linux/build_bug.h:50:2: note: in expansion of macro ‘BUILD_BUG_ON_MSG’
> 	  BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
> 	  ^
> 	drivers/pci/probe.c:348:2: note: in expansion of macro ‘BUILD_BUG_ON’
> 	  BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
> 
> Yes, my gcc version 5.3.1 is very old, but according to Documentation/process/changes.rst
> it should still be supported, the minimal version is 5.1.

While I try to setup an environment with an old gcc, can we do something
with __builtin_constant_p() to only evaluate this on versions of gcc
that can determine howmany is constant?  Ex.

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index b6536ed599c3..c70cb480125e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -345,7 +345,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
 	unsigned int pos, reg;
 	u16 orig_cmd;
 
-	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
+	if (__builtin_constant_p(howmany))
+		BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
 
 	if (dev->non_compliant_bars)
 		return;

I welcome other suggestions too.  Thanks,

Alex
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Oleg Nesterov 1 month, 3 weeks ago
On 02/10, Alex Williamson wrote:
>
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -345,7 +345,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  	unsigned int pos, reg;
>  	u16 orig_cmd;
>  
> -	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
> +	if (__builtin_constant_p(howmany))
> +		BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);

Or just

	BUILD_BUG_ON(__builtin_constant_p(howmany) && howmany > PCI_STD_NUM_BARS);

I dunno... Works for me in any case.

I don't know if this is "right" solution though, but I can't suggest
anything better.

Oleg.
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Oleg Nesterov 1 month, 3 weeks ago
On 02/10, Oleg Nesterov wrote:
>
> On 02/10, Alex Williamson wrote:
> >
> > --- a/drivers/pci/probe.c
> > +++ b/drivers/pci/probe.c
> > @@ -345,7 +345,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> >  	unsigned int pos, reg;
> >  	u16 orig_cmd;
> >
> > -	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
> > +	if (__builtin_constant_p(howmany))
> > +		BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
>
> Or just
>
> 	BUILD_BUG_ON(__builtin_constant_p(howmany) && howmany > PCI_STD_NUM_BARS);
>
> I dunno... Works for me in any case.
>
> I don't know if this is "right" solution though,

I mean, atm I simply don't know if with the newer compiler
__builtin_constant_p(howmany) will be true in this case.

Oleg.
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Alex Williamson 1 month, 3 weeks ago
On Mon, 10 Feb 2025 21:08:19 +0100
Oleg Nesterov <oleg@redhat.com> wrote:

> On 02/10, Oleg Nesterov wrote:
> >
> > On 02/10, Alex Williamson wrote:  
> > >
> > > --- a/drivers/pci/probe.c
> > > +++ b/drivers/pci/probe.c
> > > @@ -345,7 +345,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> > >  	unsigned int pos, reg;
> > >  	u16 orig_cmd;
> > >
> > > -	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
> > > +	if (__builtin_constant_p(howmany))
> > > +		BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);  
> >
> > Or just
> >
> > 	BUILD_BUG_ON(__builtin_constant_p(howmany) && howmany > PCI_STD_NUM_BARS);
> >
> > I dunno... Works for me in any case.
> >
> > I don't know if this is "right" solution though,  
> 
> I mean, atm I simply don't know if with the newer compiler
> __builtin_constant_p(howmany) will be true in this case.

That I can confirm.  With gcc 14.2.1, if I change the call in
pci_setup_device() to use 7 rather than 6 for howmany, the build fails.
If you confirm this resolves the build on older gcc, it seems like a
valid fix.  I installed a VM with Fedora 23, which uses gcc 3.5.1, but
I haven't yet been able to successfully build any kernel with it.  I'll
post a patch if you want to provide a Tested-by.  Thanks,

Alex
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by David Laight 1 month, 3 weeks ago
On Mon, 10 Feb 2025 13:48:48 -0700
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Mon, 10 Feb 2025 21:08:19 +0100
> Oleg Nesterov <oleg@redhat.com> wrote:
> 
> > On 02/10, Oleg Nesterov wrote:  
> > >
> > > On 02/10, Alex Williamson wrote:    
> > > >
> > > > --- a/drivers/pci/probe.c
> > > > +++ b/drivers/pci/probe.c
> > > > @@ -345,7 +345,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)

You probably need __always_inline to have any chance of a compile-time
test of howmany.

> > > >  	unsigned int pos, reg;
> > > >  	u16 orig_cmd;
> > > >
> > > > -	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
> > > > +	if (__builtin_constant_p(howmany))
> > > > +		BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);    
> > >
> > > Or just
> > >
> > > 	BUILD_BUG_ON(__builtin_constant_p(howmany) && howmany > PCI_STD_NUM_BARS);

	statically_true(howmany > PCI_STD_NUM_BARS)

> > >
> > > I dunno... Works for me in any case.
> > >
> > > I don't know if this is "right" solution though,    
> > 
> > I mean, atm I simply don't know if with the newer compiler
> > __builtin_constant_p(howmany) will be true in this case.  
> 
> That I can confirm.  With gcc 14.2.1, if I change the call in
> pci_setup_device() to use 7 rather than 6 for howmany, the build fails.
> If you confirm this resolves the build on older gcc, it seems like a
> valid fix.  I installed a VM with Fedora 23, which uses gcc 3.5.1, but
> I haven't yet been able to successfully build any kernel with it.  I'll
> post a patch if you want to provide a Tested-by.  Thanks,

Far too old a version to be supported.

	David

> 
> Alex
> 
>
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Oleg Nesterov 1 month, 3 weeks ago
On 02/10, David Laight wrote:
>
> On Mon, 10 Feb 2025 13:48:48 -0700
> Alex Williamson <alex.williamson@redhat.com> wrote:
>
> > On Mon, 10 Feb 2025 21:08:19 +0100
> > Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > > On 02/10, Oleg Nesterov wrote:
> > > >
> > > > On 02/10, Alex Williamson wrote:
> > > > >
> > > > > --- a/drivers/pci/probe.c
> > > > > +++ b/drivers/pci/probe.c
> > > > > @@ -345,7 +345,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>
> You probably need __always_inline to have any chance of a compile-time
> test of howmany.

Can't really comment...

But note that nobody else reported this problem, so I guess the newer
compilers are smart enough? Not that I am sure we can rely on it...

> > > >
> > > > 	BUILD_BUG_ON(__builtin_constant_p(howmany) && howmany > PCI_STD_NUM_BARS);
>
> 	statically_true(howmany > PCI_STD_NUM_BARS)

Indeed.

Oleg.
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Ilpo Järvinen 2 months, 1 week ago
On Mon, 20 Jan 2025, Alex Williamson wrote:

> Toggling memory enable is free on bare metal, but potentially expensive
> in virtualized environments as the device MMIO spaces are added and
> removed from the VM address space, including DMA mapping of those spaces
> through the IOMMU where peer-to-peer is supported.  Currently memory
> decode is disabled around sizing each individual BAR, even for SR-IOV
> BARs while VF Enable is cleared.
> 
> This can be better optimized for virtual environments by sizing a set
> of BARs at once, stashing the resulting mask into an array, while only
> toggling memory enable once.  This also naturally improves the SR-IOV
> path as the caller becomes responsible for any necessary decode disables
> while sizing BARs, therefore SR-IOV BARs are sized relying only on the
> VF Enable rather than toggling the PF memory enable in the command
> register.
> 
> Reported-by: Mitchell Augustin <mitchell.augustin@canonical.com>
> Link: https://lore.kernel.org/all/CAHTA-uYp07FgM6T1OZQKqAdSA5JrZo0ReNEyZgQZub4mDRrV5w@mail.gmail.com
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> v2:
>  - Move PCI_POSSIBLE_ERROR() test back to original location such that it
>    only tests the lower half of 64-bit BARs as noted by Ilpo Järvinen.
>  - Reduce delta from original code by retaining the local @sz variable
>    filled from the @sizes array and keep location of parsing upper half
>    of 64-bit BARs.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>

--
 i.

>  drivers/pci/iov.c   |  8 +++-
>  drivers/pci/pci.h   |  4 +-
>  drivers/pci/probe.c | 93 +++++++++++++++++++++++++++++++++------------
>  3 files changed, 78 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 4be402fe9ab9..9e4770cdd4d5 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -747,6 +747,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	struct resource *res;
>  	const char *res_name;
>  	struct pci_dev *pdev;
> +	u32 sriovbars[PCI_SRIOV_NUM_BARS];
>  
>  	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
>  	if (ctrl & PCI_SRIOV_CTRL_VFE) {
> @@ -783,6 +784,10 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	if (!iov)
>  		return -ENOMEM;
>  
> +	/* Sizing SR-IOV BARs with VF Enable cleared - no decode */
> +	__pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS,
> +			   pos + PCI_SRIOV_BAR, sriovbars);
> +
>  	nres = 0;
>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>  		res = &dev->resource[i + PCI_IOV_RESOURCES];
> @@ -796,7 +801,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  			bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
>  		else
>  			bar64 = __pci_read_base(dev, pci_bar_unknown, res,
> -						pos + PCI_SRIOV_BAR + i * 4);
> +						pos + PCI_SRIOV_BAR + i * 4,
> +						&sriovbars[i]);
>  		if (!res->flags)
>  			continue;
>  		if (resource_size(res) & (PAGE_SIZE - 1)) {
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 2e40fc63ba31..6f27651c2a70 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -315,8 +315,10 @@ bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
>  int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout);
>  
>  int pci_setup_device(struct pci_dev *dev);
> +void __pci_size_stdbars(struct pci_dev *dev, int count,
> +			unsigned int pos, u32 *sizes);
>  int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> -		    struct resource *res, unsigned int reg);
> +		    struct resource *res, unsigned int reg, u32 *sizes);
>  void pci_configure_ari(struct pci_dev *dev);
>  void __pci_bus_size_bridges(struct pci_bus *bus,
>  			struct list_head *realloc_head);
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 2e81ab0f5a25..bf6aec555044 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -164,41 +164,67 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
>  
>  #define PCI_COMMAND_DECODE_ENABLE	(PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
>  
> +/**
> + * __pci_size_bars - Read the raw BAR mask for a range of PCI BARs
> + * @dev: the PCI device
> + * @count: number of BARs to size
> + * @pos: starting config space position
> + * @sizes: array to store mask values
> + * @rom: indicate whether to use ROM mask, which avoids enabling ROM BARs
> + *
> + * Provided @sizes array must be sufficiently sized to store results for
> + * @count u32 BARs.  Caller is responsible for disabling decode to specified
> + * BAR range around calling this function.  This function is intended to avoid
> + * disabling decode around sizing each BAR individually, which can result in
> + * non-trivial overhead in virtualized environments with very large PCI BARs.
> + */
> +static void __pci_size_bars(struct pci_dev *dev, int count,
> +			    unsigned int pos, u32 *sizes, bool rom)
> +{
> +	u32 orig, mask = rom ? PCI_ROM_ADDRESS_MASK : ~0;
> +	int i;
> +
> +	for (i = 0; i < count; i++, pos += 4, sizes++) {
> +		pci_read_config_dword(dev, pos, &orig);
> +		pci_write_config_dword(dev, pos, mask);
> +		pci_read_config_dword(dev, pos, sizes);
> +		pci_write_config_dword(dev, pos, orig);
> +	}
> +}
> +
> +void __pci_size_stdbars(struct pci_dev *dev, int count,
> +			unsigned int pos, u32 *sizes)
> +{
> +	__pci_size_bars(dev, count, pos, sizes, false);
> +}
> +
> +static void __pci_size_rom(struct pci_dev *dev, unsigned int pos, u32 *sizes)
> +{
> +	__pci_size_bars(dev, 1, pos, sizes, true);
> +}
> +
>  /**
>   * __pci_read_base - Read a PCI BAR
>   * @dev: the PCI device
>   * @type: type of the BAR
>   * @res: resource buffer to be filled in
>   * @pos: BAR position in the config space
> + * @sizes: array of one or more pre-read BAR masks
>   *
>   * Returns 1 if the BAR is 64-bit, or 0 if 32-bit.
>   */
>  int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> -		    struct resource *res, unsigned int pos)
> +		    struct resource *res, unsigned int pos, u32 *sizes)
>  {
> -	u32 l = 0, sz = 0, mask;
> +	u32 l = 0, sz;
>  	u64 l64, sz64, mask64;
> -	u16 orig_cmd;
>  	struct pci_bus_region region, inverted_region;
>  	const char *res_name = pci_resource_name(dev, res - dev->resource);
>  
> -	mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
> -
> -	/* No printks while decoding is disabled! */
> -	if (!dev->mmio_always_on) {
> -		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
> -		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
> -			pci_write_config_word(dev, PCI_COMMAND,
> -				orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
> -		}
> -	}
> -
>  	res->name = pci_name(dev);
>  
>  	pci_read_config_dword(dev, pos, &l);
> -	pci_write_config_dword(dev, pos, l | mask);
> -	pci_read_config_dword(dev, pos, &sz);
> -	pci_write_config_dword(dev, pos, l);
> +	sz = sizes[0];
>  
>  	/*
>  	 * All bits set in sz means the device isn't working properly.
> @@ -238,18 +264,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
>  
>  	if (res->flags & IORESOURCE_MEM_64) {
>  		pci_read_config_dword(dev, pos + 4, &l);
> -		pci_write_config_dword(dev, pos + 4, ~0);
> -		pci_read_config_dword(dev, pos + 4, &sz);
> -		pci_write_config_dword(dev, pos + 4, l);
> +		sz = sizes[1];
>  
>  		l64 |= ((u64)l << 32);
>  		sz64 |= ((u64)sz << 32);
>  		mask64 |= ((u64)~0 << 32);
>  	}
>  
> -	if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
> -		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
> -
>  	if (!sz64)
>  		goto fail;
>  
> @@ -320,7 +341,11 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
>  
>  static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  {
> +	u32 rombar, stdbars[PCI_STD_NUM_BARS];
>  	unsigned int pos, reg;
> +	u16 orig_cmd;
> +
> +	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
>  
>  	if (dev->non_compliant_bars)
>  		return;
> @@ -329,10 +354,28 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  	if (dev->is_virtfn)
>  		return;
>  
> +	/* No printks while decoding is disabled! */
> +	if (!dev->mmio_always_on) {
> +		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
> +		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
> +			pci_write_config_word(dev, PCI_COMMAND,
> +				orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
> +		}
> +	}
> +
> +	__pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
> +	if (rom)
> +		__pci_size_rom(dev, rom, &rombar);
> +
> +	if (!dev->mmio_always_on &&
> +	    (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
> +		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
> +
>  	for (pos = 0; pos < howmany; pos++) {
>  		struct resource *res = &dev->resource[pos];
>  		reg = PCI_BASE_ADDRESS_0 + (pos << 2);
> -		pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
> +		pos += __pci_read_base(dev, pci_bar_unknown,
> +				       res, reg, &stdbars[pos]);
>  	}
>  
>  	if (rom) {
> @@ -340,7 +383,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  		dev->rom_base_reg = rom;
>  		res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
>  				IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
> -		__pci_read_base(dev, pci_bar_mem32, res, rom);
> +		__pci_read_base(dev, pci_bar_mem32, res, rom, &rombar);
>  	}
>  }
>  
> 
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Bjorn Helgaas 2 months, 1 week ago
On Mon, Jan 20, 2025 at 11:21:59AM -0700, Alex Williamson wrote:
> Toggling memory enable is free on bare metal, but potentially expensive
> in virtualized environments as the device MMIO spaces are added and
> removed from the VM address space, including DMA mapping of those spaces
> through the IOMMU where peer-to-peer is supported.  Currently memory
> decode is disabled around sizing each individual BAR, even for SR-IOV
> BARs while VF Enable is cleared.
> 
> This can be better optimized for virtual environments by sizing a set
> of BARs at once, stashing the resulting mask into an array, while only
> toggling memory enable once.  This also naturally improves the SR-IOV
> path as the caller becomes responsible for any necessary decode disables
> while sizing BARs, therefore SR-IOV BARs are sized relying only on the
> VF Enable rather than toggling the PF memory enable in the command
> register.
> 
> Reported-by: Mitchell Augustin <mitchell.augustin@canonical.com>
> Link: https://lore.kernel.org/all/CAHTA-uYp07FgM6T1OZQKqAdSA5JrZo0ReNEyZgQZub4mDRrV5w@mail.gmail.com
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

Updated pci/enumeration with this v2, thanks, Alex!

> ---
> 
> v2:
>  - Move PCI_POSSIBLE_ERROR() test back to original location such that it
>    only tests the lower half of 64-bit BARs as noted by Ilpo Järvinen.
>  - Reduce delta from original code by retaining the local @sz variable
>    filled from the @sizes array and keep location of parsing upper half
>    of 64-bit BARs.
> 
>  drivers/pci/iov.c   |  8 +++-
>  drivers/pci/pci.h   |  4 +-
>  drivers/pci/probe.c | 93 +++++++++++++++++++++++++++++++++------------
>  3 files changed, 78 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 4be402fe9ab9..9e4770cdd4d5 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -747,6 +747,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	struct resource *res;
>  	const char *res_name;
>  	struct pci_dev *pdev;
> +	u32 sriovbars[PCI_SRIOV_NUM_BARS];
>  
>  	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
>  	if (ctrl & PCI_SRIOV_CTRL_VFE) {
> @@ -783,6 +784,10 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	if (!iov)
>  		return -ENOMEM;
>  
> +	/* Sizing SR-IOV BARs with VF Enable cleared - no decode */
> +	__pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS,
> +			   pos + PCI_SRIOV_BAR, sriovbars);
> +
>  	nres = 0;
>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>  		res = &dev->resource[i + PCI_IOV_RESOURCES];
> @@ -796,7 +801,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  			bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
>  		else
>  			bar64 = __pci_read_base(dev, pci_bar_unknown, res,
> -						pos + PCI_SRIOV_BAR + i * 4);
> +						pos + PCI_SRIOV_BAR + i * 4,
> +						&sriovbars[i]);
>  		if (!res->flags)
>  			continue;
>  		if (resource_size(res) & (PAGE_SIZE - 1)) {
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 2e40fc63ba31..6f27651c2a70 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -315,8 +315,10 @@ bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
>  int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout);
>  
>  int pci_setup_device(struct pci_dev *dev);
> +void __pci_size_stdbars(struct pci_dev *dev, int count,
> +			unsigned int pos, u32 *sizes);
>  int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> -		    struct resource *res, unsigned int reg);
> +		    struct resource *res, unsigned int reg, u32 *sizes);
>  void pci_configure_ari(struct pci_dev *dev);
>  void __pci_bus_size_bridges(struct pci_bus *bus,
>  			struct list_head *realloc_head);
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 2e81ab0f5a25..bf6aec555044 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -164,41 +164,67 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
>  
>  #define PCI_COMMAND_DECODE_ENABLE	(PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
>  
> +/**
> + * __pci_size_bars - Read the raw BAR mask for a range of PCI BARs
> + * @dev: the PCI device
> + * @count: number of BARs to size
> + * @pos: starting config space position
> + * @sizes: array to store mask values
> + * @rom: indicate whether to use ROM mask, which avoids enabling ROM BARs
> + *
> + * Provided @sizes array must be sufficiently sized to store results for
> + * @count u32 BARs.  Caller is responsible for disabling decode to specified
> + * BAR range around calling this function.  This function is intended to avoid
> + * disabling decode around sizing each BAR individually, which can result in
> + * non-trivial overhead in virtualized environments with very large PCI BARs.
> + */
> +static void __pci_size_bars(struct pci_dev *dev, int count,
> +			    unsigned int pos, u32 *sizes, bool rom)
> +{
> +	u32 orig, mask = rom ? PCI_ROM_ADDRESS_MASK : ~0;
> +	int i;
> +
> +	for (i = 0; i < count; i++, pos += 4, sizes++) {
> +		pci_read_config_dword(dev, pos, &orig);
> +		pci_write_config_dword(dev, pos, mask);
> +		pci_read_config_dword(dev, pos, sizes);
> +		pci_write_config_dword(dev, pos, orig);
> +	}
> +}
> +
> +void __pci_size_stdbars(struct pci_dev *dev, int count,
> +			unsigned int pos, u32 *sizes)
> +{
> +	__pci_size_bars(dev, count, pos, sizes, false);
> +}
> +
> +static void __pci_size_rom(struct pci_dev *dev, unsigned int pos, u32 *sizes)
> +{
> +	__pci_size_bars(dev, 1, pos, sizes, true);
> +}
> +
>  /**
>   * __pci_read_base - Read a PCI BAR
>   * @dev: the PCI device
>   * @type: type of the BAR
>   * @res: resource buffer to be filled in
>   * @pos: BAR position in the config space
> + * @sizes: array of one or more pre-read BAR masks
>   *
>   * Returns 1 if the BAR is 64-bit, or 0 if 32-bit.
>   */
>  int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> -		    struct resource *res, unsigned int pos)
> +		    struct resource *res, unsigned int pos, u32 *sizes)
>  {
> -	u32 l = 0, sz = 0, mask;
> +	u32 l = 0, sz;
>  	u64 l64, sz64, mask64;
> -	u16 orig_cmd;
>  	struct pci_bus_region region, inverted_region;
>  	const char *res_name = pci_resource_name(dev, res - dev->resource);
>  
> -	mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
> -
> -	/* No printks while decoding is disabled! */
> -	if (!dev->mmio_always_on) {
> -		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
> -		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
> -			pci_write_config_word(dev, PCI_COMMAND,
> -				orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
> -		}
> -	}
> -
>  	res->name = pci_name(dev);
>  
>  	pci_read_config_dword(dev, pos, &l);
> -	pci_write_config_dword(dev, pos, l | mask);
> -	pci_read_config_dword(dev, pos, &sz);
> -	pci_write_config_dword(dev, pos, l);
> +	sz = sizes[0];
>  
>  	/*
>  	 * All bits set in sz means the device isn't working properly.
> @@ -238,18 +264,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
>  
>  	if (res->flags & IORESOURCE_MEM_64) {
>  		pci_read_config_dword(dev, pos + 4, &l);
> -		pci_write_config_dword(dev, pos + 4, ~0);
> -		pci_read_config_dword(dev, pos + 4, &sz);
> -		pci_write_config_dword(dev, pos + 4, l);
> +		sz = sizes[1];
>  
>  		l64 |= ((u64)l << 32);
>  		sz64 |= ((u64)sz << 32);
>  		mask64 |= ((u64)~0 << 32);
>  	}
>  
> -	if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
> -		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
> -
>  	if (!sz64)
>  		goto fail;
>  
> @@ -320,7 +341,11 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
>  
>  static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  {
> +	u32 rombar, stdbars[PCI_STD_NUM_BARS];
>  	unsigned int pos, reg;
> +	u16 orig_cmd;
> +
> +	BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
>  
>  	if (dev->non_compliant_bars)
>  		return;
> @@ -329,10 +354,28 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  	if (dev->is_virtfn)
>  		return;
>  
> +	/* No printks while decoding is disabled! */
> +	if (!dev->mmio_always_on) {
> +		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
> +		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
> +			pci_write_config_word(dev, PCI_COMMAND,
> +				orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
> +		}
> +	}
> +
> +	__pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
> +	if (rom)
> +		__pci_size_rom(dev, rom, &rombar);
> +
> +	if (!dev->mmio_always_on &&
> +	    (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
> +		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
> +
>  	for (pos = 0; pos < howmany; pos++) {
>  		struct resource *res = &dev->resource[pos];
>  		reg = PCI_BASE_ADDRESS_0 + (pos << 2);
> -		pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
> +		pos += __pci_read_base(dev, pci_bar_unknown,
> +				       res, reg, &stdbars[pos]);
>  	}
>  
>  	if (rom) {
> @@ -340,7 +383,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
>  		dev->rom_base_reg = rom;
>  		res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
>  				IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
> -		__pci_read_base(dev, pci_bar_mem32, res, rom);
> +		__pci_read_base(dev, pci_bar_mem32, res, rom, &rombar);
>  	}
>  }
>  
> -- 
> 2.47.1
> 
Re: [PATCH v2] PCI: Batch BAR sizing operations
Posted by Mitchell Augustin 2 months, 1 week ago
Thanks Alex,

I just verified that v2 of the patch causes no regressions and
performs just as well as v1 and my original patch on my hardware.

Reviewed-by: "Mitchell Augustin" <mitchell.augustin@canonical.com>
Tested-by: "Mitchell Augustin" <mitchell.augustin@canonical.com>


On Mon, Jan 20, 2025 at 4:44 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Mon, Jan 20, 2025 at 11:21:59AM -0700, Alex Williamson wrote:
> > Toggling memory enable is free on bare metal, but potentially expensive
> > in virtualized environments as the device MMIO spaces are added and
> > removed from the VM address space, including DMA mapping of those spaces
> > through the IOMMU where peer-to-peer is supported.  Currently memory
> > decode is disabled around sizing each individual BAR, even for SR-IOV
> > BARs while VF Enable is cleared.
> >
> > This can be better optimized for virtual environments by sizing a set
> > of BARs at once, stashing the resulting mask into an array, while only
> > toggling memory enable once.  This also naturally improves the SR-IOV
> > path as the caller becomes responsible for any necessary decode disables
> > while sizing BARs, therefore SR-IOV BARs are sized relying only on the
> > VF Enable rather than toggling the PF memory enable in the command
> > register.
> >
> > Reported-by: Mitchell Augustin <mitchell.augustin@canonical.com>
> > Link: https://lore.kernel.org/all/CAHTA-uYp07FgM6T1OZQKqAdSA5JrZo0ReNEyZgQZub4mDRrV5w@mail.gmail.com
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
>
> Updated pci/enumeration with this v2, thanks, Alex!
>
> > ---
> >
> > v2:
> >  - Move PCI_POSSIBLE_ERROR() test back to original location such that it
> >    only tests the lower half of 64-bit BARs as noted by Ilpo Järvinen.
> >  - Reduce delta from original code by retaining the local @sz variable
> >    filled from the @sizes array and keep location of parsing upper half
> >    of 64-bit BARs.
> >
> >  drivers/pci/iov.c   |  8 +++-
> >  drivers/pci/pci.h   |  4 +-
> >  drivers/pci/probe.c | 93 +++++++++++++++++++++++++++++++++------------
> >  3 files changed, 78 insertions(+), 27 deletions(-)
> >
> > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> > index 4be402fe9ab9..9e4770cdd4d5 100644
> > --- a/drivers/pci/iov.c
> > +++ b/drivers/pci/iov.c
> > @@ -747,6 +747,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >       struct resource *res;
> >       const char *res_name;
> >       struct pci_dev *pdev;
> > +     u32 sriovbars[PCI_SRIOV_NUM_BARS];
> >
> >       pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
> >       if (ctrl & PCI_SRIOV_CTRL_VFE) {
> > @@ -783,6 +784,10 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >       if (!iov)
> >               return -ENOMEM;
> >
> > +     /* Sizing SR-IOV BARs with VF Enable cleared - no decode */
> > +     __pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS,
> > +                        pos + PCI_SRIOV_BAR, sriovbars);
> > +
> >       nres = 0;
> >       for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> >               res = &dev->resource[i + PCI_IOV_RESOURCES];
> > @@ -796,7 +801,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
> >                       bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
> >               else
> >                       bar64 = __pci_read_base(dev, pci_bar_unknown, res,
> > -                                             pos + PCI_SRIOV_BAR + i * 4);
> > +                                             pos + PCI_SRIOV_BAR + i * 4,
> > +                                             &sriovbars[i]);
> >               if (!res->flags)
> >                       continue;
> >               if (resource_size(res) & (PAGE_SIZE - 1)) {
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index 2e40fc63ba31..6f27651c2a70 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -315,8 +315,10 @@ bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl,
> >  int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout);
> >
> >  int pci_setup_device(struct pci_dev *dev);
> > +void __pci_size_stdbars(struct pci_dev *dev, int count,
> > +                     unsigned int pos, u32 *sizes);
> >  int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> > -                 struct resource *res, unsigned int reg);
> > +                 struct resource *res, unsigned int reg, u32 *sizes);
> >  void pci_configure_ari(struct pci_dev *dev);
> >  void __pci_bus_size_bridges(struct pci_bus *bus,
> >                       struct list_head *realloc_head);
> > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> > index 2e81ab0f5a25..bf6aec555044 100644
> > --- a/drivers/pci/probe.c
> > +++ b/drivers/pci/probe.c
> > @@ -164,41 +164,67 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
> >
> >  #define PCI_COMMAND_DECODE_ENABLE    (PCI_COMMAND_MEMORY | PCI_COMMAND_IO)
> >
> > +/**
> > + * __pci_size_bars - Read the raw BAR mask for a range of PCI BARs
> > + * @dev: the PCI device
> > + * @count: number of BARs to size
> > + * @pos: starting config space position
> > + * @sizes: array to store mask values
> > + * @rom: indicate whether to use ROM mask, which avoids enabling ROM BARs
> > + *
> > + * Provided @sizes array must be sufficiently sized to store results for
> > + * @count u32 BARs.  Caller is responsible for disabling decode to specified
> > + * BAR range around calling this function.  This function is intended to avoid
> > + * disabling decode around sizing each BAR individually, which can result in
> > + * non-trivial overhead in virtualized environments with very large PCI BARs.
> > + */
> > +static void __pci_size_bars(struct pci_dev *dev, int count,
> > +                         unsigned int pos, u32 *sizes, bool rom)
> > +{
> > +     u32 orig, mask = rom ? PCI_ROM_ADDRESS_MASK : ~0;
> > +     int i;
> > +
> > +     for (i = 0; i < count; i++, pos += 4, sizes++) {
> > +             pci_read_config_dword(dev, pos, &orig);
> > +             pci_write_config_dword(dev, pos, mask);
> > +             pci_read_config_dword(dev, pos, sizes);
> > +             pci_write_config_dword(dev, pos, orig);
> > +     }
> > +}
> > +
> > +void __pci_size_stdbars(struct pci_dev *dev, int count,
> > +                     unsigned int pos, u32 *sizes)
> > +{
> > +     __pci_size_bars(dev, count, pos, sizes, false);
> > +}
> > +
> > +static void __pci_size_rom(struct pci_dev *dev, unsigned int pos, u32 *sizes)
> > +{
> > +     __pci_size_bars(dev, 1, pos, sizes, true);
> > +}
> > +
> >  /**
> >   * __pci_read_base - Read a PCI BAR
> >   * @dev: the PCI device
> >   * @type: type of the BAR
> >   * @res: resource buffer to be filled in
> >   * @pos: BAR position in the config space
> > + * @sizes: array of one or more pre-read BAR masks
> >   *
> >   * Returns 1 if the BAR is 64-bit, or 0 if 32-bit.
> >   */
> >  int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> > -                 struct resource *res, unsigned int pos)
> > +                 struct resource *res, unsigned int pos, u32 *sizes)
> >  {
> > -     u32 l = 0, sz = 0, mask;
> > +     u32 l = 0, sz;
> >       u64 l64, sz64, mask64;
> > -     u16 orig_cmd;
> >       struct pci_bus_region region, inverted_region;
> >       const char *res_name = pci_resource_name(dev, res - dev->resource);
> >
> > -     mask = type ? PCI_ROM_ADDRESS_MASK : ~0;
> > -
> > -     /* No printks while decoding is disabled! */
> > -     if (!dev->mmio_always_on) {
> > -             pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
> > -             if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
> > -                     pci_write_config_word(dev, PCI_COMMAND,
> > -                             orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
> > -             }
> > -     }
> > -
> >       res->name = pci_name(dev);
> >
> >       pci_read_config_dword(dev, pos, &l);
> > -     pci_write_config_dword(dev, pos, l | mask);
> > -     pci_read_config_dword(dev, pos, &sz);
> > -     pci_write_config_dword(dev, pos, l);
> > +     sz = sizes[0];
> >
> >       /*
> >        * All bits set in sz means the device isn't working properly.
> > @@ -238,18 +264,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> >
> >       if (res->flags & IORESOURCE_MEM_64) {
> >               pci_read_config_dword(dev, pos + 4, &l);
> > -             pci_write_config_dword(dev, pos + 4, ~0);
> > -             pci_read_config_dword(dev, pos + 4, &sz);
> > -             pci_write_config_dword(dev, pos + 4, l);
> > +             sz = sizes[1];
> >
> >               l64 |= ((u64)l << 32);
> >               sz64 |= ((u64)sz << 32);
> >               mask64 |= ((u64)~0 << 32);
> >       }
> >
> > -     if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
> > -             pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
> > -
> >       if (!sz64)
> >               goto fail;
> >
> > @@ -320,7 +341,11 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> >
> >  static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> >  {
> > +     u32 rombar, stdbars[PCI_STD_NUM_BARS];
> >       unsigned int pos, reg;
> > +     u16 orig_cmd;
> > +
> > +     BUILD_BUG_ON(howmany > PCI_STD_NUM_BARS);
> >
> >       if (dev->non_compliant_bars)
> >               return;
> > @@ -329,10 +354,28 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> >       if (dev->is_virtfn)
> >               return;
> >
> > +     /* No printks while decoding is disabled! */
> > +     if (!dev->mmio_always_on) {
> > +             pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
> > +             if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
> > +                     pci_write_config_word(dev, PCI_COMMAND,
> > +                             orig_cmd & ~PCI_COMMAND_DECODE_ENABLE);
> > +             }
> > +     }
> > +
> > +     __pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
> > +     if (rom)
> > +             __pci_size_rom(dev, rom, &rombar);
> > +
> > +     if (!dev->mmio_always_on &&
> > +         (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
> > +             pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
> > +
> >       for (pos = 0; pos < howmany; pos++) {
> >               struct resource *res = &dev->resource[pos];
> >               reg = PCI_BASE_ADDRESS_0 + (pos << 2);
> > -             pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
> > +             pos += __pci_read_base(dev, pci_bar_unknown,
> > +                                    res, reg, &stdbars[pos]);
> >       }
> >
> >       if (rom) {
> > @@ -340,7 +383,7 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> >               dev->rom_base_reg = rom;
> >               res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH |
> >                               IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
> > -             __pci_read_base(dev, pci_bar_mem32, res, rom);
> > +             __pci_read_base(dev, pci_bar_mem32, res, rom, &rombar);
> >       }
> >  }
> >
> > --
> > 2.47.1
> >



-- 
Mitchell Augustin
Software Engineer - Ubuntu Partner Engineering