From: Stewart Hildebrand <stewart.hildebrand@amd.com>
This code is expected to only be used by privileged domains;
unprivileged domains should not get access to the SR-IOV capability.
Implement RW handlers for the PCI_SRIOV_CTRL register to dynamically
map/unmap VF BARs. Recalculate BAR sizes before mapping VFs to account
for possible changes in the system page size register. Also force VFs to
always use emulated reads for the command register; this is needed to
prevent some drivers from accidentally unmapping BARs. Discovery of VFs
is done by Dom0, which must register them with Xen.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com>
---
v2->v3:
* rework adding VFs with multiple mapping operations support
* use private.h
* style fixes
* fixup cleanup_sriov
* emulate command register read for VFs
v1->v2:
* switch to VF discovery by Xen
* fix sequential vpci_modify_bars calls
* move documentation changes to a separate commit
---
xen/drivers/vpci/Makefile | 1 +
xen/drivers/vpci/header.c | 6 +-
xen/drivers/vpci/private.h | 1 +
xen/drivers/vpci/sriov.c | 314 +++++++++++++++++++++++++++++++++++++
xen/include/xen/vpci.h | 7 +-
5 files changed, 327 insertions(+), 2 deletions(-)
create mode 100644 xen/drivers/vpci/sriov.c
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 9793a4f9b0..8b0e3c03a7 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1,3 +1,4 @@
obj-y += cap.o
obj-y += vpci.o header.o rebar.o
+obj-y += sriov.o
obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 451cdd3a6f..a36285672e 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -869,7 +869,8 @@ int vpci_init_header(struct pci_dev *pdev)
* PCI_COMMAND_PARITY, PCI_COMMAND_SERR, and PCI_COMMAND_FAST_BACK.
*/
rc = vpci_add_register_mask(pdev->vpci,
- is_hwdom ? vpci_hw_read16 : guest_cmd_read,
+ is_hwdom && !pdev->info.is_virtfn
+ ? vpci_hw_read16 : guest_cmd_read,
cmd_write, PCI_COMMAND, 2, header, 0, 0,
is_hwdom ? 0
: PCI_COMMAND_RSVDP_MASK |
@@ -900,6 +901,9 @@ int vpci_init_header(struct pci_dev *pdev)
header->guest_cmd = cmd;
+ if ( pdev->info.is_virtfn )
+ return vpci_vf_init_header(pdev);
+
/* Disable memory decoding before sizing. */
if ( !is_hwdom || (cmd & PCI_COMMAND_MEMORY) )
pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY);
diff --git a/xen/drivers/vpci/private.h b/xen/drivers/vpci/private.h
index f012fd160d..8e0043ddbe 100644
--- a/xen/drivers/vpci/private.h
+++ b/xen/drivers/vpci/private.h
@@ -45,6 +45,7 @@ typedef struct {
REGISTER_VPCI_CAPABILITY(PCI_EXT_CAP_ID_##name, name, finit, fclean, true)
int __must_check vpci_init_header(struct pci_dev *pdev);
+int __must_check vpci_vf_init_header(struct pci_dev *pdev);
int vpci_init_capabilities(struct pci_dev *pdev, bool ext_only);
void vpci_cleanup_capabilities(struct pci_dev *pdev, bool ext_only);
diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c
new file mode 100644
index 0000000000..ec6a7b84d5
--- /dev/null
+++ b/xen/drivers/vpci/sriov.c
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Handlers for accesses to the SR-IOV capability structure.
+ *
+ * Copyright (C) 2026 Citrix Systems R&D
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+#include <xsm/xsm.h>
+#include "private.h"
+
+static int vf_init_bars(const struct pci_dev *vf_pdev)
+{
+ int vf_idx;
+ unsigned int i;
+ const struct pci_dev *pf_pdev = vf_pdev->pf_pdev;
+ struct vpci_bar *bars = vf_pdev->vpci->header.bars;
+ struct vpci_bar *physfn_vf_bars = pf_pdev->vpci->sriov->vf_bars;
+ unsigned int sriov_pos = pci_find_ext_capability(pf_pdev,
+ PCI_EXT_CAP_ID_SRIOV);
+ uint16_t offset = pci_conf_read16(pf_pdev->sbdf,
+ sriov_pos + PCI_SRIOV_VF_OFFSET);
+ uint16_t stride = pci_conf_read16(pf_pdev->sbdf,
+ sriov_pos + PCI_SRIOV_VF_STRIDE);
+
+ vf_idx = vf_pdev->sbdf.sbdf - (pf_pdev->sbdf.sbdf + offset);
+ if ( vf_idx < 0 )
+ return -EINVAL;
+
+ if ( stride )
+ {
+ if ( vf_idx % stride )
+ return -EINVAL;
+ vf_idx /= stride;
+ }
+
+ /*
+ * Set up BARs for this VF out of PF's VF BARs taking into account
+ * the index of the VF.
+ */
+ for ( i = 0; i < PCI_SRIOV_NUM_BARS; i++ )
+ {
+ bars[i].addr = physfn_vf_bars[i].addr + vf_idx * physfn_vf_bars[i].size;
+ bars[i].guest_addr = bars[i].addr;
+ bars[i].size = physfn_vf_bars[i].size;
+ bars[i].type = physfn_vf_bars[i].type;
+ bars[i].prefetchable = physfn_vf_bars[i].prefetchable;
+ }
+
+ return 0;
+}
+
+static int map_vfs(const struct pci_dev *pf_pdev, uint16_t cmd)
+{
+ struct pci_dev *vf_pdev;
+ int rc;
+
+ ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock));
+
+ list_for_each_entry(vf_pdev, &pf_pdev->vf_list, vf_list)
+ {
+ rc = vpci_modify_bars(vf_pdev, cmd, false);
+ if ( rc )
+ {
+ gprintk(XENLOG_ERR, "failed to %s VF %pp: %d\n",
+ (cmd & PCI_COMMAND_MEMORY) ? "map" : "unmap",
+ &vf_pdev->sbdf, rc);
+ return rc;
+ }
+
+ vf_pdev->vpci->header.guest_cmd &= ~PCI_COMMAND_MEMORY;
+ vf_pdev->vpci->header.guest_cmd |= (cmd & PCI_COMMAND_MEMORY);
+ }
+
+ return 0;
+}
+
+static int size_vf_bars(const struct pci_dev *pf_pdev, unsigned int sriov_pos,
+ uint64_t *vf_rlen)
+{
+ struct vpci_bar *bars;
+ unsigned int i;
+ int rc = 0;
+
+ ASSERT(rw_is_write_locked(&pf_pdev->domain->pci_lock));
+ ASSERT(!pf_pdev->info.is_virtfn);
+ ASSERT(pf_pdev->vpci->sriov);
+
+ /* Read BARs for VFs out of PF's SR-IOV extended capability. */
+ bars = pf_pdev->vpci->sriov->vf_bars;
+ /* Set the BARs addresses and size. */
+ for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc )
+ {
+ unsigned int idx = sriov_pos + PCI_SRIOV_BAR + i * 4;
+ uint32_t bar;
+ uint64_t addr, size;
+
+ bar = pci_conf_read32(pf_pdev->sbdf, idx);
+
+ rc = pci_size_mem_bar(pf_pdev->sbdf, idx, &addr, &size,
+ PCI_BAR_VF |
+ ((i == PCI_SRIOV_NUM_BARS - 1) ? PCI_BAR_LAST
+ : 0));
+
+ /*
+ * Update vf_rlen on the PF. According to the spec the size of
+ * the BARs can change if the system page size register is
+ * modified, so always update rlen when enabling VFs.
+ */
+ vf_rlen[i] = size;
+
+ if ( !size )
+ {
+ bars[i].type = VPCI_BAR_EMPTY;
+ continue;
+ }
+
+ bars[i].addr = addr;
+ bars[i].guest_addr = addr;
+ bars[i].size = size;
+ bars[i].prefetchable = bar & PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+ switch ( rc )
+ {
+ case 1:
+ bars[i].type = VPCI_BAR_MEM32;
+ break;
+
+ case 2:
+ bars[i].type = VPCI_BAR_MEM64_LO;
+ bars[i + 1].type = VPCI_BAR_MEM64_HI;
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
+ }
+ }
+
+ rc = rc > 0 ? 0 : rc;
+
+ return rc;
+}
+
+static void cf_check control_write(const struct pci_dev *pdev, unsigned int reg,
+ uint32_t val, void *data)
+{
+ unsigned int sriov_pos = reg - PCI_SRIOV_CTRL;
+ struct vpci_sriov *sriov = pdev->vpci->sriov;
+ uint16_t control = pci_conf_read16(pdev->sbdf, reg);
+ bool mem_enabled = control & PCI_SRIOV_CTRL_MSE;
+ bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE;
+ bool enabled = control & PCI_SRIOV_CTRL_VFE;
+ bool new_enabled = val & PCI_SRIOV_CTRL_VFE;
+ int rc;
+
+ ASSERT(!pdev->info.is_virtfn);
+
+ if ( new_enabled == enabled && new_mem_enabled == mem_enabled )
+ {
+ pci_conf_write16(pdev->sbdf, reg, val);
+ return;
+ }
+
+ if ( mem_enabled && !new_mem_enabled )
+ map_vfs(pdev, 0);
+
+ if ( !enabled && new_enabled )
+ {
+ size_vf_bars(pdev, sriov_pos, (uint64_t *)data);
+
+ /*
+ * Only update the number of active VFs when enabling, when
+ * disabling use the cached value in order to always remove the same
+ * number of VFs that were active.
+ */
+ sriov->num_vfs = pci_conf_read16(pdev->sbdf,
+ sriov_pos + PCI_SRIOV_NUM_VF);
+ }
+
+ if ( !mem_enabled && new_mem_enabled )
+ {
+ rc = map_vfs(pdev, PCI_COMMAND_MEMORY);
+
+ if ( rc )
+ map_vfs(pdev, 0);
+ }
+
+ pci_conf_write16(pdev->sbdf, reg, val);
+}
+
+int vpci_vf_init_header(struct pci_dev *vf_pdev)
+{
+ const struct pci_dev *pf_pdev;
+ unsigned int sriov_pos;
+ int rc = 0;
+ uint16_t ctrl;
+
+ ASSERT(rw_is_write_locked(&vf_pdev->domain->pci_lock));
+
+ if ( !vf_pdev->info.is_virtfn )
+ return 0;
+
+ pf_pdev = vf_pdev->pf_pdev;
+ ASSERT(pf_pdev);
+
+ rc = vf_init_bars(vf_pdev);
+ if ( rc )
+ return rc;
+
+ sriov_pos = pci_find_ext_capability(pf_pdev, PCI_EXT_CAP_ID_SRIOV);
+ ctrl = pci_conf_read16(pf_pdev->sbdf, sriov_pos + PCI_SRIOV_CTRL);
+
+ if ( (pf_pdev->domain == vf_pdev->domain) && (ctrl & PCI_SRIOV_CTRL_MSE) )
+ {
+ rc = vpci_modify_bars(vf_pdev, PCI_COMMAND_MEMORY, false);
+ if ( rc )
+ return rc;
+
+ vf_pdev->vpci->header.guest_cmd |= PCI_COMMAND_MEMORY;
+ }
+
+ return rc;
+}
+
+static int cf_check init_sriov(struct pci_dev *pdev)
+{
+ unsigned int pos;
+
+ ASSERT(!pdev->info.is_virtfn);
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+
+ if ( !pos )
+ return 0;
+
+ if ( xsm_resource_setup_pci(XSM_PRIV, pdev->sbdf.bdf) )
+ {
+ printk(XENLOG_ERR
+ "%pp: SR-IOV configuration unsupported for unpriv %pd\n",
+ &pdev->sbdf, pdev->domain);
+ return -EACCES;
+ }
+
+ pdev->vpci->sriov = xzalloc(struct vpci_sriov);
+ if ( !pdev->vpci->sriov )
+ return -ENOMEM;
+
+ /*
+ * We need to modify vf_rlen in control_write but we can't do it cleanly
+ * from pdev because write callback only accepts const pdev. Moving vf_rlen
+ * inside of struct vpci_sriov is also not possible because it is used
+ * before vpci init. So pass it here as additional data to not require
+ * dropping const in control_write.
+ */
+ return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write,
+ pos + PCI_SRIOV_CTRL, 2, &pdev->physfn.vf_rlen);
+}
+
+static int cf_check cleanup_sriov(const struct pci_dev *pdev, bool hide)
+{
+ unsigned int pos;
+ int rc;
+
+ if ( !pdev->vpci->sriov )
+ return 0;
+
+ ASSERT(!pdev->info.is_virtfn);
+ ASSERT(list_empty(&pdev->vf_list));
+
+ if ( !hide )
+ {
+ XFREE(pdev->vpci->sriov);
+ return 0;
+ }
+
+ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+ rc = vpci_remove_registers(pdev->vpci, pos + PCI_SRIOV_CTRL, 2);
+ if ( rc )
+ {
+ printk(XENLOG_ERR "%pd %pp: fail to remove SRIOV handlers rc=%d\n",
+ pdev->domain, &pdev->sbdf, rc);
+ ASSERT_UNREACHABLE();
+ return rc;
+ }
+ XFREE(pdev->vpci->sriov);
+
+ /*
+ * Unprivileged domains have a deny by default register access policy, no
+ * need to add any further handlers for them.
+ */
+ if ( !is_hardware_domain(pdev->domain) )
+ return 0;
+
+ rc = vpci_add_register(pdev->vpci, vpci_hw_read16, NULL,
+ pos + PCI_SRIOV_CTRL, 2, NULL);
+ if ( rc )
+ printk(XENLOG_ERR "%pd %pp: fail to add SRIOV ctrl handler rc=%d\n",
+ pdev->domain, &pdev->sbdf, rc);
+
+ return rc;
+}
+
+REGISTER_VPCI_EXTCAP(SRIOV, init_sriov, cleanup_sriov);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index e34f7abe6d..fc8a943abe 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -84,7 +84,6 @@ struct vpci {
* upon to know whether BARs are mapped into the guest p2m.
*/
bool bars_mapped : 1;
- /* FIXME: currently there's no support for SR-IOV. */
} header;
/* MSI data. */
@@ -145,6 +144,12 @@ struct vpci {
unsigned int nbars:3;
} rebar;
+ struct vpci_sriov {
+ /* PF only */
+ struct vpci_bar vf_bars[PCI_SRIOV_NUM_BARS];
+ uint16_t num_vfs;
+ } *sriov;
+
#ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
/* Guest SBDF of the device. */
#define INVALID_GUEST_SBDF ((pci_sbdf_t){ .sbdf = ~0U })
--
2.51.2
On 4/9/26 10:01, Mykyta Poturai wrote: > From: Stewart Hildebrand <stewart.hildebrand@amd.com> > > This code is expected to only be used by privileged domains, > unprivileged domains should not get access to the SR-IOV capability. > > Implement RW handlers for PCI_SRIOV_CTRL register to dynamically > map/unmap VF BARS. Recalculate BAR sizes before mapping VFs to account > for possible changes in the system page size register. Also force VFs to > always use emulated reads for command register, this is needed to > prevent some drivers accidentally unmapping BARs. Discovery of VFs is > done by Dom0, which must register them with Xen. > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com> > Signed-off-by: Mykyta Poturai <mykyta_poturai@epam.com> > --- > v2->v3: > * rework adding VFs with multiple mapping operations support > * use private.h > * style fixes > * fixup cleanup_sriov > * emulate command register read for VFs > > v1->v2: > * switch to VF discovery by Xen > * fix sequential vpci_modify_bars calls > * move documentation changes to a separate commit > --- I have only cursory read this patch, but it got me thinking, which I'm still in the middle of completing my questions. Figured I will pose it to the community for other's opinions as well. From what I have looked at this far, it is just wiring up the SRIOV under the existing PCI access controls. It's possible to just provide a specific label to the device to differentiate in policy, but I am wondering if there might want to be a further degree of fine-grained access controls over SRIOV devices? V/r, Daniel P. Smith
© 2016 - 2026 Red Hat, Inc.