Introduce vPCI BAR mapping task queue. Store information needed to
map/unmap BARs in struct vpci_map_task. Allow queueing of BAR map/unmap
operations in a list, thus making it possible to perform multiple p2m
operations associated with a single PCI device.
This is preparatory work for further changes that need to perform
multiple unmap/map operations before returning to the guest.
At the moment, only a single operation will be queued. However, when
multiple operations are queued, there is a check in modify_bars() to
skip BARs already in the requested state that will no longer be
accurate. Remove this check in preparation for upcoming changes.
Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
---
apply_map() and vpci_process_map_task() are very similar. Should we try
to combine them into a single function?
I concede that the dynamic allocation/deallocation of struct
vpci_map_task is not ideal. However, to support SR-IOV, there will be a
need to queue many mapping operations (one per VF), and statically
pre-allocating that much would seem wasteful. Only the hardware and/or
control domain would need to queue many operations, and only when
configuring SR-IOV.
v3->v4:
* switch back to dynamically allocated queue elements
v2->v3:
* base on ("vpci: Use pervcpu ranges for BAR mapping") from [1]
* rework with fixed array of map/unmap slots
[1] https://lore.kernel.org/xen-devel/cover.1772806036.git.mykyta_poturai@epam.com/T/#t
v1->v2:
* new patch
---
xen/common/domain.c | 5 +-
xen/drivers/vpci/header.c | 227 ++++++++++++++++++++++++++-----------
xen/drivers/vpci/vpci.c | 30 +----
xen/include/xen/rangeset.h | 7 --
xen/include/xen/vpci.h | 21 ++--
5 files changed, 179 insertions(+), 111 deletions(-)
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 5ef7db8f0960..b1931be9870b 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -455,8 +455,6 @@ static int vcpu_teardown(struct vcpu *v)
*/
static void vcpu_destroy(struct vcpu *v)
{
- vpci_vcpu_destroy(v);
-
free_vcpu_struct(v);
}
@@ -514,8 +512,7 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
if ( arch_vcpu_create(v) != 0 )
goto fail_sched;
- if ( vpci_vcpu_init(v) )
- goto fail_sched;
+ vpci_vcpu_init(v);
d->vcpu[vcpu_id] = v;
if ( vcpu_id != 0 )
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 89dce932f3b1..146915e28c50 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -23,6 +23,7 @@
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/softirq.h>
+#include <xen/xvmalloc.h>
#include <xsm/xsm.h>
@@ -35,7 +36,7 @@
struct map_data {
struct domain *d;
- const struct vpci_bar *bar;
+ const struct vpci_bar_map *bar;
bool map;
};
@@ -174,32 +175,20 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
ASSERT_UNREACHABLE();
}
-bool vpci_process_pending(struct vcpu *v)
+static int vpci_process_map_task(const struct pci_dev *pdev,
+ struct vpci_map_task *task)
{
- const struct pci_dev *pdev = v->vpci.pdev;
- struct vpci_header *header = NULL;
unsigned int i;
- if ( !pdev )
- return false;
-
- read_lock(&v->domain->pci_lock);
-
- if ( !pdev->vpci || (v->domain != pdev->domain) )
- {
- v->vpci.pdev = NULL;
- read_unlock(&v->domain->pci_lock);
- return false;
- }
+ ASSERT(rw_is_locked(&pdev->domain->pci_lock));
- header = &pdev->vpci->header;
- for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
{
- struct vpci_bar *bar = &header->bars[i];
- struct rangeset *mem = v->vpci.mem[i];
+ struct vpci_bar_map *bar = &task->bars[i];
+ struct rangeset *mem = bar->mem;
struct map_data data = {
- .d = v->domain,
- .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
+ .d = pdev->domain,
+ .map = task->cmd & PCI_COMMAND_MEMORY,
.bar = bar,
};
int rc;
@@ -210,58 +199,116 @@ bool vpci_process_pending(struct vcpu *v)
rc = rangeset_consume_ranges(mem, map_range, &data);
if ( rc == -ERESTART )
- {
- read_unlock(&v->domain->pci_lock);
- return true;
- }
+ return rc;
if ( rc )
{
spin_lock(&pdev->vpci->lock);
/* Disable memory decoding unconditionally on failure. */
- modify_decoding(pdev, v->vpci.cmd & ~PCI_COMMAND_MEMORY,
- false);
+ modify_decoding(pdev, task->cmd & ~PCI_COMMAND_MEMORY, false);
spin_unlock(&pdev->vpci->lock);
- /* Clean all the rangesets */
- for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
- if ( !rangeset_is_empty(v->vpci.mem[i]) )
- rangeset_purge(v->vpci.mem[i]);
+ if ( !is_hardware_domain(pdev->domain) )
+ domain_crash(pdev->domain);
+
+ return rc;
+ }
+ }
+
+ spin_lock(&pdev->vpci->lock);
+ modify_decoding(pdev, task->cmd, task->rom_only);
+ spin_unlock(&pdev->vpci->lock);
+
+ return 0;
+}
+
+static void destroy_map_task(struct vpci_map_task *task)
+{
+ unsigned int i;
+
+ if ( !task )
+ {
+ ASSERT_UNREACHABLE();
+ return;
+ }
+
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+ rangeset_destroy(task->bars[i].mem);
+
+ xvfree(task);
+}
+
+static void clear_map_queue(struct vcpu *v)
+{
+ struct vpci_map_task *task;
+
+ while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
+ struct vpci_map_task,
+ next)) != NULL )
+ {
+ list_del(&task->next);
+ destroy_map_task(task);
+ }
+}
+
+bool vpci_process_pending(struct vcpu *v)
+{
+ const struct pci_dev *pdev = v->vpci.pdev;
+ struct vpci_map_task *task;
- v->vpci.pdev = NULL;
+ if ( !pdev )
+ return false;
+ read_lock(&v->domain->pci_lock);
+
+ if ( !pdev->vpci || (v->domain != pdev->domain) )
+ {
+ clear_map_queue(v);
+ v->vpci.pdev = NULL;
+ read_unlock(&v->domain->pci_lock);
+ return false;
+ }
+
+ while ( (task = list_first_entry_or_null(&v->vpci.task_queue,
+ struct vpci_map_task,
+ next)) != NULL )
+ {
+ int rc = vpci_process_map_task(pdev, task);
+
+ if ( rc == -ERESTART )
+ {
read_unlock(&v->domain->pci_lock);
+ return true;
+ }
- if ( !is_hardware_domain(v->domain) )
- domain_crash(v->domain);
+ list_del(&task->next);
+ destroy_map_task(task);
- return false;
+ if ( rc )
+ {
+ clear_map_queue(v);
+ break;
}
}
v->vpci.pdev = NULL;
- spin_lock(&pdev->vpci->lock);
- modify_decoding(pdev, v->vpci.cmd, v->vpci.rom_only);
- spin_unlock(&pdev->vpci->lock);
-
read_unlock(&v->domain->pci_lock);
return false;
}
static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
- uint16_t cmd)
+ struct vpci_map_task *task)
{
- struct vpci_header *header = &pdev->vpci->header;
int rc = 0;
unsigned int i;
ASSERT(rw_is_write_locked(&d->pci_lock));
- for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
{
- struct vpci_bar *bar = &header->bars[i];
- struct rangeset *mem = current->vpci.mem[i];
+ struct vpci_bar_map *bar = &task->bars[i];
+ struct rangeset *mem = bar->mem;
struct map_data data = { .d = d, .map = true, .bar = bar };
if ( rangeset_is_empty(mem) )
@@ -281,15 +328,52 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
}
}
if ( !rc )
- modify_decoding(pdev, cmd, false);
+ modify_decoding(pdev, task->cmd, false);
return rc;
}
-static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
+static struct vpci_map_task *alloc_map_task(const struct pci_dev *pdev,
+ uint16_t cmd, bool rom_only)
+{
+ struct vpci_map_task *task;
+ unsigned int i;
+
+ task = xvzalloc(struct vpci_map_task);
+
+ if ( !task )
+ return NULL;
+
+ for ( i = 0; i < ARRAY_SIZE(task->bars); i++ )
+ {
+ if ( !MAPPABLE_BAR(&pdev->vpci->header.bars[i]) )
+ continue;
+
+ task->bars[i].mem = rangeset_new(pdev->domain, NULL,
+ RANGESETF_no_print);
+
+ if ( !task->bars[i].mem )
+ {
+ destroy_map_task(task);
+ return NULL;
+ }
+
+ task->bars[i].addr = pdev->vpci->header.bars[i].addr;
+ task->bars[i].guest_addr = pdev->vpci->header.bars[i].guest_addr;
+ }
+
+ task->cmd = cmd;
+ task->rom_only = rom_only;
+
+ return task;
+}
+
+static void defer_map(const struct pci_dev *pdev, struct vpci_map_task *task)
{
struct vcpu *curr = current;
+ ASSERT(!curr->vpci.pdev || curr->vpci.pdev == pdev);
+
/*
* FIXME: when deferring the {un}map the state of the device should not
* be trusted. For example the enable bit is toggled after the device
@@ -297,8 +381,8 @@ static void defer_map(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
* started for the same device if the domain is not well-behaved.
*/
curr->vpci.pdev = pdev;
- curr->vpci.cmd = cmd;
- curr->vpci.rom_only = rom_only;
+ list_add_tail(&task->next, &curr->vpci.task_queue);
+
/*
* Raise a scheduler softirq in order to prevent the guest from resuming
* execution with pending mapping operations, to trigger the invocation
@@ -313,11 +397,17 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
struct pci_dev *tmp;
const struct domain *d;
const struct vpci_msix *msix = pdev->vpci->msix;
+ struct vpci_map_task *task;
unsigned int i, j;
int rc;
ASSERT(rw_is_write_locked(&pdev->domain->pci_lock));
+ task = alloc_map_task(pdev, cmd, rom_only);
+
+ if ( !task )
+ return -ENOMEM;
+
/*
* Create a rangeset per BAR that represents the current device memory
* region and compare it against all the currently active BAR memory
@@ -333,19 +423,18 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
{
struct vpci_bar *bar = &header->bars[i];
- struct rangeset *mem = current->vpci.mem[i];
+ struct rangeset *mem = task->bars[i].mem;
unsigned long start = PFN_DOWN(bar->addr);
unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
unsigned long start_guest = PFN_DOWN(bar->guest_addr);
unsigned long end_guest = PFN_DOWN(bar->guest_addr + bar->size - 1);
- ASSERT(mem);
+ if ( !mem )
+ continue;
if ( !MAPPABLE_BAR(bar) ||
(rom_only ? bar->type != VPCI_BAR_ROM
- : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) ||
- /* Skip BARs already in the requested state. */
- bar->enabled == !!(cmd & PCI_COMMAND_MEMORY) )
+ : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
continue;
if ( !pci_check_bar(pdev, _mfn(start), _mfn(end)) )
@@ -368,7 +457,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_G_WARNING,
"%pp: can't map BAR%u - offset mismatch: %#lx vs %#lx\n",
&pdev->sbdf, i, bar->guest_addr, bar->addr);
- return -EINVAL;
+ rc = -EINVAL;
+ goto fail;
}
rc = rangeset_add_range(mem, start_guest, end_guest);
@@ -376,13 +466,13 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
{
printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
start_guest, end_guest, rc);
- return rc;
+ goto fail;
}
/* Check for overlap with the already setup BAR ranges. */
for ( j = 0; j < i; j++ )
{
- struct rangeset *prev_mem = current->vpci.mem[j];
+ struct rangeset *prev_mem = task->bars[j].mem;
if ( rangeset_is_empty(prev_mem) )
continue;
@@ -393,7 +483,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_WARNING,
"%pp: failed to remove overlapping range [%lx, %lx]: %d\n",
&pdev->sbdf, start_guest, end_guest, rc);
- return rc;
+ goto fail;
}
}
@@ -403,7 +493,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_WARNING,
"%pp: failed to sanitize BAR#%u memory: %d\n",
&pdev->sbdf, i, rc);
- return rc;
+ goto fail;
}
}
@@ -414,9 +504,9 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
vmsix_table_size(pdev->vpci, i) - 1);
- for ( j = 0; j < ARRAY_SIZE(current->vpci.mem); j++ )
+ for ( j = 0; j < ARRAY_SIZE(task->bars); j++ )
{
- struct rangeset *mem = current->vpci.mem[j];
+ struct rangeset *mem = task->bars[j].mem;
if ( rangeset_is_empty(mem) )
continue;
@@ -427,7 +517,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_WARNING,
"%pp: failed to remove MSIX table [%lx, %lx]: %d\n",
&pdev->sbdf, start, end, rc);
- return rc;
+ goto fail;
}
}
}
@@ -471,7 +561,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
for ( j = 0; j < ARRAY_SIZE(header->bars); j++)
{
const struct vpci_bar *bar = &header->bars[j];
- struct rangeset *mem = current->vpci.mem[j];
+ struct rangeset *mem = task->bars[j].mem;
if ( !rangeset_overlaps_range(mem, start, end) ||
/*
@@ -490,7 +580,7 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
gprintk(XENLOG_WARNING,
"%pp: failed to remove [%lx, %lx]: %d\n",
&pdev->sbdf, start, end, rc);
- return rc;
+ goto fail;
}
}
}
@@ -513,12 +603,19 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
* will always be to establish mappings and process all the BARs.
*/
ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
- return apply_map(pdev->domain, pdev, cmd);
+ rc = apply_map(pdev->domain, pdev, task);
+ destroy_map_task(task);
+ return rc;
}
- defer_map(pdev, cmd, rom_only);
+ defer_map(pdev, task);
return 0;
+
+ fail:
+ destroy_map_task(task);
+
+ return rc;
}
static void cf_check cmd_write(
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 8e6343653078..ce9fb5b357ff 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -24,35 +24,9 @@
#ifdef __XEN__
-void vpci_vcpu_destroy(struct vcpu *v)
+void vpci_vcpu_init(struct vcpu *v)
{
- unsigned int i;
-
- if ( !has_vpci(v->domain) && !is_idle_domain(v->domain) )
- return;
-
- for ( i = 0; i < ARRAY_SIZE(v->vpci.mem); i++ )
- RANGESET_DESTROY(v->vpci.mem[i]);
-}
-
-int vpci_vcpu_init(struct vcpu *v)
-{
- unsigned int i;
-
- if ( !has_vpci(v->domain) && !is_idle_domain(v->domain) )
- return 0;
-
- for ( i = 0; i < ARRAY_SIZE(v->vpci.mem); i++ )
- {
- char str[32];
-
- snprintf(str, sizeof(str), "%pv:BAR%u", v, i);
- v->vpci.mem[i] = rangeset_new(v->domain, str, RANGESETF_no_print);
- if ( !v->vpci.mem[i] )
- return -ENOMEM;
- }
-
- return 0;
+ INIT_LIST_HEAD(&v->vpci.task_queue);
}
#ifdef CONFIG_HAS_VPCI_GUEST_SUPPORT
diff --git a/xen/include/xen/rangeset.h b/xen/include/xen/rangeset.h
index f01e00ec9234..817505badf6f 100644
--- a/xen/include/xen/rangeset.h
+++ b/xen/include/xen/rangeset.h
@@ -40,13 +40,6 @@ struct rangeset *rangeset_new(
void rangeset_destroy(
struct rangeset *r);
-/* Destroy a rangeset, and zero the pointer to it. */
-#define RANGESET_DESTROY(r) \
- ({ \
- rangeset_destroy(r); \
- (r) = NULL; \
- })
-
/*
* Set a limit on the number of ranges that may exist in set @r.
* NOTE: This must be called while @r is empty.
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index b55bacbe6e01..e34f7abe6da2 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -19,8 +19,7 @@
*/
#define VPCI_MAX_VIRT_DEV (PCI_SLOT(~0) + 1)
-void vpci_vcpu_destroy(struct vcpu *v);
-int vpci_vcpu_init(struct vcpu *v);
+void vpci_vcpu_init(struct vcpu *v);
/* Assign vPCI to device by adding handlers. */
int __must_check vpci_assign_device(struct pci_dev *pdev);
@@ -155,14 +154,23 @@ struct vpci {
};
#ifdef __XEN__
-struct vpci_vcpu {
+struct vpci_map_task {
/* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
- const struct pci_dev *pdev;
- struct rangeset *mem[ARRAY_SIZE(((struct vpci_header *)NULL)->bars)];
+ struct list_head next;
+ struct vpci_bar_map {
+ uint64_t addr;
+ uint64_t guest_addr;
+ struct rangeset *mem;
+ } bars[ARRAY_SIZE(((struct vpci_header *)NULL)->bars)];
uint16_t cmd;
bool rom_only : 1;
};
+struct vpci_vcpu {
+ const struct pci_dev *pdev;
+ struct list_head task_queue;
+};
+
void vpci_dump_msi(void);
/* Arch-specific vPCI MSI helpers. */
@@ -207,8 +215,7 @@ bool vpci_ecam_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int len,
#else /* !CONFIG_HAS_VPCI */
struct vpci_vcpu {};
-static inline void vpci_vcpu_destroy(struct vcpu *v) { }
-static inline int vpci_vcpu_init(struct vcpu *v) { return 0; }
+static inline void vpci_vcpu_init(struct vcpu *v) { }
static inline int vpci_reinit_ext_capabilities(struct pci_dev *pdev)
{
--
2.53.0
On 4/6/26 22:11, Stewart Hildebrand wrote:
> Introduce vPCI BAR mapping task queue. Store information needed to
> map/unmap BARs in struct vpci_map_task. Allow queueing of BAR map/unmap
> operations in a list, thus making it possible to perform multiple p2m
> operations associated with single PCI device.
>
> This is preparatory work for further changes that need to perform
> multiple unmap/map operations before returning to guest.
>
> At the moment, only a single operation will be queued. However, when
> multiple operations are queued, there is a check in modify_bars() to
> skip BARs already in the requested state that will no longer be
> accurate. Remove this check in preparation of upcoming changes.
>
> Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
> ---
> apply_map() and vpci_process_map_task() are very similar. Should we try
> to combine them into a single function?
>
> I concede that the dynamic allocation/deallocation of struct
> vpci_map_task is not ideal. However, to support SR-IOV, there will be a
> need to queue many mapping operations (one per VF), and statically
> pre-allocating that much would seem wasteful. Only the hardware and/or
> control domain would need to queue many operations, and only when
> configuring SR-IOV.
>
> v3->v4:
> * switch back to dynamically allocated queue elements
>
> v2->v3:
> * base on ("vpci: Use pervcpu ranges for BAR mapping") from [1]
> * rework with fixed array of map/unmap slots
>
> [1] https://lore.kernel.org/xen-devel/cover.1772806036.git.mykyta_poturai@epam.com/T/#t
>
> v1->v2:
> * new patch
It seems like this patch modifies a lot of the same code as the
previous one. Maybe it would be a good idea to squash them into a single patch?
--
Mykyta
© 2016 - 2026 Red Hat, Inc.