From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Instead of handling a single range set, that contains all the memory
regions of all the BARs and ROM, have them per BAR.
This is in preparation of making non-identity mappings in p2m for the
MMIOs/ROM.
Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
---
xen/drivers/vpci/header.c | 172 ++++++++++++++++++++++++++------------
xen/include/xen/vpci.h | 3 +-
2 files changed, 122 insertions(+), 53 deletions(-)
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 793f79ece831..7f54199a3894 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -131,49 +131,75 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
bool vpci_process_pending(struct vcpu *v)
{
- if ( v->vpci.mem )
+ if ( v->vpci.num_mem_ranges )
{
struct map_data data = {
.d = v->domain,
.map = v->vpci.cmd & PCI_COMMAND_MEMORY,
};
- int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
+ struct pci_dev *pdev = v->vpci.pdev;
+ struct vpci_header *header = &pdev->vpci->header;
+ unsigned int i;
- if ( rc == -ERESTART )
- return true;
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ {
+ struct vpci_bar *bar = &header->bars[i];
+ int rc;
- spin_lock(&v->vpci.pdev->vpci->lock);
- /* Disable memory decoding unconditionally on failure. */
- modify_decoding(v->vpci.pdev,
- rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
- !rc && v->vpci.rom_only);
- spin_unlock(&v->vpci.pdev->vpci->lock);
+ if ( !bar->mem )
+ continue;
- rangeset_destroy(v->vpci.mem);
- v->vpci.mem = NULL;
- if ( rc )
- /*
- * FIXME: in case of failure remove the device from the domain.
- * Note that there might still be leftover mappings. While this is
- * safe for Dom0, for DomUs the domain will likely need to be
- * killed in order to avoid leaking stale p2m mappings on
- * failure.
- */
- vpci_remove_device(v->vpci.pdev);
+ rc = rangeset_consume_ranges(bar->mem, map_range, &data);
+
+ if ( rc == -ERESTART )
+ return true;
+
+ spin_lock(&pdev->vpci->lock);
+ /* Disable memory decoding unconditionally on failure. */
+ modify_decoding(pdev,
+ rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
+ !rc && v->vpci.rom_only);
+ spin_unlock(&pdev->vpci->lock);
+
+ rangeset_destroy(bar->mem);
+ bar->mem = NULL;
+ v->vpci.num_mem_ranges--;
+ if ( rc )
+ /*
+ * FIXME: in case of failure remove the device from the domain.
+ * Note that there might still be leftover mappings. While this is
+ * safe for Dom0, for DomUs the domain will likely need to be
+ * killed in order to avoid leaking stale p2m mappings on
+ * failure.
+ */
+ vpci_remove_device(pdev);
+ }
}
return false;
}
static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
- struct rangeset *mem, uint16_t cmd)
+ uint16_t cmd)
{
struct map_data data = { .d = d, .map = true };
- int rc;
+ struct vpci_header *header = &pdev->vpci->header;
+ int rc = 0;
+ unsigned int i;
+
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ {
+ struct vpci_bar *bar = &header->bars[i];
- while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
- process_pending_softirqs();
- rangeset_destroy(mem);
+ if ( !bar->mem )
+ continue;
+
+ while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
+ &data)) == -ERESTART )
+ process_pending_softirqs();
+ rangeset_destroy(bar->mem);
+ bar->mem = NULL;
+ }
if ( !rc )
modify_decoding(pdev, cmd, false);
@@ -181,7 +207,7 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
}
static void defer_map(struct domain *d, struct pci_dev *pdev,
- struct rangeset *mem, uint16_t cmd, bool rom_only)
+ uint16_t cmd, bool rom_only, uint8_t num_mem_ranges)
{
struct vcpu *curr = current;
@@ -192,9 +218,9 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
* started for the same device if the domain is not well-behaved.
*/
curr->vpci.pdev = pdev;
- curr->vpci.mem = mem;
curr->vpci.cmd = cmd;
curr->vpci.rom_only = rom_only;
+ curr->vpci.num_mem_ranges = num_mem_ranges;
/*
* Raise a scheduler softirq in order to prevent the guest from resuming
* execution with pending mapping operations, to trigger the invocation
@@ -206,42 +232,47 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
{
struct vpci_header *header = &pdev->vpci->header;
- struct rangeset *mem = rangeset_new(NULL, NULL, 0);
struct pci_dev *tmp, *dev = NULL;
const struct vpci_msix *msix = pdev->vpci->msix;
- unsigned int i;
+ unsigned int i, j;
int rc;
-
- if ( !mem )
- return -ENOMEM;
+ uint8_t num_mem_ranges;
/*
- * Create a rangeset that represents the current device BARs memory region
+ * Create a rangeset per BAR that represents the current device memory region
* and compare it against all the currently active BAR memory regions. If
* an overlap is found, subtract it from the region to be mapped/unmapped.
*
- * First fill the rangeset with all the BARs of this device or with the ROM
+ * First fill the rangesets with all the BARs of this device or with the ROM
* BAR only, depending on whether the guest is toggling the memory decode
* bit of the command register, or the enable bit of the ROM BAR register.
*/
for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
{
- const struct vpci_bar *bar = &header->bars[i];
+ struct vpci_bar *bar = &header->bars[i];
unsigned long start = PFN_DOWN(bar->addr);
unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
+ bar->mem = NULL;
+
if ( !MAPPABLE_BAR(bar) ||
(rom_only ? bar->type != VPCI_BAR_ROM
: (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) )
continue;
- rc = rangeset_add_range(mem, start, end);
+ bar->mem = rangeset_new(NULL, NULL, 0);
+ if ( !bar->mem )
+ {
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ rc = rangeset_add_range(bar->mem, start, end);
if ( rc )
{
printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
start, end, rc);
- rangeset_destroy(mem);
- return rc;
+ goto fail;
}
}
@@ -252,14 +283,21 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
vmsix_table_size(pdev->vpci, i) - 1);
- rc = rangeset_remove_range(mem, start, end);
- if ( rc )
+ for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
{
- printk(XENLOG_G_WARNING
- "Failed to remove MSIX table [%lx, %lx]: %d\n",
- start, end, rc);
- rangeset_destroy(mem);
- return rc;
+ const struct vpci_bar *bar = &header->bars[j];
+
+ if ( !bar->mem )
+ continue;
+
+ rc = rangeset_remove_range(bar->mem, start, end);
+ if ( rc )
+ {
+ printk(XENLOG_G_WARNING
+ "Failed to remove MSIX table [%lx, %lx]: %d\n",
+ start, end, rc);
+ goto fail;
+ }
}
}
@@ -291,7 +329,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
unsigned long start = PFN_DOWN(bar->addr);
unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
- if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
+ if ( !bar->enabled ||
+ !rangeset_overlaps_range(bar->mem, start, end) ||
/*
* If only the ROM enable bit is toggled check against other
* BARs in the same device for overlaps, but not against the
@@ -300,13 +339,12 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
(rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
continue;
- rc = rangeset_remove_range(mem, start, end);
+ rc = rangeset_remove_range(bar->mem, start, end);
if ( rc )
{
printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
start, end, rc);
- rangeset_destroy(mem);
- return rc;
+ goto fail;
}
}
}
@@ -324,12 +362,42 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
* will always be to establish mappings and process all the BARs.
*/
ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
- return apply_map(pdev->domain, pdev, mem, cmd);
+ return apply_map(pdev->domain, pdev, cmd);
}
- defer_map(dev->domain, dev, mem, cmd, rom_only);
+ /* Find out how many memory ranges has left after MSI and overlaps. */
+ num_mem_ranges = 0;
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ {
+ struct vpci_bar *bar = &header->bars[i];
+
+ if ( !rangeset_is_empty(bar->mem) )
+ num_mem_ranges++;
+ }
+
+ /*
+ * There are cases when PCI device, root port for example, has neither
+ * memory space nor IO. In this case PCI command register write is
+ * missed resulting in the underlying PCI device not functional, so:
+ * - if there are no regions write the command register now
+ * - if there are regions then defer work and write later on
+ */
+ if ( !num_mem_ranges )
+ pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
+ else
+ defer_map(dev->domain, dev, cmd, rom_only, num_mem_ranges);
return 0;
+
+fail:
+ for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+ {
+ struct vpci_bar *bar = &header->bars[i];
+
+ rangeset_destroy(bar->mem);
+ bar->mem = NULL;
+ }
+ return rc;
}
static void cmd_write(const struct pci_dev *pdev, unsigned int reg,
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index db86b8e7fa3c..a0cbdb4bf4fd 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -82,6 +82,7 @@ struct vpci {
/* Guest view of the BAR. */
uint64_t guest_addr;
uint64_t size;
+ struct rangeset *mem;
enum {
VPCI_BAR_EMPTY,
VPCI_BAR_IO,
@@ -156,9 +157,9 @@ struct vpci {
struct vpci_vcpu {
/* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
- struct rangeset *mem;
struct pci_dev *pdev;
uint16_t cmd;
+ uint8_t num_mem_ranges;
bool rom_only : 1;
};
--
2.25.1
On 03.09.2021 12:08, Oleksandr Andrushchenko wrote: > From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> > > Instead of handling a single range set, that contains all the memory > regions of all the BARs and ROM, have them per BAR. Without looking at how you carry out this change - this look wrong (as in: wasteful) to me. Despite ... > This is in preparation of making non-identity mappings in p2m for the > MMIOs/ROM. ... the need for this, every individual BAR is still contiguous in both host and guest address spaces, so can be represented as a single (start,end) tuple (or a pair thereof, to account for both host and guest values). No need to use a rangeset for this. Jan
On 06.09.21 17:47, Jan Beulich wrote: > On 03.09.2021 12:08, Oleksandr Andrushchenko wrote: >> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> >> >> Instead of handling a single range set, that contains all the memory >> regions of all the BARs and ROM, have them per BAR. > Without looking at how you carry out this change - this look wrong (as > in: wasteful) to me. Despite ... > >> This is in preparation of making non-identity mappings in p2m for the >> MMIOs/ROM. > ... the need for this, every individual BAR is still contiguous in both > host and guest address spaces, so can be represented as a single > (start,end) tuple (or a pair thereof, to account for both host and guest > values). No need to use a rangeset for this. First of all this change is in preparation for non-identity mappings, e.g. currently we collect all the memory ranges which require mappings into a single range set, then we cut off MSI-X regions and then use range set functionality to call a callback for every memory range left after MSI-X. This works perfectly fine for 1:1 mappings, e.g. what we have as the range set's starting address is what we want to be mapped/unmapped. Why range sets? Because they allow partial mappings, e.g. you can map part of the range and return back and continue from where you stopped. And if I understand that correctly that was the initial intention of introducing range sets here. For non-identity mappings this becomes not that easy. Each individual BAR may be mapped differently according to what guest OS has programmed as bar->guest_addr (guest view of the BAR start). Thus we need to collect all those non-identity mappings per BAR now (so we have a mapping "guest view" : "physical BAR" and again cut off MSI-X regions as before. So, yes, it may be a bit wasteful to use many range sets, but makes vPCI life much-much easier. Thus, I think that even per-BAR range sets are good to go as they have more pros than cons. IMO Even if we go with "can be represented as a single (start,end) tuple" it doesn't answer the question what needs to be done if a range gets partially mapped/unmapped. We'll need to put some logic to re-try the operation later and remember where did we stop. At the end of the day we'll invent one more range set, but now vPCI own. > > Jan > Thank you, Oleksandr
On 08.09.2021 16:31, Oleksandr Andrushchenko wrote: > > On 06.09.21 17:47, Jan Beulich wrote: >> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote: >>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> >>> >>> Instead of handling a single range set, that contains all the memory >>> regions of all the BARs and ROM, have them per BAR. >> Without looking at how you carry out this change - this look wrong (as >> in: wasteful) to me. Despite ... >> >>> This is in preparation of making non-identity mappings in p2m for the >>> MMIOs/ROM. >> ... the need for this, every individual BAR is still contiguous in both >> host and guest address spaces, so can be represented as a single >> (start,end) tuple (or a pair thereof, to account for both host and guest >> values). No need to use a rangeset for this. > > First of all this change is in preparation for non-identity mappings, I'm afraid I continue to not see how this matters in the discussion at hand. I'm fully aware that this is the goal. > e.g. currently we collect all the memory ranges which require mappings > into a single range set, then we cut off MSI-X regions and then use range set > functionality to call a callback for every memory range left after MSI-X. > This works perfectly fine for 1:1 mappings, e.g. what we have as the range > set's starting address is what we want to be mapped/unmapped. > Why range sets? Because they allow partial mappings, e.g. you can map part of > the range and return back and continue from where you stopped. And if I > understand that correctly that was the initial intention of introducing range sets here. > > For non-identity mappings this becomes not that easy. Each individual BAR may be > mapped differently according to what guest OS has programmed as bar->guest_addr > (guest view of the BAR start). I don't see how the rangeset helps here. You have a guest and a host pair of values for every BAR. Pages with e.g. the MSI-X table may not be mapped to their host counterpart address, yes, but you need to special cases these anyway: Accesses to them need to be handled. Hence I'm having a hard time seeing how a per-BAR rangeset (which will cover at most three distinct ranges afaict, which is way too little for this kind of data organization imo) can gain you all this much. Overall the 6 BARs of a device will cover up to 8 non-adjacent ranges. IOW the majority (4 or more) of the rangesets will indeed merely represent a plain (start,end) pair (or be entirely empty). > Thus we need to collect all those non-identity mappings > per BAR now (so we have a mapping "guest view" : "physical BAR" and again cut off > MSI-X regions as before. So, yes, it may be a bit wasteful to use many range sets, > but makes vPCI life much-much easier. Which I'm yet to be convinced of. Then again I'm not the maintainer of this code, so if you can convince Roger you'll be all good. > Thus, I think that even per-BAR range sets are > good to go as they have more pros than cons. IMO > Even if we go with "can be represented as a single (start,end) tuple" it doesn't answer > the question what needs to be done if a range gets partially mapped/unmapped. This question also isn't answered when you use rangesets. Jan
On 08.09.21 18:00, Jan Beulich wrote: > On 08.09.2021 16:31, Oleksandr Andrushchenko wrote: >> On 06.09.21 17:47, Jan Beulich wrote: >>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote: >>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> >>>> >>>> Instead of handling a single range set, that contains all the memory >>>> regions of all the BARs and ROM, have them per BAR. >>> Without looking at how you carry out this change - this look wrong (as >>> in: wasteful) to me. Despite ... >>> >>>> This is in preparation of making non-identity mappings in p2m for the >>>> MMIOs/ROM. >>> ... the need for this, every individual BAR is still contiguous in both >>> host and guest address spaces, so can be represented as a single >>> (start,end) tuple (or a pair thereof, to account for both host and guest >>> values). No need to use a rangeset for this. >> First of all this change is in preparation for non-identity mappings, > I'm afraid I continue to not see how this matters in the discussion at > hand. I'm fully aware that this is the goal. > >> e.g. currently we collect all the memory ranges which require mappings >> into a single range set, then we cut off MSI-X regions and then use range set >> functionality to call a callback for every memory range left after MSI-X. >> This works perfectly fine for 1:1 mappings, e.g. what we have as the range >> set's starting address is what we want to be mapped/unmapped. >> Why range sets? Because they allow partial mappings, e.g. you can map part of >> the range and return back and continue from where you stopped. And if I >> understand that correctly that was the initial intention of introducing range sets here. >> >> For non-identity mappings this becomes not that easy. Each individual BAR may be >> mapped differently according to what guest OS has programmed as bar->guest_addr >> (guest view of the BAR start). > I don't see how the rangeset helps here. You have a guest and a host pair > of values for every BAR. Pages with e.g. the MSI-X table may not be mapped > to their host counterpart address, yes, but you need to special cases > these anyway: Accesses to them need to be handled. Hence I'm having a hard > time seeing how a per-BAR rangeset (which will cover at most three distinct > ranges afaict, which is way too little for this kind of data organization > imo) can gain you all this much. > > Overall the 6 BARs of a device will cover up to 8 non-adjacent ranges. IOW > the majority (4 or more) of the rangesets will indeed merely represent a > plain (start,end) pair (or be entirely empty). First of all, let me explain why I decided to move to per-BAR range sets. Before this change all the MMIO regions and MSI-X holes were accounted by a single range set, e.g. we go over all BARs and add MMIOs and then subtract MSI-X from there. When it comes to mapping/unmapping we have an assumtion that the starting address of each element in the range set is equal to map/unmap address, e.g. we have identity mapping. Please note, that the range set accepts a single private data parameter which is enough to hold all required data about the pdev in common, but there is no way to provide any per-BAR data. Now, that we want non-identity mappings, we can no longer assume that starting address == mapping address and we need to provide additional information on how to map and which is now per-BAR. This is why I decided to use per-BAR range sets. One of the solutions may be that we form an additional list of structures in a form (I ommit some of the fields): struct non_identity { unsigned long start_mfn; unsigned long start_gfn; unsigned long size; }; So this way when the range set gets processed we go over the list and find out the corresponding list's element which describes the range set entry being processed (s, e, data): static int map_range(unsigned long s, unsigned long e, void *data, unsigned long *c) { [snip] go over the list elements if ( list->start_mfn == s ) found, can use list->start_gfn for mapping [snip] } This has some complications as map_range may be called multiple times for the same range: if {unmap|map}_mmio_regions was not able to complete the operation it returns the number of pages it was able to process: rc = map->map ? map_mmio_regions(map->d, start_gfn, size, _mfn(s)) : unmap_mmio_regions(map->d, start_gfn, size, _mfn(s)); In this case we need to update the list item: list->start_mfn += rc; list->start_gfn += rc; list->size -= rc; and if all the pages of the range were processed delete the list entry. With respect of creating the list everything also not so complicated: while processing each BAR create a list entry and fill it with mfn, gfn and size. Then, if MSI-X region is present within this BAR, break the list item into multiple ones with respect to the holes, for example: MMIO 0 list item MSI-X hole 0 MMIO 1 list item MSI-X hole 1 Here instead of a single BAR description we now have 2 list elements describing the BAR without MSI-X regions. All the above still relies on a single range set per pdev as it is in the original code. We can go this route if we agree this is more acceptable than the range sets per BAR > >> Thus we need to collect all those non-identity mappings >> per BAR now (so we have a mapping "guest view" : "physical BAR" and again cut off >> MSI-X regions as before. So, yes, it may be a bit wasteful to use many range sets, >> but makes vPCI life much-much easier. > Which I'm yet to be convinced of. Then again I'm not the maintainer of > this code, so if you can convince Roger you'll be all good. Per-BAR range sets look more clear to me and add relatively less code which seems to be good. > >> Thus, I think that even per-BAR range sets are >> good to go as they have more pros than cons. IMO >> Even if we go with "can be represented as a single (start,end) tuple" it doesn't answer >> the question what needs to be done if a range gets partially mapped/unmapped. > This question also isn't answered when you use rangesets. bool vpci_process_pending(struct vcpu *v) { [snip] rc = rangeset_consume_ranges(bar->mem, map_range, &data); if ( rc == -ERESTART ) return true; > > Jan > Thank you, Oleksandr
On 09.09.2021 07:22, Oleksandr Andrushchenko wrote: > > On 08.09.21 18:00, Jan Beulich wrote: >> On 08.09.2021 16:31, Oleksandr Andrushchenko wrote: >>> On 06.09.21 17:47, Jan Beulich wrote: >>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote: >>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> >>>>> >>>>> Instead of handling a single range set, that contains all the memory >>>>> regions of all the BARs and ROM, have them per BAR. >>>> Without looking at how you carry out this change - this look wrong (as >>>> in: wasteful) to me. Despite ... >>>> >>>>> This is in preparation of making non-identity mappings in p2m for the >>>>> MMIOs/ROM. >>>> ... the need for this, every individual BAR is still contiguous in both >>>> host and guest address spaces, so can be represented as a single >>>> (start,end) tuple (or a pair thereof, to account for both host and guest >>>> values). No need to use a rangeset for this. >>> First of all this change is in preparation for non-identity mappings, >> I'm afraid I continue to not see how this matters in the discussion at >> hand. I'm fully aware that this is the goal. >> >>> e.g. currently we collect all the memory ranges which require mappings >>> into a single range set, then we cut off MSI-X regions and then use range set >>> functionality to call a callback for every memory range left after MSI-X. >>> This works perfectly fine for 1:1 mappings, e.g. what we have as the range >>> set's starting address is what we want to be mapped/unmapped. >>> Why range sets? Because they allow partial mappings, e.g. you can map part of >>> the range and return back and continue from where you stopped. And if I >>> understand that correctly that was the initial intention of introducing range sets here. >>> >>> For non-identity mappings this becomes not that easy. Each individual BAR may be >>> mapped differently according to what guest OS has programmed as bar->guest_addr >>> (guest view of the BAR start). >> I don't see how the rangeset helps here. You have a guest and a host pair >> of values for every BAR. Pages with e.g. the MSI-X table may not be mapped >> to their host counterpart address, yes, but you need to special cases >> these anyway: Accesses to them need to be handled. Hence I'm having a hard >> time seeing how a per-BAR rangeset (which will cover at most three distinct >> ranges afaict, which is way too little for this kind of data organization >> imo) can gain you all this much. >> >> Overall the 6 BARs of a device will cover up to 8 non-adjacent ranges. IOW >> the majority (4 or more) of the rangesets will indeed merely represent a >> plain (start,end) pair (or be entirely empty). > First of all, let me explain why I decided to move to per-BAR > range sets. > Before this change all the MMIO regions and MSI-X holes were > accounted by a single range set, e.g. we go over all BARs and > add MMIOs and then subtract MSI-X from there. When it comes to > mapping/unmapping we have an assumtion that the starting address of > each element in the range set is equal to map/unmap address, e.g. > we have identity mapping. Please note, that the range set accepts > a single private data parameter which is enough to hold all > required data about the pdev in common, but there is no way to provide > any per-BAR data. > > Now, that we want non-identity mappings, we can no longer assume > that starting address == mapping address and we need to provide > additional information on how to map and which is now per-BAR. > This is why I decided to use per-BAR range sets. > > One of the solutions may be that we form an additional list of > structures in a form (I ommit some of the fields): > struct non_identity { > unsigned long start_mfn; > unsigned long start_gfn; > unsigned long size; > }; > So this way when the range set gets processed we go over the list > and find out the corresponding list's element which describes the > range set entry being processed (s, e, data): > > static int map_range(unsigned long s, unsigned long e, void *data, > unsigned long *c) > { > [snip] > go over the list elements > if ( list->start_mfn == s ) > found, can use list->start_gfn for mapping > [snip] > } > This has some complications as map_range may be called multiple times > for the same range: if {unmap|map}_mmio_regions was not able to complete > the operation it returns the number of pages it was able to process: > rc = map->map ? map_mmio_regions(map->d, start_gfn, > size, _mfn(s)) > : unmap_mmio_regions(map->d, start_gfn, > size, _mfn(s)); > In this case we need to update the list item: > list->start_mfn += rc; > list->start_gfn += rc; > list->size -= rc; > and if all the pages of the range were processed delete the list entry. > > With respect of creating the list everything also not so complicated: > while processing each BAR create a list entry and fill it with mfn, gfn > and size. Then, if MSI-X region is present within this BAR, break the > list item into multiple ones with respect to the holes, for example: > > MMIO 0 list item > MSI-X hole 0 > MMIO 1 list item > MSI-X hole 1 > > Here instead of a single BAR description we now have 2 list elements > describing the BAR without MSI-X regions. > > All the above still relies on a single range set per pdev as it is in the > original code. We can go this route if we agree this is more acceptable > than the range sets per BAR I guess I am now even more confused: I can't spot any "rangeset per pdev" either. The rangeset I see being used doesn't get associated with anything that's device-related; it gets accumulated as a transient data structure, but _all_ devices owned by a domain influence its final content. If you associate rangesets with either a device or a BAR, I'm failing to see how you'd deal with multiple BARs living in the same page (see also below). Considering that a rangeset really is a compressed representation of a bitmap, I wonder whether this data structure is suitable at all for what you want to express. You have two pieces of information to carry / manage, after all: Which ranges need mapping, and what their GFN <-> MFN relationship is. Maybe the latter needs expressing differently in the first place? And then in a way that's ensuring by its organization that no conflicting GFN <-> MFN mappings will be possible? Isn't this precisely what is already getting recorded in the P2M? I'm also curious what your plan is to deal with BARs overlapping in MFN space: In such a case, the guest cannot independently change the GFNs of any of the involved BARs. (Same the other way around: overlaps in GFN space are only permitted when the same overlap exists in MFN space.) Are you excluding (forbidding) this case? If so, did I miss you saying so somewhere? Yet if no overlaps are allowed in the first place, what modify_bars() does would be far more complicated than necessary in the DomU case, so it may be worthwhile considering to deviate more from how Dom0 gets taken care of. In the end a guest writing a BAR is merely a request to change its P2M. That's very different from Dom0 writing a BAR, which means the physical BAR also changes, and hence the P2M changes in quite different a way. Jan
On 09.09.21 11:24, Jan Beulich wrote: > On 09.09.2021 07:22, Oleksandr Andrushchenko wrote: >> On 08.09.21 18:00, Jan Beulich wrote: >>> On 08.09.2021 16:31, Oleksandr Andrushchenko wrote: >>>> On 06.09.21 17:47, Jan Beulich wrote: >>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote: >>>>>> From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> >>>>>> >>>>>> Instead of handling a single range set, that contains all the memory >>>>>> regions of all the BARs and ROM, have them per BAR. >>>>> Without looking at how you carry out this change - this look wrong (as >>>>> in: wasteful) to me. Despite ... >>>>> >>>>>> This is in preparation of making non-identity mappings in p2m for the >>>>>> MMIOs/ROM. >>>>> ... the need for this, every individual BAR is still contiguous in both >>>>> host and guest address spaces, so can be represented as a single >>>>> (start,end) tuple (or a pair thereof, to account for both host and guest >>>>> values). No need to use a rangeset for this. >>>> First of all this change is in preparation for non-identity mappings, >>> I'm afraid I continue to not see how this matters in the discussion at >>> hand. I'm fully aware that this is the goal. >>> >>>> e.g. currently we collect all the memory ranges which require mappings >>>> into a single range set, then we cut off MSI-X regions and then use range set >>>> functionality to call a callback for every memory range left after MSI-X. >>>> This works perfectly fine for 1:1 mappings, e.g. what we have as the range >>>> set's starting address is what we want to be mapped/unmapped. >>>> Why range sets? Because they allow partial mappings, e.g. you can map part of >>>> the range and return back and continue from where you stopped. And if I >>>> understand that correctly that was the initial intention of introducing range sets here. >>>> >>>> For non-identity mappings this becomes not that easy. Each individual BAR may be >>>> mapped differently according to what guest OS has programmed as bar->guest_addr >>>> (guest view of the BAR start). >>> I don't see how the rangeset helps here. You have a guest and a host pair >>> of values for every BAR. Pages with e.g. the MSI-X table may not be mapped >>> to their host counterpart address, yes, but you need to special cases >>> these anyway: Accesses to them need to be handled. Hence I'm having a hard >>> time seeing how a per-BAR rangeset (which will cover at most three distinct >>> ranges afaict, which is way too little for this kind of data organization >>> imo) can gain you all this much. >>> >>> Overall the 6 BARs of a device will cover up to 8 non-adjacent ranges. IOW >>> the majority (4 or more) of the rangesets will indeed merely represent a >>> plain (start,end) pair (or be entirely empty). >> First of all, let me explain why I decided to move to per-BAR >> range sets. >> Before this change all the MMIO regions and MSI-X holes were >> accounted by a single range set, e.g. we go over all BARs and >> add MMIOs and then subtract MSI-X from there. When it comes to >> mapping/unmapping we have an assumtion that the starting address of >> each element in the range set is equal to map/unmap address, e.g. >> we have identity mapping. Please note, that the range set accepts >> a single private data parameter which is enough to hold all >> required data about the pdev in common, but there is no way to provide >> any per-BAR data. >> >> Now, that we want non-identity mappings, we can no longer assume >> that starting address == mapping address and we need to provide >> additional information on how to map and which is now per-BAR. >> This is why I decided to use per-BAR range sets. >> >> One of the solutions may be that we form an additional list of >> structures in a form (I ommit some of the fields): >> struct non_identity { >> unsigned long start_mfn; >> unsigned long start_gfn; >> unsigned long size; >> }; >> So this way when the range set gets processed we go over the list >> and find out the corresponding list's element which describes the >> range set entry being processed (s, e, data): >> >> static int map_range(unsigned long s, unsigned long e, void *data, >> unsigned long *c) >> { >> [snip] >> go over the list elements >> if ( list->start_mfn == s ) >> found, can use list->start_gfn for mapping >> [snip] >> } >> This has some complications as map_range may be called multiple times >> for the same range: if {unmap|map}_mmio_regions was not able to complete >> the operation it returns the number of pages it was able to process: >> rc = map->map ? map_mmio_regions(map->d, start_gfn, >> size, _mfn(s)) >> : unmap_mmio_regions(map->d, start_gfn, >> size, _mfn(s)); >> In this case we need to update the list item: >> list->start_mfn += rc; >> list->start_gfn += rc; >> list->size -= rc; >> and if all the pages of the range were processed delete the list entry. >> >> With respect of creating the list everything also not so complicated: >> while processing each BAR create a list entry and fill it with mfn, gfn >> and size. Then, if MSI-X region is present within this BAR, break the >> list item into multiple ones with respect to the holes, for example: >> >> MMIO 0 list item >> MSI-X hole 0 >> MMIO 1 list item >> MSI-X hole 1 >> >> Here instead of a single BAR description we now have 2 list elements >> describing the BAR without MSI-X regions. >> >> All the above still relies on a single range set per pdev as it is in the >> original code. We can go this route if we agree this is more acceptable >> than the range sets per BAR > I guess I am now even more confused: I can't spot any "rangeset per pdev" > either. The rangeset I see being used doesn't get associated with anything > that's device-related; it gets accumulated as a transient data structure, > but _all_ devices owned by a domain influence its final content. You are absolutely right here, sorry for the confusion: in the current code the range set belongs to struct vpci_vcpu, e.g. /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */ > > If you associate rangesets with either a device or a BAR, I'm failing to > see how you'd deal with multiple BARs living in the same page (see also > below). This was exactly the issue I ran into while emulating RTL8139 on QEMU: The MMIOs are 128 bytes long and Linux put them on the same page. So, it is a known limitation that we can't deal with [1] > > Considering that a rangeset really is a compressed representation of a > bitmap, I wonder whether this data structure is suitable at all for what > you want to express. You have two pieces of information to carry / manage, > after all: Which ranges need mapping, and what their GFN <-> MFN > relationship is. Maybe the latter needs expressing differently in the > first place? I proposed a list which can be extended to hold all the required information there, e.g. MFN, GFN, size etc. > And then in a way that's ensuring by its organization that > no conflicting GFN <-> MFN mappings will be possible? If you mean the use-case above with different device MMIOs living in the same page then my understanding is that such a use-case is not supported [1] > Isn't this > precisely what is already getting recorded in the P2M? > > I'm also curious what your plan is to deal with BARs overlapping in MFN > space: In such a case, the guest cannot independently change the GFNs of > any of the involved BARs. (Same the other way around: overlaps in GFN > space are only permitted when the same overlap exists in MFN space.) Are > you excluding (forbidding) this case? If so, did I miss you saying so > somewhere? Again [1] > Yet if no overlaps are allowed in the first place, what > modify_bars() does would be far more complicated than necessary in the > DomU case, so it may be worthwhile considering to deviate more from how > Dom0 gets taken care of. In the end a guest writing a BAR is merely a > request to change its P2M. That's very different from Dom0 writing a BAR, > which means the physical BAR also changes, and hence the P2M changes in > quite different a way. So, what is the difference then besides hwdom really writes to a BAR? To me most of the logic remains the same: we need to map/unmap. The only difference I see here is that for Dom0 we have 1:1 at the moment and for guest we need GFN <-> MFN. Anyways, I am open to any decision on what would be the right approach here: 1. Use range sets per BAR as in the patch 2. Remove range sets completely and have a per-vCPU list with mapping data as I described above 3. Anything else? > > Jan Thank you, Oleksandr [1] https://wiki.xenproject.org/wiki/Xen_PCI_Passthrough#I_get_.22non-page-aligned_MMIO_BAR.22_error_when_trying_to_start_the_guest
On 09.09.2021 11:12, Oleksandr Andrushchenko wrote: > Anyways, I am open to any decision on what would be the right approach here: > > 1. Use range sets per BAR as in the patch > > 2. Remove range sets completely and have a per-vCPU list with mapping > > data as I described above > > 3. Anything else? A decision first requires a proposal. I think 3 is the way to investigate first: Rather than starting from the code we currently have, start from what you need for DomU-s to work. If there's enough overlap with how we handle Dom0, code can be shared. If things are sufficiently different, separate code paths are likely better. As said - to me a guest altering a BAR is merely a very special form of a request to change its P2M. The M parts remains unchanged (which is the major difference from Dom0), while the P part changes. As long as you can assume no two BARs to share a page, this would appear to suggest that it's simply a P2M operation plus book keeping at the vPCI layer. Completely different from Dom0 handling. All of this applies only with memory decoding enabled, I expect. Disabling memory decoding on a device ought to be a simple "unmap all BARs", while enabling is "map all BARs". Which again is, due to the assumed lack of sharing of pages, much simpler than on Dom0: You only need to subtract the MSI-X table range(s) (if any, and perhaps not necessary when unmapping, as there's nothing wrong to unmap a P2M slot which wasn't mapped); this may not even require any rangeset at all to represent. And in fact I wonder whether for DomU-s you want to support BAR changes in the first place while memory decoding is enabled. Depends much on how quirky the guest OSes are that ought to run on top. Jan
On 09.09.21 12:39, Jan Beulich wrote: > On 09.09.2021 11:12, Oleksandr Andrushchenko wrote: >> Anyways, I am open to any decision on what would be the right approach here: >> >> 1. Use range sets per BAR as in the patch >> >> 2. Remove range sets completely and have a per-vCPU list with mapping >> >> data as I described above >> >> 3. Anything else? > A decision first requires a proposal. I already have 2: one in the patch with the range set per BAR and one described earlier in the thread with a single range set and a list for GFN <-> MFN. If you can tell your opinion I am all ears. But, please be specific as common words don't change anything to me. At the same time I do understand that the current code is not set in stone, but we should have a good reason for major changes to it, IMO. I mean that before DomU's we were fine with the range sets etc, and now we are not: so what has changed so much? > I think 3 is the way to investigate > first: Rather than starting from the code we currently have, start from > what you need for DomU-s to work. If there's enough overlap with how we > handle Dom0, code can be shared. You can see that in my patch the same code is used by both hwdom and guest. What else needs to be proven? The patch shows that all the code besides guest register handlers (which is expected) is all common. > If things are sufficiently different, > separate code paths are likely better. As said - to me a guest altering a > BAR is merely a very special form of a request to change its P2M. The M > parts remains unchanged (which is the major difference from Dom0), while > the P part changes. As long as you can assume no two BARs to share a page, > this would appear to suggest that it's simply a P2M operation plus book > keeping at the vPCI layer. Completely different from Dom0 handling. Underneath, yes, possibly. But at the level vPCI operates there is no such difference I can clearly see in vPCI code and the patch in question. Please point me to the vPCI code I fail to see. > > All of this applies only with memory decoding enabled, I expect. > Disabling memory decoding on a device ought to be a simple "unmap all > BARs", while enabling is "map all BARs". Which again is, due to the > assumed lack of sharing of pages, much simpler than on Dom0: You only > need to subtract the MSI-X table range(s) (if any, and perhaps not > necessary when unmapping, as there's nothing wrong to unmap a P2M slot > which wasn't mapped); this may not even require any rangeset at all to > represent. > > And in fact I wonder whether for DomU-s you want to support BAR changes > in the first place while memory decoding is enabled. No, why? I want to keep the existing logic, e.g. with memory decoding disabled as it is now. > Depends much on > how quirky the guest OSes are that ought to run on top. > > Jan > Thank you, Oleksandr
On 09.09.2021 12:03, Oleksandr Andrushchenko wrote: > On 09.09.21 12:39, Jan Beulich wrote: >> On 09.09.2021 11:12, Oleksandr Andrushchenko wrote: >>> Anyways, I am open to any decision on what would be the right approach here: >>> >>> 1. Use range sets per BAR as in the patch >>> >>> 2. Remove range sets completely and have a per-vCPU list with mapping >>> >>> data as I described above >>> >>> 3. Anything else? >> A decision first requires a proposal. > > I already have 2: one in the patch with the range set per BAR and one described > earlier in the thread with a single range set and a list for GFN <-> MFN. > If you can tell your opinion I am all ears. But, please be specific as common words > don't change anything to me. > At the same time I do understand that the current code is not set in stone, > but we should have a good reason for major changes to it, IMO. And I view your change, as proposed, as a major one. You turn the logic all over imo. > I mean that before > DomU's we were fine with the range sets etc, and now we are not: > so what has changed so much? Nothing has changed. I'm not advocating for removal of the rangeset use in handling Dom0's needs. I'm suggesting that their use might not be a good fit for DomU. >> I think 3 is the way to investigate >> first: Rather than starting from the code we currently have, start from >> what you need for DomU-s to work. If there's enough overlap with how we >> handle Dom0, code can be shared. > > You can see that in my patch the same code is used by both hwdom and > guest. What else needs to be proven? The patch shows that all the code > besides guest register handlers (which is expected) is all common. The complexity of dealing with Dom0 has increased. I've outlined the process that I think should be followed: First determine what DomU needs. Then see how much of this actually fits the existing code (handling Dom0). Then decide whether altering Dom0 handling is actually worth it, compared to handling DomU separately. In fact handling it separately first may have its own benefits, like easing review and reducing the risk of breaking Dom0 handling. If then there are enough similarities, in a 2nd step both may want folding. >> All of this applies only with memory decoding enabled, I expect. >> Disabling memory decoding on a device ought to be a simple "unmap all >> BARs", while enabling is "map all BARs". Which again is, due to the >> assumed lack of sharing of pages, much simpler than on Dom0: You only >> need to subtract the MSI-X table range(s) (if any, and perhaps not >> necessary when unmapping, as there's nothing wrong to unmap a P2M slot >> which wasn't mapped); this may not even require any rangeset at all to >> represent. >> >> And in fact I wonder whether for DomU-s you want to support BAR changes >> in the first place while memory decoding is enabled. > > No, why? I want to keep the existing logic, e.g. with memory decoding > disabled as it is now. Afaict existing code deals with both cases. What I was putting under question is whether DomU handling code also needs to. Jan
On 09.09.21 13:46, Jan Beulich wrote: > On 09.09.2021 12:03, Oleksandr Andrushchenko wrote: >> On 09.09.21 12:39, Jan Beulich wrote: >>> On 09.09.2021 11:12, Oleksandr Andrushchenko wrote: >>>> Anyways, I am open to any decision on what would be the right approach here: >>>> >>>> 1. Use range sets per BAR as in the patch >>>> >>>> 2. Remove range sets completely and have a per-vCPU list with mapping >>>> >>>> data as I described above >>>> >>>> 3. Anything else? >>> A decision first requires a proposal. >> I already have 2: one in the patch with the range set per BAR and one described >> earlier in the thread with a single range set and a list for GFN <-> MFN. >> If you can tell your opinion I am all ears. But, please be specific as common words >> don't change anything to me. >> At the same time I do understand that the current code is not set in stone, >> but we should have a good reason for major changes to it, IMO. > And I view your change, as proposed, as a major one. You turn the logic all > over imo. > >> I mean that before >> DomU's we were fine with the range sets etc, and now we are not: >> so what has changed so much? > Nothing has changed. I'm not advocating for removal of the rangeset use in > handling Dom0's needs. I'm suggesting that their use might not be a good > fit for DomU. The proposed change makes the same code work for both Dom0 and DomU. So, instead of having the common code as it proposed do you suggest to invent something special for DomU (making the same job as we already do for Dom0) and then see if we can then combine the both to have the code common again? I am saying that the code is already common even if you think that for DomU it can be simpler (I can't still see in which way as p2m and other things are not directly touched by the vPCI code, e.g. both Dom0 and DomU use {map|unmap}_mmio_regions and the only difference is that for Dom0 we have MFN == GFN and for DomU it's not). So, even if ranges sets are not good for DomUs (I can't see why), but if they help have the code common I think it is worth having them. > >>> I think 3 is the way to investigate >>> first: Rather than starting from the code we currently have, start from >>> what you need for DomU-s to work. If there's enough overlap with how we >>> handle Dom0, code can be shared. >> You can see that in my patch the same code is used by both hwdom and >> guest. What else needs to be proven? The patch shows that all the code >> besides guest register handlers (which is expected) is all common. > The complexity of dealing with Dom0 has increased. I've outlined the > process that I think should be followed: First determine what DomU needs. It is already known, GFN <-> MFN non-identity mappings > Then see how much of this actually fits the existing code (handling Dom0). It is already in the patch: we have all code common for both Dom0 and DomU > Then decide whether altering Dom0 handling is actually worth it, > compared to handling DomU separately. It leads to the same functionality implemented twice > In fact handling it separately > first may have its own benefits, like easing review and reducing the risk > of breaking Dom0 handling. If then there are enough similarities, in a > 2nd step both may want folding. You can see from the patch if we have "if ( hwdom )" spread over the implementation. I guess you won't find that (besides guest register handlers which is expected). > >>> All of this applies only with memory decoding enabled, I expect. >>> Disabling memory decoding on a device ought to be a simple "unmap all >>> BARs", while enabling is "map all BARs". Which again is, due to the >>> assumed lack of sharing of pages, much simpler than on Dom0: You only >>> need to subtract the MSI-X table range(s) (if any, and perhaps not >>> necessary when unmapping, as there's nothing wrong to unmap a P2M slot >>> which wasn't mapped); this may not even require any rangeset at all to >>> represent. >>> >>> And in fact I wonder whether for DomU-s you want to support BAR changes >>> in the first place while memory decoding is enabled. >> No, why? I want to keep the existing logic, e.g. with memory decoding >> disabled as it is now. > Afaict existing code deals with both cases. Hm, I thought that we only map/unmap with memory decoding disabled. For my education: what happens if you unmap with decoding enabled and domain accesses the MMIOs? > What I was putting under > question is whether DomU handling code also needs to. > > Jan > Thank you, Oleksandr
On 09.09.2021 13:30, Oleksandr Andrushchenko wrote: > On 09.09.21 13:46, Jan Beulich wrote: >> On 09.09.2021 12:03, Oleksandr Andrushchenko wrote: >>> On 09.09.21 12:39, Jan Beulich wrote: >>>> And in fact I wonder whether for DomU-s you want to support BAR changes >>>> in the first place while memory decoding is enabled. >>> No, why? I want to keep the existing logic, e.g. with memory decoding >>> disabled as it is now. >> Afaict existing code deals with both cases. > > Hm, I thought that we only map/unmap with memory decoding disabled. > For my education: what happens if you unmap with decoding enabled and > domain accesses the MMIOs? That would depend on the precise timing; it's certainly not well defined. But supporting this may be needed for quirky OSes, as said before, as they may get away with that on real hardware if they avoid actual accesses at the time of the BAR change. Jan
© 2016 - 2024 Red Hat, Inc.