From: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
Emulate guest BAR register values: this allows creating a guest view
of the registers and emulates size and properties probe as it is done
during PCI device enumeration by the guest.
ROM BAR is only handled for the hardware domain and for guest domains
there is a stub: at the moment PCI expansion ROM is x86 only, so it
might not be used by other architectures without emulating x86. Other
use-cases may include using that expansion ROM before Xen boots, hence
no emulation is needed in Xen itself. Or when a guest wants to use the
ROM code which seems to be rare.
Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com>
---
xen/drivers/vpci/header.c | 69 +++++++++++++++++++++++++++++++++++++-
xen/include/xen/pci_regs.h | 1 +
xen/include/xen/vpci.h | 3 ++
3 files changed, 72 insertions(+), 1 deletion(-)
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 5218b1af247e..793f79ece831 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
uint32_t val, void *data)
{
+ struct vpci_bar *bar = data;
+ bool hi = false;
+
+ if ( bar->type == VPCI_BAR_MEM64_HI )
+ {
+ ASSERT(reg > PCI_BASE_ADDRESS_0);
+ bar--;
+ hi = true;
+ }
+ else
+ val &= PCI_BASE_ADDRESS_MEM_MASK;
+ bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
+ bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
}
static uint32_t guest_bar_read(const struct pci_dev *pdev, unsigned int reg,
void *data)
{
- return 0xffffffff;
+ struct vpci_bar *bar = data;
+ uint32_t val;
+ bool hi = false;
+
+ switch ( bar->type )
+ {
+ case VPCI_BAR_MEM64_HI:
+ ASSERT(reg > PCI_BASE_ADDRESS_0);
+ bar--;
+ hi = true;
+ /* fallthrough */
+ case VPCI_BAR_MEM64_LO:
+ {
+ if ( hi )
+ val = bar->guest_addr >> 32;
+ else
+ val = bar->guest_addr & 0xffffffff;
+ if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
+ {
+ /* Guests detects BAR's properties and sizes. */
+ if ( hi )
+ val = bar->size >> 32;
+ else
+ val = 0xffffffff & ~(bar->size - 1);
+ }
+ if ( !hi )
+ {
+ val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+ val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
+ }
+ bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
+ bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
+ break;
+ }
+ case VPCI_BAR_MEM32:
+ {
+ val = bar->guest_addr;
+ if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
+ val = 0xffffffff & ~(bar->size - 1);
+ val |= PCI_BASE_ADDRESS_MEM_TYPE_32;
+ val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
+ break;
+ }
+ default:
+ val = bar->guest_addr;
+ break;
+ }
+ return val;
}
static void rom_write(const struct pci_dev *pdev, unsigned int reg,
@@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
if ( rc )
return rc;
}
+ /*
+ * It is neither safe nor secure to initialize guest's view of the BARs
+ * with real values which are used by the hardware domain, so assign
+ * all zeros to guest's view of the BARs, so the guest can perform
+ * proper PCI device enumeration and assign BARs on its own.
+ */
+ bars[i].guest_addr = 0;
}
return 0;
}
diff --git a/xen/include/xen/pci_regs.h b/xen/include/xen/pci_regs.h
index cc4ee3b83e5c..038eb18c5357 100644
--- a/xen/include/xen/pci_regs.h
+++ b/xen/include/xen/pci_regs.h
@@ -103,6 +103,7 @@
#define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
#define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
#define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL)
+#define PCI_BASE_ADDRESS_MEM_MASK_32 (~0x0fU)
#define PCI_BASE_ADDRESS_IO_MASK (~0x03UL)
/* bit 1 is reserved if address_space = 1 */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 4aa2941a1081..db86b8e7fa3c 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -77,7 +77,10 @@ struct vpci {
struct vpci_header {
/* Information about the PCI BARs of this device. */
struct vpci_bar {
+ /* Physical view of the BAR. */
uint64_t addr;
+ /* Guest view of the BAR. */
+ uint64_t guest_addr;
uint64_t size;
enum {
VPCI_BAR_EMPTY,
--
2.25.1
On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
> --- a/xen/drivers/vpci/header.c
> +++ b/xen/drivers/vpci/header.c
> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
> uint32_t val, void *data)
> {
> + struct vpci_bar *bar = data;
> + bool hi = false;
> +
> + if ( bar->type == VPCI_BAR_MEM64_HI )
> + {
> + ASSERT(reg > PCI_BASE_ADDRESS_0);
> + bar--;
> + hi = true;
> + }
> + else
> + val &= PCI_BASE_ADDRESS_MEM_MASK;
> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
What you store here is not the address that's going to be used, as
you don't mask off the low bits (to account for the BAR's size).
When a BAR gets written with all ones, all writable bits get these
ones stored. The address of the BAR, aiui, really changes to
(typically) close below 4Gb (in the case of a 32-bit BAR), which
is why memory / I/O decoding should be off while sizing BARs.
Therefore you shouldn't look for the specific "all writable bits
are ones" pattern (or worse, as you presently do, the "all bits
outside of the type specifier are ones" one) on the read path.
Instead mask the value appropriately here, and simply return back
the stored value from the read path.
> }
>
> static uint32_t guest_bar_read(const struct pci_dev *pdev, unsigned int reg,
> void *data)
> {
> - return 0xffffffff;
> + struct vpci_bar *bar = data;
> + uint32_t val;
> + bool hi = false;
> +
> + switch ( bar->type )
> + {
> + case VPCI_BAR_MEM64_HI:
> + ASSERT(reg > PCI_BASE_ADDRESS_0);
> + bar--;
> + hi = true;
> + /* fallthrough */
> + case VPCI_BAR_MEM64_LO:
> + {
Please don't add braces to case blocks when they're not needed.
> + if ( hi )
> + val = bar->guest_addr >> 32;
> + else
> + val = bar->guest_addr & 0xffffffff;
> + if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
This is wrong when falling through to here from VPCI_BAR_MEM64_HI:
All 32 bits need to be looked at. Yet as per the comment further
up I think it isn't right anyway to apply the mask here.
Also: Stray double blanks.
> + {
> + /* Guests detects BAR's properties and sizes. */
> + if ( hi )
> + val = bar->size >> 32;
> + else
> + val = 0xffffffff & ~(bar->size - 1);
> + }
> + if ( !hi )
> + {
> + val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
> + val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
> + }
> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
> + break;
> + }
> + case VPCI_BAR_MEM32:
Please separate non-fall-through case blocks by a blank line.
> @@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
> if ( rc )
> return rc;
> }
> + /*
> + * It is neither safe nor secure to initialize guest's view of the BARs
> + * with real values which are used by the hardware domain, so assign
> + * all zeros to guest's view of the BARs, so the guest can perform
> + * proper PCI device enumeration and assign BARs on its own.
> + */
> + bars[i].guest_addr = 0;
I'm afraid I don't understand the comment: Without memory decoding
enabled, the BARs are simple registers (with a few r/o bits).
> --- a/xen/include/xen/pci_regs.h
> +++ b/xen/include/xen/pci_regs.h
> @@ -103,6 +103,7 @@
> #define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
> #define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
> #define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL)
> +#define PCI_BASE_ADDRESS_MEM_MASK_32 (~0x0fU)
Please don't introduce an identical constant that's merely of
different type. (uint32_t)PCI_BASE_ADDRESS_MEM_MASK at the use
site (if actually still needed as per the comment above) would
seem more clear to me.
Jan
On 06.09.21 17:31, Jan Beulich wrote:
> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>> --- a/xen/drivers/vpci/header.c
>> +++ b/xen/drivers/vpci/header.c
>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>> uint32_t val, void *data)
>> {
>> + struct vpci_bar *bar = data;
>> + bool hi = false;
>> +
>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>> + {
>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>> + bar--;
>> + hi = true;
>> + }
>> + else
>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
> What you store here is not the address that's going to be used,
bar->guest_addr is never used directly to be reported to a guest.
The same as bar->addr is never used to write to real BAR.
It is always used as an initial value which is then modified to reflect
lower bits, e.g. BAR type and if prefetchable, so I think this is perfectly
fine to have it this way.
> as
> you don't mask off the low bits (to account for the BAR's size).
> When a BAR gets written with all ones, all writable bits get these
> ones stored. The address of the BAR, aiui, really changes to
> (typically) close below 4Gb (in the case of a 32-bit BAR), which
> is why memory / I/O decoding should be off while sizing BARs.
> Therefore you shouldn't look for the specific "all writable bits
> are ones" pattern (or worse, as you presently do, the "all bits
> outside of the type specifier are ones" one) on the read path.
> Instead mask the value appropriately here, and simply return back
> the stored value from the read path.
"PCI LOCAL BUS SPECIFICATION, REV. 3.0", "IMPLEMENTATION NOTE
Sizing a 32-bit Base Address Register Example" says, that
"Software saves the original value of the Base Address register, writes
0 FFFF FFFFh to the register, then reads it back."
The same applies for 64-bit BARs. So what's wrong if I try to catch such
a write when a guest tries to size the BAR? The only difference is that
I compare as
if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
which is because val in the question has lower bits cleared.
With that respect I see no obvious reason why we can't construct our code
as it is.
>
>> }
>>
>> static uint32_t guest_bar_read(const struct pci_dev *pdev, unsigned int reg,
>> void *data)
>> {
>> - return 0xffffffff;
>> + struct vpci_bar *bar = data;
>> + uint32_t val;
>> + bool hi = false;
>> +
>> + switch ( bar->type )
>> + {
>> + case VPCI_BAR_MEM64_HI:
>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>> + bar--;
>> + hi = true;
>> + /* fallthrough */
>> + case VPCI_BAR_MEM64_LO:
>> + {
> Please don't add braces to case blocks when they're not needed.
Sure
>
>> + if ( hi )
>> + val = bar->guest_addr >> 32;
>> + else
>> + val = bar->guest_addr & 0xffffffff;
>> + if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
> This is wrong when falling through to here from VPCI_BAR_MEM64_HI:
> All 32 bits need to be looked at.
Good catch, will fix
> Yet as per the comment further
> up I think it isn't right anyway to apply the mask here.
>
> Also: Stray double blanks.
>
>> + {
>> + /* Guests detects BAR's properties and sizes. */
>> + if ( hi )
>> + val = bar->size >> 32;
>> + else
>> + val = 0xffffffff & ~(bar->size - 1);
>> + }
>> + if ( !hi )
>> + {
>> + val |= PCI_BASE_ADDRESS_MEM_TYPE_64;
>> + val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>> + }
>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>> + break;
>> + }
>> + case VPCI_BAR_MEM32:
> Please separate non-fall-through case blocks by a blank line.
Will do
>
>> @@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
>> if ( rc )
>> return rc;
>> }
>> + /*
>> + * It is neither safe nor secure to initialize guest's view of the BARs
>> + * with real values which are used by the hardware domain, so assign
>> + * all zeros to guest's view of the BARs, so the guest can perform
>> + * proper PCI device enumeration and assign BARs on its own.
>> + */
>> + bars[i].guest_addr = 0;
> I'm afraid I don't understand the comment: Without memory decoding
> enabled, the BARs are simple registers (with a few r/o bits).
My first implementation was that bar->guest_addr was initialized with
the value of bar->addr (physical BAR value), but talking on IRC with
Roger he suggested that this might be a security issue to let guest
a hint about physical values, so then I changed the assignment to be 0.
Thus the comment
>
>> --- a/xen/include/xen/pci_regs.h
>> +++ b/xen/include/xen/pci_regs.h
>> @@ -103,6 +103,7 @@
>> #define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
>> #define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
>> #define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL)
>> +#define PCI_BASE_ADDRESS_MEM_MASK_32 (~0x0fU)
> Please don't introduce an identical constant that's merely of
> different type. (uint32_t)PCI_BASE_ADDRESS_MEM_MASK at the use
> site (if actually still needed as per the comment above) would
> seem more clear to me.
Ok, I thought type casting is a bigger evil here
>
> Jan
Thank you,
Oleksandr
On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
> On 06.09.21 17:31, Jan Beulich wrote:
>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>> --- a/xen/drivers/vpci/header.c
>>> +++ b/xen/drivers/vpci/header.c
>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>> uint32_t val, void *data)
>>> {
>>> + struct vpci_bar *bar = data;
>>> + bool hi = false;
>>> +
>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>> + {
>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>> + bar--;
>>> + hi = true;
>>> + }
>>> + else
>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>> What you store here is not the address that's going to be used,
>
> bar->guest_addr is never used directly to be reported to a guest.
And this is what I question, as an approach. Your model _may_ work,
but its needlessly deviating from how people like me would expect
this to work. And if it's intended to be this way, how would I
have known?
> It is always used as an initial value which is then modified to reflect
> lower bits, e.g. BAR type and if prefetchable, so I think this is perfectly
> fine to have it this way.
And it is also perfectly fine to store the value to be handed
back to guests on the next read. Keeps the read path simple,
which I think can be assumed to be taken more frequently than
the write one. Plus stored values reflect reality.
Plus - if what you say was really the case, why do you mask off
PCI_BASE_ADDRESS_MEM_MASK here? You should then simply store
the written value and do _all_ the processing in the read path.
No point having partial logic in two places.
>> as
>> you don't mask off the low bits (to account for the BAR's size).
>> When a BAR gets written with all ones, all writable bits get these
>> ones stored. The address of the BAR, aiui, really changes to
>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>> is why memory / I/O decoding should be off while sizing BARs.
>> Therefore you shouldn't look for the specific "all writable bits
>> are ones" pattern (or worse, as you presently do, the "all bits
>> outside of the type specifier are ones" one) on the read path.
>> Instead mask the value appropriately here, and simply return back
>> the stored value from the read path.
> "PCI LOCAL BUS SPECIFICATION, REV. 3.0", "IMPLEMENTATION NOTE
>
> Sizing a 32-bit Base Address Register Example" says, that
>
> "Software saves the original value of the Base Address register, writes
> 0 FFFF FFFFh to the register, then reads it back."
>
> The same applies for 64-bit BARs. So what's wrong if I try to catch such
> a write when a guest tries to size the BAR? The only difference is that
> I compare as
> if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
> which is because val in the question has lower bits cleared.
Because while this matches what the spec says, it's not enough to
match how hardware behaves. Yet you want to mimic hardware behavior
as closely as possible here. There is (iirc) at least one other
place in the source tree were we had to adjust a similar initial
implementation to be closer to one expected by guests, no matter
that they may not be following the spec to the letter. Don't
forget that there may be bugs in kernels which don't surface until
the kernel gets run on an overly simplistic emulation.
>>> @@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
>>> if ( rc )
>>> return rc;
>>> }
>>> + /*
>>> + * It is neither safe nor secure to initialize guest's view of the BARs
>>> + * with real values which are used by the hardware domain, so assign
>>> + * all zeros to guest's view of the BARs, so the guest can perform
>>> + * proper PCI device enumeration and assign BARs on its own.
>>> + */
>>> + bars[i].guest_addr = 0;
>> I'm afraid I don't understand the comment: Without memory decoding
>> enabled, the BARs are simple registers (with a few r/o bits).
>
> My first implementation was that bar->guest_addr was initialized with
> the value of bar->addr (physical BAR value), but talking on IRC with
> Roger he suggested that this might be a security issue to let guest
> a hint about physical values, so then I changed the assignment to be 0.
Well, yes, that's certainly correct. It's perhaps too unobvious to me
why one may want to use the host value here in the first place. It
simply has no meaning here.
>>> --- a/xen/include/xen/pci_regs.h
>>> +++ b/xen/include/xen/pci_regs.h
>>> @@ -103,6 +103,7 @@
>>> #define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
>>> #define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
>>> #define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL)
>>> +#define PCI_BASE_ADDRESS_MEM_MASK_32 (~0x0fU)
>> Please don't introduce an identical constant that's merely of
>> different type. (uint32_t)PCI_BASE_ADDRESS_MEM_MASK at the use
>> site (if actually still needed as per the comment above) would
>> seem more clear to me.
> Ok, I thought type casting is a bigger evil here
Often it is, but imo here it is not. I hope you realize that ~0x0fU
if not necessarily 0xfffffff0? We make certain assumptions on type
widths. For unsigned int we assume it to be at least 32 bits wide.
We should avoid assumptions of it being exactly 32 bits wide. Just
like we cannot (more obviously) assume the width of unsigned long.
(Which tells us that for 32-bit arches PCI_BASE_ADDRESS_MEM_MASK is
actually of the wrong type. This constant should be the same no
matter the bitness.)
Jan
On 07.09.21 19:30, Jan Beulich wrote:
> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>> On 06.09.21 17:31, Jan Beulich wrote:
>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>> --- a/xen/drivers/vpci/header.c
>>>> +++ b/xen/drivers/vpci/header.c
>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>> uint32_t val, void *data)
>>>> {
>>>> + struct vpci_bar *bar = data;
>>>> + bool hi = false;
>>>> +
>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>> + {
>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>> + bar--;
>>>> + hi = true;
>>>> + }
>>>> + else
>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>> What you store here is not the address that's going to be used,
>> bar->guest_addr is never used directly to be reported to a guest.
> And this is what I question, as an approach. Your model _may_ work,
> but its needlessly deviating from how people like me would expect
> this to work. And if it's intended to be this way, how would I
> have known?
Well, I just tried to follow the model we already have in the existing code...
>
>> It is always used as an initial value which is then modified to reflect
>> lower bits, e.g. BAR type and if prefetchable, so I think this is perfectly
>> fine to have it this way.
> And it is also perfectly fine to store the value to be handed
> back to guests on the next read. Keeps the read path simple,
> which I think can be assumed to be taken more frequently than
> the write one. Plus stored values reflect reality.
>
> Plus - if what you say was really the case, why do you mask off
> PCI_BASE_ADDRESS_MEM_MASK here? You should then simply store
> the written value and do _all_ the processing in the read path.
> No point having partial logic in two places.
I will move the logic to the write handler, indeed we can save some
CPU cycles here
>
>>> as
>>> you don't mask off the low bits (to account for the BAR's size).
>>> When a BAR gets written with all ones, all writable bits get these
>>> ones stored. The address of the BAR, aiui, really changes to
>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>> is why memory / I/O decoding should be off while sizing BARs.
>>> Therefore you shouldn't look for the specific "all writable bits
>>> are ones" pattern (or worse, as you presently do, the "all bits
>>> outside of the type specifier are ones" one) on the read path.
>>> Instead mask the value appropriately here, and simply return back
>>> the stored value from the read path.
>> "PCI LOCAL BUS SPECIFICATION, REV. 3.0", "IMPLEMENTATION NOTE
>>
>> Sizing a 32-bit Base Address Register Example" says, that
>>
>> "Software saves the original value of the Base Address register, writes
>> 0 FFFF FFFFh to the register, then reads it back."
>>
>> The same applies for 64-bit BARs. So what's wrong if I try to catch such
>> a write when a guest tries to size the BAR? The only difference is that
>> I compare as
>> if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
>> which is because val in the question has lower bits cleared.
> Because while this matches what the spec says, it's not enough to
> match how hardware behaves.
But we should implement as the spec says, not like buggy hardware behaves
> Yet you want to mimic hardware behavior
> as closely as possible here. There is (iirc) at least one other
> place in the source tree were we had to adjust a similar initial
> implementation to be closer to one expected by guests,
Could you please point me to that code so I can consult and possibly
re-use the approach?
> no matter
> that they may not be following the spec to the letter. Don't
> forget that there may be bugs in kernels which don't surface until
> the kernel gets run on an overly simplistic emulation.
This is sad. And the kernel needs to be fixed then, not Xen
>
>>>> @@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
>>>> if ( rc )
>>>> return rc;
>>>> }
>>>> + /*
>>>> + * It is neither safe nor secure to initialize guest's view of the BARs
>>>> + * with real values which are used by the hardware domain, so assign
>>>> + * all zeros to guest's view of the BARs, so the guest can perform
>>>> + * proper PCI device enumeration and assign BARs on its own.
>>>> + */
>>>> + bars[i].guest_addr = 0;
>>> I'm afraid I don't understand the comment: Without memory decoding
>>> enabled, the BARs are simple registers (with a few r/o bits).
>> My first implementation was that bar->guest_addr was initialized with
>> the value of bar->addr (physical BAR value), but talking on IRC with
>> Roger he suggested that this might be a security issue to let guest
>> a hint about physical values, so then I changed the assignment to be 0.
> Well, yes, that's certainly correct. It's perhaps too unobvious to me
> why one may want to use the host value here in the first place. It
> simply has no meaning here.
Do you want me to remove the comment?
>
>>>> --- a/xen/include/xen/pci_regs.h
>>>> +++ b/xen/include/xen/pci_regs.h
>>>> @@ -103,6 +103,7 @@
>>>> #define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */
>>>> #define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? */
>>>> #define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL)
>>>> +#define PCI_BASE_ADDRESS_MEM_MASK_32 (~0x0fU)
>>> Please don't introduce an identical constant that's merely of
>>> different type. (uint32_t)PCI_BASE_ADDRESS_MEM_MASK at the use
>>> site (if actually still needed as per the comment above) would
>>> seem more clear to me.
>> Ok, I thought type casting is a bigger evil here
> Often it is, but imo here it is not. I hope you realize that ~0x0fU
> if not necessarily 0xfffffff0? We make certain assumptions on type
> widths. For unsigned int we assume it to be at least 32 bits wide.
> We should avoid assumptions of it being exactly 32 bits wide. Just
> like we cannot (more obviously) assume the width of unsigned long.
> (Which tells us that for 32-bit arches PCI_BASE_ADDRESS_MEM_MASK is
> actually of the wrong type. This constant should be the same no
> matter the bitness.)
Ok, I will not introduce a new define and use (uint32_t)
>
> Jan
>
Thank you,
Oleksandr
On 07.09.2021 19:39, Oleksandr Andrushchenko wrote:
> On 07.09.21 19:30, Jan Beulich wrote:
>> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>> On 06.09.21 17:31, Jan Beulich wrote:
>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>>> --- a/xen/drivers/vpci/header.c
>>>>> +++ b/xen/drivers/vpci/header.c
>>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>> uint32_t val, void *data)
>>>>> {
>>>>> + struct vpci_bar *bar = data;
>>>>> + bool hi = false;
>>>>> +
>>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>>> + {
>>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>> + bar--;
>>>>> + hi = true;
>>>>> + }
>>>>> + else
>>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>>> What you store here is not the address that's going to be used,
>>>> as
>>>> you don't mask off the low bits (to account for the BAR's size).
>>>> When a BAR gets written with all ones, all writable bits get these
>>>> ones stored. The address of the BAR, aiui, really changes to
>>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>>> is why memory / I/O decoding should be off while sizing BARs.
>>>> Therefore you shouldn't look for the specific "all writable bits
>>>> are ones" pattern (or worse, as you presently do, the "all bits
>>>> outside of the type specifier are ones" one) on the read path.
>>>> Instead mask the value appropriately here, and simply return back
>>>> the stored value from the read path.
>>> "PCI LOCAL BUS SPECIFICATION, REV. 3.0", "IMPLEMENTATION NOTE
>>>
>>> Sizing a 32-bit Base Address Register Example" says, that
>>>
>>> "Software saves the original value of the Base Address register, writes
>>> 0 FFFF FFFFh to the register, then reads it back."
>>>
>>> The same applies for 64-bit BARs. So what's wrong if I try to catch such
>>> a write when a guest tries to size the BAR? The only difference is that
>>> I compare as
>>> if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
>>> which is because val in the question has lower bits cleared.
>> Because while this matches what the spec says, it's not enough to
>> match how hardware behaves.
> But we should implement as the spec says, not like buggy hardware behaves
The behavior I'm describing doesn't violate the spec. There merely is
more to it than what the spec says, or one might also view it the way
that the spec doesn't use the best way of expressing things.
>> Yet you want to mimic hardware behavior
>> as closely as possible here. There is (iirc) at least one other
>> place in the source tree were we had to adjust a similar initial
>> implementation to be closer to one expected by guests,
>
> Could you please point me to that code so I can consult and possibly
> re-use the approach?
I only recall the fact; this might have been hvmloader, vpci, or yet
somewhere else. I'm sorry.
>>>>> @@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
>>>>> if ( rc )
>>>>> return rc;
>>>>> }
>>>>> + /*
>>>>> + * It is neither safe nor secure to initialize guest's view of the BARs
>>>>> + * with real values which are used by the hardware domain, so assign
>>>>> + * all zeros to guest's view of the BARs, so the guest can perform
>>>>> + * proper PCI device enumeration and assign BARs on its own.
>>>>> + */
>>>>> + bars[i].guest_addr = 0;
>>>> I'm afraid I don't understand the comment: Without memory decoding
>>>> enabled, the BARs are simple registers (with a few r/o bits).
>>> My first implementation was that bar->guest_addr was initialized with
>>> the value of bar->addr (physical BAR value), but talking on IRC with
>>> Roger he suggested that this might be a security issue to let guest
>>> a hint about physical values, so then I changed the assignment to be 0.
>> Well, yes, that's certainly correct. It's perhaps too unobvious to me
>> why one may want to use the host value here in the first place. It
>> simply has no meaning here.
> Do you want me to remove the comment?
Yes. I wonder whether the assignment is necessary in the first place:
I'd somehow expect the structure to come from xzalloc(). Albeit I
guess this function can be run through more than once without freeing
the structure and then allocating is anew?
Jan
On 08.09.21 12:27, Jan Beulich wrote:
> On 07.09.2021 19:39, Oleksandr Andrushchenko wrote:
>> On 07.09.21 19:30, Jan Beulich wrote:
>>> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>>> On 06.09.21 17:31, Jan Beulich wrote:
>>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>> uint32_t val, void *data)
>>>>>> {
>>>>>> + struct vpci_bar *bar = data;
>>>>>> + bool hi = false;
>>>>>> +
>>>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>>>> + {
>>>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>>> + bar--;
>>>>>> + hi = true;
>>>>>> + }
>>>>>> + else
>>>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>> What you store here is not the address that's going to be used,
>>>>> as
>>>>> you don't mask off the low bits (to account for the BAR's size).
>>>>> When a BAR gets written with all ones, all writable bits get these
>>>>> ones stored. The address of the BAR, aiui, really changes to
>>>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>>>> is why memory / I/O decoding should be off while sizing BARs.
>>>>> Therefore you shouldn't look for the specific "all writable bits
>>>>> are ones" pattern (or worse, as you presently do, the "all bits
>>>>> outside of the type specifier are ones" one) on the read path.
>>>>> Instead mask the value appropriately here, and simply return back
>>>>> the stored value from the read path.
But in case of BAR sizing I need to actually return BAR size.
So, the comparison is the way to tell if the guest wants to read
actual (configured) BAR value or it tries to determine BAR's size.
This is why I compare and use the result as the answer to what needs
to be supplied to the guest. So, if I don't compare with 0xffffffff for the
hi part and 0xfffffff0 for the low then how do I know when to return
configured BAR or return the size?
>>>> "PCI LOCAL BUS SPECIFICATION, REV. 3.0", "IMPLEMENTATION NOTE
>>>>
>>>> Sizing a 32-bit Base Address Register Example" says, that
>>>>
>>>> "Software saves the original value of the Base Address register, writes
>>>> 0 FFFF FFFFh to the register, then reads it back."
>>>>
>>>> The same applies for 64-bit BARs. So what's wrong if I try to catch such
>>>> a write when a guest tries to size the BAR? The only difference is that
>>>> I compare as
>>>> if ( (val & PCI_BASE_ADDRESS_MEM_MASK_32) == PCI_BASE_ADDRESS_MEM_MASK_32 )
>>>> which is because val in the question has lower bits cleared.
>>> Because while this matches what the spec says, it's not enough to
>>> match how hardware behaves.
>> But we should implement as the spec says, not like buggy hardware behaves
> The behavior I'm describing doesn't violate the spec. There merely is
> more to it than what the spec says, or one might also view it the way
> that the spec doesn't use the best way of expressing things.
Well, the spec explicitly says "write 0xffffffff and read back"
I can't see any way it can be read differently
>
>>> Yet you want to mimic hardware behavior
>>> as closely as possible here. There is (iirc) at least one other
>>> place in the source tree were we had to adjust a similar initial
>>> implementation to be closer to one expected by guests,
>> Could you please point me to that code so I can consult and possibly
>> re-use the approach?
> I only recall the fact; this might have been hvmloader, vpci, or yet
> somewhere else. I'm sorry.
No problem
>
>>>>>> @@ -522,6 +582,13 @@ static int add_bar_handlers(struct pci_dev *pdev, bool is_hwdom)
>>>>>> if ( rc )
>>>>>> return rc;
>>>>>> }
>>>>>> + /*
>>>>>> + * It is neither safe nor secure to initialize guest's view of the BARs
>>>>>> + * with real values which are used by the hardware domain, so assign
>>>>>> + * all zeros to guest's view of the BARs, so the guest can perform
>>>>>> + * proper PCI device enumeration and assign BARs on its own.
>>>>>> + */
>>>>>> + bars[i].guest_addr = 0;
>>>>> I'm afraid I don't understand the comment: Without memory decoding
>>>>> enabled, the BARs are simple registers (with a few r/o bits).
>>>> My first implementation was that bar->guest_addr was initialized with
>>>> the value of bar->addr (physical BAR value), but talking on IRC with
>>>> Roger he suggested that this might be a security issue to let guest
>>>> a hint about physical values, so then I changed the assignment to be 0.
>>> Well, yes, that's certainly correct. It's perhaps too unobvious to me
>>> why one may want to use the host value here in the first place. It
>>> simply has no meaning here.
>> Do you want me to remove the comment?
> Yes. I wonder whether the assignment is necessary in the first place:
> I'd somehow expect the structure to come from xzalloc(). Albeit I
> guess this function can be run through more than once without freeing
> the structure and then allocating is anew?
I'll check that and if the structure is already zeroed then I'll remove both the
assignment and the comment
>
> Jan
>
Thank you,
Oleksandr
On 08.09.2021 11:43, Oleksandr Andrushchenko wrote:
>
> On 08.09.21 12:27, Jan Beulich wrote:
>> On 07.09.2021 19:39, Oleksandr Andrushchenko wrote:
>>> On 07.09.21 19:30, Jan Beulich wrote:
>>>> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>>>> On 06.09.21 17:31, Jan Beulich wrote:
>>>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>> uint32_t val, void *data)
>>>>>>> {
>>>>>>> + struct vpci_bar *bar = data;
>>>>>>> + bool hi = false;
>>>>>>> +
>>>>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>>>>> + {
>>>>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>>>> + bar--;
>>>>>>> + hi = true;
>>>>>>> + }
>>>>>>> + else
>>>>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>>> What you store here is not the address that's going to be used,
>>>>>> as
>>>>>> you don't mask off the low bits (to account for the BAR's size).
>>>>>> When a BAR gets written with all ones, all writable bits get these
>>>>>> ones stored. The address of the BAR, aiui, really changes to
>>>>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>>>>> is why memory / I/O decoding should be off while sizing BARs.
>>>>>> Therefore you shouldn't look for the specific "all writable bits
>>>>>> are ones" pattern (or worse, as you presently do, the "all bits
>>>>>> outside of the type specifier are ones" one) on the read path.
>>>>>> Instead mask the value appropriately here, and simply return back
>>>>>> the stored value from the read path.
>
> But in case of BAR sizing I need to actually return BAR size.
> So, the comparison is the way to tell if the guest wants to read
> actual (configured) BAR value or it tries to determine BAR's size.
> This is why I compare and use the result as the answer to what needs
> to be supplied to the guest. So, if I don't compare with 0xffffffff for the
> hi part and 0xfffffff0 for the low then how do I know when to return
> configured BAR or return the size?
Well, but that's the common misunderstanding that I've been trying
to point out: There's no difference between these two forms of
reads. The BARs are simply registers with some r/o bits. There's
no hidden 2nd register recording what was last written. When you
write 0xffffffff, all you do is set all writable bits to 1. When
you read back from the register, you will find all r/o bits
unchanged (which in particular means all lower address bits are
zero, thus allowing you to determine the size).
When the spec says to write 0xffffffff for sizing purposes, OSes
should follow that, yes. This doesn't preclude them to use other
forms of writes for whichever purpose. Hence you do not want to
special case sizing, but instead you want to emulate correctly
all forms of writes, including subsequent reads to uniformly
return the intended / expected values.
Just to give an example (perhaps a little contrived): To size a
64-bit BAR, in principle you'd first need to write 0xffffffff to
both halves. But there's nothing wrong with doing this in a
different order: Act on the low half alone first, and then act
on the high half. The acting on the high half could even be
skipped if the low half sizing produced at least bit 31 set. Now
if you were to special case seeing ffffffff:fffffff? as the
last written pair of values, you'd break that (imo legitimate)
alternative process of sizing.
Jan
On 08.09.21 13:03, Jan Beulich wrote:
> On 08.09.2021 11:43, Oleksandr Andrushchenko wrote:
>> On 08.09.21 12:27, Jan Beulich wrote:
>>> On 07.09.2021 19:39, Oleksandr Andrushchenko wrote:
>>>> On 07.09.21 19:30, Jan Beulich wrote:
>>>>> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>>>>> On 06.09.21 17:31, Jan Beulich wrote:
>>>>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>> uint32_t val, void *data)
>>>>>>>> {
>>>>>>>> + struct vpci_bar *bar = data;
>>>>>>>> + bool hi = false;
>>>>>>>> +
>>>>>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>>>>>> + {
>>>>>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>>>>> + bar--;
>>>>>>>> + hi = true;
>>>>>>>> + }
>>>>>>>> + else
>>>>>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>>>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>>>> What you store here is not the address that's going to be used,
>>>>>>> as
>>>>>>> you don't mask off the low bits (to account for the BAR's size).
>>>>>>> When a BAR gets written with all ones, all writable bits get these
>>>>>>> ones stored. The address of the BAR, aiui, really changes to
>>>>>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>>>>>> is why memory / I/O decoding should be off while sizing BARs.
>>>>>>> Therefore you shouldn't look for the specific "all writable bits
>>>>>>> are ones" pattern (or worse, as you presently do, the "all bits
>>>>>>> outside of the type specifier are ones" one) on the read path.
>>>>>>> Instead mask the value appropriately here, and simply return back
>>>>>>> the stored value from the read path.
>> But in case of BAR sizing I need to actually return BAR size.
>> So, the comparison is the way to tell if the guest wants to read
>> actual (configured) BAR value or it tries to determine BAR's size.
>> This is why I compare and use the result as the answer to what needs
>> to be supplied to the guest. So, if I don't compare with 0xffffffff for the
>> hi part and 0xfffffff0 for the low then how do I know when to return
>> configured BAR or return the size?
> Well, but that's the common misunderstanding that I've been trying
> to point out: There's no difference between these two forms of
> reads. The BARs are simply registers with some r/o bits. There's
> no hidden 2nd register recording what was last written. When you
> write 0xffffffff, all you do is set all writable bits to 1. When
> you read back from the register, you will find all r/o bits
> unchanged (which in particular means all lower address bits are
> zero, thus allowing you to determine the size).
>
> When the spec says to write 0xffffffff for sizing purposes, OSes
> should follow that, yes. This doesn't preclude them to use other
> forms of writes for whichever purpose. Hence you do not want to
> special case sizing, but instead you want to emulate correctly
> all forms of writes, including subsequent reads to uniformly
> return the intended / expected values.
>
> Just to give an example (perhaps a little contrived): To size a
> 64-bit BAR, in principle you'd first need to write 0xffffffff to
> both halves. But there's nothing wrong with doing this in a
> different order: Act on the low half alone first, and then act
> on the high half. The acting on the high half could even be
> skipped if the low half sizing produced at least bit 31 set. Now
> if you were to special case seeing ffffffff:fffffff? as the
> last written pair of values, you'd break that (imo legitimate)
> alternative process of sizing.
How about:
static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
uint32_t val, void *data)
{
struct vpci_bar *bar = data;
bool hi = false;
if ( bar->type == VPCI_BAR_MEM64_HI )
{
ASSERT(reg > PCI_BASE_ADDRESS_0);
bar--;
hi = true;
}
else
{
val &= PCI_BASE_ADDRESS_MEM_MASK;
val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
: PCI_BASE_ADDRESS_MEM_TYPE_64;
val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
}
bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
bar->guest_addr &= ~(bar->size - 1) | ~PCI_BASE_ADDRESS_MEM_MASK;
}
static uint32_t guest_bar_read(const struct pci_dev *pdev, unsigned int reg,
void *data)
{
struct vpci_bar *bar = data;
if ( bar->type == VPCI_BAR_MEM64_HI )
return bar->guest_addr >> 32;
return bar->guest_addr;
}
It seems to solve all the questions we have: more work on write path,
no comparison with 0xffffffff: BAR's size is used to mask unwanted bits.
BTW, bars[i].guest_addr = 0; is needed as this field can be re-used.
>
> Jan
>
Thank you,
Oleksandr
On 08.09.2021 15:33, Oleksandr Andrushchenko wrote:
>
> On 08.09.21 13:03, Jan Beulich wrote:
>> On 08.09.2021 11:43, Oleksandr Andrushchenko wrote:
>>> On 08.09.21 12:27, Jan Beulich wrote:
>>>> On 07.09.2021 19:39, Oleksandr Andrushchenko wrote:
>>>>> On 07.09.21 19:30, Jan Beulich wrote:
>>>>>> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>>>>>> On 06.09.21 17:31, Jan Beulich wrote:
>>>>>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>> uint32_t val, void *data)
>>>>>>>>> {
>>>>>>>>> + struct vpci_bar *bar = data;
>>>>>>>>> + bool hi = false;
>>>>>>>>> +
>>>>>>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>>>>>>> + {
>>>>>>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>>>>>> + bar--;
>>>>>>>>> + hi = true;
>>>>>>>>> + }
>>>>>>>>> + else
>>>>>>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>>>>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>>>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>>>>> What you store here is not the address that's going to be used,
>>>>>>>> as
>>>>>>>> you don't mask off the low bits (to account for the BAR's size).
>>>>>>>> When a BAR gets written with all ones, all writable bits get these
>>>>>>>> ones stored. The address of the BAR, aiui, really changes to
>>>>>>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>>>>>>> is why memory / I/O decoding should be off while sizing BARs.
>>>>>>>> Therefore you shouldn't look for the specific "all writable bits
>>>>>>>> are ones" pattern (or worse, as you presently do, the "all bits
>>>>>>>> outside of the type specifier are ones" one) on the read path.
>>>>>>>> Instead mask the value appropriately here, and simply return back
>>>>>>>> the stored value from the read path.
>>> But in case of BAR sizing I need to actually return BAR size.
>>> So, the comparison is the way to tell if the guest wants to read
>>> actual (configured) BAR value or it tries to determine BAR's size.
>>> This is why I compare and use the result as the answer to what needs
>>> to be supplied to the guest. So, if I don't compare with 0xffffffff for the
>>> hi part and 0xfffffff0 for the low then how do I know when to return
>>> configured BAR or return the size?
>> Well, but that's the common misunderstanding that I've been trying
>> to point out: There's no difference between these two forms of
>> reads. The BARs are simply registers with some r/o bits. There's
>> no hidden 2nd register recording what was last written. When you
>> write 0xffffffff, all you do is set all writable bits to 1. When
>> you read back from the register, you will find all r/o bits
>> unchanged (which in particular means all lower address bits are
>> zero, thus allowing you to determine the size).
>>
>> When the spec says to write 0xffffffff for sizing purposes, OSes
>> should follow that, yes. This doesn't preclude them to use other
>> forms of writes for whichever purpose. Hence you do not want to
>> special case sizing, but instead you want to emulate correctly
>> all forms of writes, including subsequent reads to uniformly
>> return the intended / expected values.
>>
>> Just to give an example (perhaps a little contrived): To size a
>> 64-bit BAR, in principle you'd first need to write 0xffffffff to
>> both halves. But there's nothing wrong with doing this in a
>> different order: Act on the low half alone first, and then act
>> on the high half. The acting on the high half could even be
>> skipped if the low half sizing produced at least bit 31 set. Now
>> if you were to special case seeing ffffffff:fffffff? as the
>> last written pair of values, you'd break that (imo legitimate)
>> alternative process of sizing.
>
> How about:
Yes, that's what I was after. Just one nit right away:
> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
> uint32_t val, void *data)
> {
> struct vpci_bar *bar = data;
> bool hi = false;
>
> if ( bar->type == VPCI_BAR_MEM64_HI )
> {
> ASSERT(reg > PCI_BASE_ADDRESS_0);
> bar--;
> hi = true;
> }
> else
> {
> val &= PCI_BASE_ADDRESS_MEM_MASK;
> val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
> : PCI_BASE_ADDRESS_MEM_TYPE_64;
> val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
> }
>
> bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
> bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>
> bar->guest_addr &= ~(bar->size - 1) | ~PCI_BASE_ADDRESS_MEM_MASK;
> }
>
> static uint32_t guest_bar_read(const struct pci_dev *pdev, unsigned int reg,
> void *data)
> {
> struct vpci_bar *bar = data;
const please.
Jan
On 08.09.21 17:46, Jan Beulich wrote:
> On 08.09.2021 15:33, Oleksandr Andrushchenko wrote:
>> On 08.09.21 13:03, Jan Beulich wrote:
>>> On 08.09.2021 11:43, Oleksandr Andrushchenko wrote:
>>>> On 08.09.21 12:27, Jan Beulich wrote:
>>>>> On 07.09.2021 19:39, Oleksandr Andrushchenko wrote:
>>>>>> On 07.09.21 19:30, Jan Beulich wrote:
>>>>>>> On 07.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>>>>>>> On 06.09.21 17:31, Jan Beulich wrote:
>>>>>>>>> On 03.09.2021 12:08, Oleksandr Andrushchenko wrote:
>>>>>>>>>> --- a/xen/drivers/vpci/header.c
>>>>>>>>>> +++ b/xen/drivers/vpci/header.c
>>>>>>>>>> @@ -400,12 +400,72 @@ static void bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>>>>>>>> uint32_t val, void *data)
>>>>>>>>>> {
>>>>>>>>>> + struct vpci_bar *bar = data;
>>>>>>>>>> + bool hi = false;
>>>>>>>>>> +
>>>>>>>>>> + if ( bar->type == VPCI_BAR_MEM64_HI )
>>>>>>>>>> + {
>>>>>>>>>> + ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>>>>>>>> + bar--;
>>>>>>>>>> + hi = true;
>>>>>>>>>> + }
>>>>>>>>>> + else
>>>>>>>>>> + val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>>>>>>>> + bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>>>>>>>>>> + bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>>>>>>>> What you store here is not the address that's going to be used,
>>>>>>>>> as
>>>>>>>>> you don't mask off the low bits (to account for the BAR's size).
>>>>>>>>> When a BAR gets written with all ones, all writable bits get these
>>>>>>>>> ones stored. The address of the BAR, aiui, really changes to
>>>>>>>>> (typically) close below 4Gb (in the case of a 32-bit BAR), which
>>>>>>>>> is why memory / I/O decoding should be off while sizing BARs.
>>>>>>>>> Therefore you shouldn't look for the specific "all writable bits
>>>>>>>>> are ones" pattern (or worse, as you presently do, the "all bits
>>>>>>>>> outside of the type specifier are ones" one) on the read path.
>>>>>>>>> Instead mask the value appropriately here, and simply return back
>>>>>>>>> the stored value from the read path.
>>>> But in case of BAR sizing I need to actually return BAR size.
>>>> So, the comparison is the way to tell if the guest wants to read
>>>> actual (configured) BAR value or it tries to determine BAR's size.
>>>> This is why I compare and use the result as the answer to what needs
>>>> to be supplied to the guest. So, if I don't compare with 0xffffffff for the
>>>> hi part and 0xfffffff0 for the low then how do I know when to return
>>>> configured BAR or return the size?
>>> Well, but that's the common misunderstanding that I've been trying
>>> to point out: There's no difference between these two forms of
>>> reads. The BARs are simply registers with some r/o bits. There's
>>> no hidden 2nd register recording what was last written. When you
>>> write 0xffffffff, all you do is set all writable bits to 1. When
>>> you read back from the register, you will find all r/o bits
>>> unchanged (which in particular means all lower address bits are
>>> zero, thus allowing you to determine the size).
>>>
>>> When the spec says to write 0xffffffff for sizing purposes, OSes
>>> should follow that, yes. This doesn't preclude them to use other
>>> forms of writes for whichever purpose. Hence you do not want to
>>> special case sizing, but instead you want to emulate correctly
>>> all forms of writes, including subsequent reads to uniformly
>>> return the intended / expected values.
>>>
>>> Just to give an example (perhaps a little contrived): To size a
>>> 64-bit BAR, in principle you'd first need to write 0xffffffff to
>>> both halves. But there's nothing wrong with doing this in a
>>> different order: Act on the low half alone first, and then act
>>> on the high half. The acting on the high half could even be
>>> skipped if the low half sizing produced at least bit 31 set. Now
>>> if you were to special case seeing ffffffff:fffffff? as the
>>> last written pair of values, you'd break that (imo legitimate)
>>> alternative process of sizing.
>> How about:
> Yes, that's what I was after. Just one nit right away:
>
>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>> uint32_t val, void *data)
>> {
>> struct vpci_bar *bar = data;
>> bool hi = false;
>>
>> if ( bar->type == VPCI_BAR_MEM64_HI )
>> {
>> ASSERT(reg > PCI_BASE_ADDRESS_0);
>> bar--;
>> hi = true;
>> }
>> else
>> {
>> val &= PCI_BASE_ADDRESS_MEM_MASK;
>> val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>> : PCI_BASE_ADDRESS_MEM_TYPE_64;
>> val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>> }
>>
>> bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
Do you think this needs to be 0xfffffffful, not 0xffffffffull?
e.g. s/ull/ul
>> bar->guest_addr |= (uint64_t)val << (hi ? 32 : 0);
>>
>> bar->guest_addr &= ~(bar->size - 1) | ~PCI_BASE_ADDRESS_MEM_MASK;
>> }
>>
>> static uint32_t guest_bar_read(const struct pci_dev *pdev, unsigned int reg,
>> void *data)
>> {
>> struct vpci_bar *bar = data;
> const please.
Sure
>
> Jan
>
Thank you for helping with this!!
Oleksandr
On 08.09.2021 17:14, Oleksandr Andrushchenko wrote:
> On 08.09.21 17:46, Jan Beulich wrote:
>> On 08.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>> uint32_t val, void *data)
>>> {
>>> struct vpci_bar *bar = data;
>>> bool hi = false;
>>>
>>> if ( bar->type == VPCI_BAR_MEM64_HI )
>>> {
>>> ASSERT(reg > PCI_BASE_ADDRESS_0);
>>> bar--;
>>> hi = true;
>>> }
>>> else
>>> {
>>> val &= PCI_BASE_ADDRESS_MEM_MASK;
>>> val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>>> : PCI_BASE_ADDRESS_MEM_TYPE_64;
>>> val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>>> }
>>>
>>> bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>
> Do you think this needs to be 0xfffffffful, not 0xffffffffull?
>
> e.g. s/ull/ul
If guest_addr is uint64_t then ull would seem more correct to me,
especially when considering (hypothetical?) 32-bit architectures
potentially wanting to use this code.
Jan
On 08.09.21 18:29, Jan Beulich wrote:
> On 08.09.2021 17:14, Oleksandr Andrushchenko wrote:
>> On 08.09.21 17:46, Jan Beulich wrote:
>>> On 08.09.2021 15:33, Oleksandr Andrushchenko wrote:
>>>> static void guest_bar_write(const struct pci_dev *pdev, unsigned int reg,
>>>> uint32_t val, void *data)
>>>> {
>>>> struct vpci_bar *bar = data;
>>>> bool hi = false;
>>>>
>>>> if ( bar->type == VPCI_BAR_MEM64_HI )
>>>> {
>>>> ASSERT(reg > PCI_BASE_ADDRESS_0);
>>>> bar--;
>>>> hi = true;
>>>> }
>>>> else
>>>> {
>>>> val &= PCI_BASE_ADDRESS_MEM_MASK;
>>>> val |= bar->type == VPCI_BAR_MEM32 ? PCI_BASE_ADDRESS_MEM_TYPE_32
>>>> : PCI_BASE_ADDRESS_MEM_TYPE_64;
>>>> val |= bar->prefetchable ? PCI_BASE_ADDRESS_MEM_PREFETCH : 0;
>>>> }
>>>>
>>>> bar->guest_addr &= ~(0xffffffffull << (hi ? 32 : 0));
>> Do you think this needs to be 0xfffffffful, not 0xffffffffull?
>>
>> e.g. s/ull/ul
> If guest_addr is uint64_t then ull would seem more correct to me,
> especially when considering (hypothetical?) 32-bit architectures
> potentially wanting to use this code.
Ok, then I'll keep ull
>
> Jan
>
Thank you,
Oleksandr
© 2016 - 2026 Red Hat, Inc.