xen/arm: vpci: Remove PCI I/O ranges property value

[PATCH] xen/arm: vpci: Remove PCI I/O ranges property value

Posted by Rahul Singh 4 years, 1 month ago

IO ports on ARM don't exist, so all IO-port-related hypercalls are going
to fail on ARM when we pass through a PCI device.
A failure of xc_domain_ioport_permission(..) would turn into a critical
failure at domain creation. We need to avoid this outcome; instead, we
want to continue with domain creation as normal even if
xc_domain_ioport_permission(..) fails. XEN_DOMCTL_ioport_permission
is not implemented on ARM, so it would return -ENOSYS.

To solve the above issue, remove the PCI I/O ranges property value from
the dom0 device tree node so that dom0 Linux will not allocate I/O space
for PCI devices if pci-passthrough is enabled.

Another valid reason to remove the I/O ranges is that the PCI I/O space is
not mapped to dom0 when PCI passthrough is enabled; also, there is no vPCI
trap handler registered for the IO BAR.

Signed-off-by: Rahul Singh <rahul.singh@arm.com>
---
 xen/arch/arm/domain_build.c   | 14 +++++++
 xen/common/device_tree.c      | 72 +++++++++++++++++++++++++++++++++++
 xen/include/xen/device_tree.h | 10 +++++
 3 files changed, 96 insertions(+)

diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index d02bacbcd1..60f6b2c73b 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -749,6 +749,11 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
                 continue;
         }
 
+        if ( is_pci_passthrough_enabled() &&
+                dt_device_type_is_equal(node, "pci") )
+            if ( dt_property_name_is_equal(prop, "ranges") )
+                continue;
+
         res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
 
         if ( res )
@@ -769,6 +774,15 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
             if ( res )
                 return res;
         }
+
+        /*
+         * PCI IO bar are not mapped to dom0 when PCI passthrough is enabled,
+         * also there is no trap handler registered for IO bar therefore remove
+         * the IO range property from the device tree node for dom0.
+         */
+        res = dt_pci_remove_io_ranges(kinfo->fdt, node);
+        if ( res )
+            return res;
     }
 
     /*
diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
index 4aae281e89..9fa25f6723 100644
--- a/xen/common/device_tree.c
+++ b/xen/common/device_tree.c
@@ -2195,6 +2195,78 @@ int dt_get_pci_domain_nr(struct dt_device_node *node)
     return (u16)domain;
 }
 
+int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *dev)
+{
+    const struct dt_device_node *parent = NULL;
+    const struct dt_bus *bus, *pbus;
+    unsigned int rlen;
+    int na, ns, pna, pns, rone, ret;
+    const __be32 *ranges;
+    __be32 regs[((GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS + 1)
+               * 2];
+    __be32 *addr = &regs[0];
+
+    bus = dt_match_bus(dev);
+    if ( !bus )
+        return 0; /* device is not a bus */
+
+    parent = dt_get_parent(dev);
+    if ( parent == NULL )
+        return -EINVAL;
+
+    ranges = dt_get_property(dev, "ranges", &rlen);
+    if ( ranges == NULL )
+    {
+        printk(XENLOG_ERR "DT: no ranges; cannot enumerate %s\n",
+               dev->full_name);
+        return -EINVAL;
+    }
+    if ( rlen == 0 ) /* Nothing to do */
+        return 0;
+
+    bus->count_cells(dev, &na, &ns);
+    if ( !DT_CHECK_COUNTS(na, ns) )
+    {
+        printk(XENLOG_ERR "dt_parse: Bad cell count for device %s\n",
+                  dev->full_name);
+        return -EINVAL;
+    }
+    pbus = dt_match_bus(parent);
+    if ( pbus == NULL )
+    {
+        printk("DT: %s is not a valid bus\n", parent->full_name);
+        return -EINVAL;
+    }
+
+    pbus->count_cells(dev, &pna, &pns);
+    if ( !DT_CHECK_COUNTS(pna, pns) )
+    {
+        printk(XENLOG_ERR "dt_parse: Bad cell count for parent %s\n",
+               dev->full_name);
+        return -EINVAL;
+    }
+    /* Now walk through the ranges */
+    rlen /= 4;
+    rone = na + pna + ns;
+
+    for ( ; rlen >= rone; rlen -= rone, ranges += rone )
+    {
+        unsigned int flags = bus->get_flags(ranges);
+        if ( flags & IORESOURCE_IO )
+            continue;
+
+        memcpy(addr, ranges, 4 * rone);
+
+        addr += rone;
+    }
+
+    ret = fdt_property(fdt, "ranges", regs, sizeof(regs));
+    if ( ret )
+        return ret;
+
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
index fd6cd00b43..ad2e905595 100644
--- a/xen/include/xen/device_tree.h
+++ b/xen/include/xen/device_tree.h
@@ -849,6 +849,16 @@ int dt_count_phandle_with_args(const struct dt_device_node *np,
  */
 int dt_get_pci_domain_nr(struct dt_device_node *node);
 
+/**
+ * dt_pci_remove_io_ranges - Remove the PCI I/O range property value.
+ * @fdt: Pointer to the flattened device tree.
+ * @node: Device tree node.
+ *
+ * This function will remove the PCI IO range property from the PCI device tree
+ * node.
+ */
+int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *node);
+
 struct dt_device_node *dt_find_node_by_phandle(dt_phandle handle);
 
 #ifdef CONFIG_DEVICE_TREE_DEBUG
-- 
2.25.1

Re: [PATCH] xen/arm: vpci: Remove PCI I/O ranges property value

Posted by Julien Grall 4 years, 1 month ago

Hi Rahul,

I have a few comments on top of what Stefano already wrote.

On 14/12/2021 10:45, Rahul Singh wrote:
> IO ports on ARM don't exist so all IO ports related hypercalls are going
> to fail on ARM when we passthrough a PCI device.

Well. Arm doesn't have specific instructions to access I/O ports. But 
they still exist because they are mapped in the memory address space.

It is quite likely we would need the xc_domain_ioport_permission() & co 
to work on Arm once we decide to expose the I/O region to the guest.

> Failure of xc_domain_ioport_permission(..) would turn into a critical
> failure at domain creation. We need to avoid this outcome, instead we
> want to continue with domain creation as normal even if
> xc_domain_ioport_permission(..) fails. XEN_DOMCTL_ioport_permission
> is not implemented on ARM so it would return -ENOSYS.
> 
> To solve above issue remove PCI I/O ranges property value from dom0
> device tree node so that dom0 linux will not allocate I/O space for PCI
> devices if pci-passthrough is enabled.
> 
> Another valid reason to remove I/O ranges is that PCI I/O space are not
> mapped to dom0 when PCI passthrough is enabled, also there is no vpci
> trap handler register for IO bar.

TBH, this should be the main reason for this change. We should not expose 
the PCI I/O space because the vPCI does not support it.

The rest is just an implementation detail to avoid major refactoring 
that may need some revert in the future.

> 
> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
> ---
>   xen/arch/arm/domain_build.c   | 14 +++++++
>   xen/common/device_tree.c      | 72 +++++++++++++++++++++++++++++++++++
>   xen/include/xen/device_tree.h | 10 +++++
>   3 files changed, 96 insertions(+)
> 
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index d02bacbcd1..60f6b2c73b 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -749,6 +749,11 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>                   continue;
>           }
>   
> +        if ( is_pci_passthrough_enabled() &&
> +                dt_device_type_is_equal(node, "pci") )

This check is not going to change for a given node. So I think this 
wants to be moved outside of the loop to avoid repeating an expensive check.

In addition to that, this may also cover PCI devices. I think we want to 
use the same heuristic as in handle_linux_pci_domain(). So I would move 
the logic in a separate helper.

> +            if ( dt_property_name_is_equal(prop, "ranges") )
> +                continue;
> +
>           res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
>   
>           if ( res )
> @@ -769,6 +774,15 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>               if ( res )
>                   return res;
>           }
> +
> +        /*
> +         * PCI IO bar are not mapped to dom0 when PCI passthrough is enabled,
> +         * also there is no trap handler registered for IO bar therefor remove

Typo: s/therefor/therefore/

> +         * the IO range property from the device tree node for dom0.
> +         */
> +        res = dt_pci_remove_io_ranges(kinfo->fdt, node);

This is called unconditionally. Couldn't this potentially misinterpret 
some node?

> +        if ( res )
> +            return res;
>       }
>   
>       /*
> diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
> index 4aae281e89..9fa25f6723 100644
> --- a/xen/common/device_tree.c
> +++ b/xen/common/device_tree.c
> @@ -2195,6 +2195,78 @@ int dt_get_pci_domain_nr(struct dt_device_node *node)
>       return (u16)domain;
>   }
>   
> +int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *dev)
> +{
> +    const struct dt_device_node *parent = NULL;
> +    const struct dt_bus *bus, *pbus;
> +    unsigned int rlen;
> +    int na, ns, pna, pns, rone, ret;
> +    const __be32 *ranges;
> +    __be32 regs[((GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS + 1)
> +               * 2];
> +    __be32 *addr = &regs[0];
> +
> +    bus = dt_match_bus(dev);
> +    if ( !bus )

NIT: I don't particularly care whether we use !bus or bus == 0 but it 
would be nice to stay consistent at least within a function (below you 
use rlen == 0).

> +        return 0; /* device is not a bus */
> +
> +    parent = dt_get_parent(dev);
> +    if ( parent == NULL )
> +        return -EINVAL;
> +
> +    ranges = dt_get_property(dev, "ranges", &rlen);
> +    if ( ranges == NULL )
> +    {
> +        printk(XENLOG_ERR "DT: no ranges; cannot enumerate %s\n",
> +               dev->full_name);
> +        return -EINVAL;
> +    }
> +    if ( rlen == 0 ) /* Nothing to do */
> +        return 0;
> +
> +    bus->count_cells(dev, &na, &ns);
> +    if ( !DT_CHECK_COUNTS(na, ns) )
> +    {
> +        printk(XENLOG_ERR "dt_parse: Bad cell count for device %s\n",
> +                  dev->full_name);
> +        return -EINVAL;
> +    }
> +    pbus = dt_match_bus(parent);
> +    if ( pbus == NULL )
> +    {
> +        printk("DT: %s is not a valid bus\n", parent->full_name);
> +        return -EINVAL;
> +    }
> +
> +    pbus->count_cells(dev, &pna, &pns);
> +    if ( !DT_CHECK_COUNTS(pna, pns) )
> +    {
> +        printk(XENLOG_ERR "dt_parse: Bad cell count for parent %s\n",
> +               dev->full_name);
> +        return -EINVAL;
> +    }
> +    /* Now walk through the ranges */
> +    rlen /= 4;
> +    rone = na + pna + ns;
> +
> +    for ( ; rlen >= rone; rlen -= rone, ranges += rone )
> +    {
> +        unsigned int flags = bus->get_flags(ranges);
> +        if ( flags & IORESOURCE_IO )
> +            continue;
> +
> +        memcpy(addr, ranges, 4 * rone);
> +
> +        addr += rone;
> +    }
> +
> +    ret = fdt_property(fdt, "ranges", regs, sizeof(regs));
> +    if ( ret )
> +        return ret;
> +
> +    return 0;
> +}
> +
>   /*
>    * Local variables:
>    * mode: C
> diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
> index fd6cd00b43..ad2e905595 100644
> --- a/xen/include/xen/device_tree.h
> +++ b/xen/include/xen/device_tree.h
> @@ -849,6 +849,16 @@ int dt_count_phandle_with_args(const struct dt_device_node *np,
>    */
>   int dt_get_pci_domain_nr(struct dt_device_node *node);
>   
> +/**
> + * dt_get_remove_io_range - Remove the PCI I/O range property value.
> + * @fdt: Pointer to the file descriptor tree.
> + * @node: Device tree node.
> + *
> + * This function will remove the PCI IO range property from the PCI device tree
> + * node.
> + */
> +int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *node);
> +
>   struct dt_device_node *dt_find_node_by_phandle(dt_phandle handle);
>   
>   #ifdef CONFIG_DEVICE_TREE_DEBUG

Cheers,

-- 
Julien Grall

Re: [PATCH] xen/arm: vpci: Remove PCI I/O ranges property value

Posted by Stefano Stabellini 4 years, 1 month ago

On Tue, 14 Dec 2021, Rahul Singh wrote:
> IO ports on ARM don't exist so all IO ports related hypercalls are going
> to fail on ARM when we passthrough a PCI device.
> Failure of xc_domain_ioport_permission(..) would turn into a critical
> failure at domain creation. We need to avoid this outcome, instead we
> want to continue with domain creation as normal even if
> xc_domain_ioport_permission(..) fails. XEN_DOMCTL_ioport_permission
> is not implemented on ARM so it would return -ENOSYS.
> 
> To solve above issue remove PCI I/O ranges property value from dom0
> device tree node so that dom0 linux will not allocate I/O space for PCI
> devices if pci-passthrough is enabled.
> 
> Another valid reason to remove I/O ranges is that PCI I/O space are not
> mapped to dom0 when PCI passthrough is enabled, also there is no vpci
> trap handler register for IO bar.
> 
> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
> ---
>  xen/arch/arm/domain_build.c   | 14 +++++++
>  xen/common/device_tree.c      | 72 +++++++++++++++++++++++++++++++++++
>  xen/include/xen/device_tree.h | 10 +++++
>  3 files changed, 96 insertions(+)
> 
> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> index d02bacbcd1..60f6b2c73b 100644
> --- a/xen/arch/arm/domain_build.c
> +++ b/xen/arch/arm/domain_build.c
> @@ -749,6 +749,11 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>                  continue;
>          }
>  
> +        if ( is_pci_passthrough_enabled() &&
> +                dt_device_type_is_equal(node, "pci") )
> +            if ( dt_property_name_is_equal(prop, "ranges") )
> +                continue;

It looks like we are skipping the "ranges" property entirely for the PCI
node, is that right? Wouldn't that also remove the other (not ioports)
address ranges?


>          res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
>  
>          if ( res )
> @@ -769,6 +774,15 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>              if ( res )
>                  return res;
>          }
> +
> +        /*
> +         * PCI IO bar are not mapped to dom0 when PCI passthrough is enabled,
> +         * also there is no trap handler registered for IO bar therefor remove
> +         * the IO range property from the device tree node for dom0.
> +         */
> +        res = dt_pci_remove_io_ranges(kinfo->fdt, node);
> +        if ( res )
> +            return res;

I tried to apply this patch to staging to make it easier to review but I
think this chunk got applied wrongly: I can see
dt_pci_remove_io_ranges() added to the function "handle_prop_pfdt" which
is for guest partial DTBs and not for dom0.

Is dt_pci_remove_io_ranges() meant to be called from write_properties
instead? It looks like it would be best to call it from
write_properties, maybe it could be combined with the other new check
just above in this patch?


>      /*
> diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
> index 4aae281e89..9fa25f6723 100644
> --- a/xen/common/device_tree.c
> +++ b/xen/common/device_tree.c
> @@ -2195,6 +2195,78 @@ int dt_get_pci_domain_nr(struct dt_device_node *node)
>      return (u16)domain;
>  }
>  
> +int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *dev)
> +{
> +    const struct dt_device_node *parent = NULL;
> +    const struct dt_bus *bus, *pbus;
> +    unsigned int rlen;
> +    int na, ns, pna, pns, rone, ret;
> +    const __be32 *ranges;
> +    __be32 regs[((GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS + 1)
> +               * 2];
> +    __be32 *addr = &regs[0];
> +
> +    bus = dt_match_bus(dev);
> +    if ( !bus )
> +        return 0; /* device is not a bus */
> +
> +    parent = dt_get_parent(dev);
> +    if ( parent == NULL )
> +        return -EINVAL;
> +
> +    ranges = dt_get_property(dev, "ranges", &rlen);
> +    if ( ranges == NULL )
> +    {
> +        printk(XENLOG_ERR "DT: no ranges; cannot enumerate %s\n",
> +               dev->full_name);
> +        return -EINVAL;
> +    }
> +    if ( rlen == 0 ) /* Nothing to do */
> +        return 0;
> +
> +    bus->count_cells(dev, &na, &ns);
> +    if ( !DT_CHECK_COUNTS(na, ns) )
> +    {
> +        printk(XENLOG_ERR "dt_parse: Bad cell count for device %s\n",
> +                  dev->full_name);
> +        return -EINVAL;
> +    }
> +    pbus = dt_match_bus(parent);
> +    if ( pbus == NULL )
> +    {
> +        printk("DT: %s is not a valid bus\n", parent->full_name);
> +        return -EINVAL;
> +    }
> +
> +    pbus->count_cells(dev, &pna, &pns);
> +    if ( !DT_CHECK_COUNTS(pna, pns) )
> +    {
> +        printk(XENLOG_ERR "dt_parse: Bad cell count for parent %s\n",
> +               dev->full_name);
> +        return -EINVAL;
> +    }
> +    /* Now walk through the ranges */
> +    rlen /= 4;
> +    rone = na + pna + ns;
> +
> +    for ( ; rlen >= rone; rlen -= rone, ranges += rone )
> +    {
> +        unsigned int flags = bus->get_flags(ranges);
> +        if ( flags & IORESOURCE_IO )
> +            continue;
> +
> +        memcpy(addr, ranges, 4 * rone);
> +
> +        addr += rone;
> +    }
> +
> +    ret = fdt_property(fdt, "ranges", regs, sizeof(regs));
> +    if ( ret )
> +        return ret;
> +
> +    return 0;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
> index fd6cd00b43..ad2e905595 100644
> --- a/xen/include/xen/device_tree.h
> +++ b/xen/include/xen/device_tree.h
> @@ -849,6 +849,16 @@ int dt_count_phandle_with_args(const struct dt_device_node *np,
>   */
>  int dt_get_pci_domain_nr(struct dt_device_node *node);
>  
> +/**
> + * dt_get_remove_io_range - Remove the PCI I/O range property value.
> + * @fdt: Pointer to the file descriptor tree.
> + * @node: Device tree node.
> + *
> + * This function will remove the PCI IO range property from the PCI device tree
> + * node.
> + */
> +int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *node);
> +
>  struct dt_device_node *dt_find_node_by_phandle(dt_phandle handle);
>  
>  #ifdef CONFIG_DEVICE_TREE_DEBUG
> -- 
> 2.25.1
>

Re: [PATCH] xen/arm: vpci: Remove PCI I/O ranges property value

Posted by Rahul Singh 4 years, 1 month ago

Hi Stefano,

> On 16 Dec 2021, at 2:33 am, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Tue, 14 Dec 2021, Rahul Singh wrote:
>> IO ports on ARM don't exist so all IO ports related hypercalls are going
>> to fail on ARM when we passthrough a PCI device.
>> Failure of xc_domain_ioport_permission(..) would turn into a critical
>> failure at domain creation. We need to avoid this outcome, instead we
>> want to continue with domain creation as normal even if
>> xc_domain_ioport_permission(..) fails. XEN_DOMCTL_ioport_permission
>> is not implemented on ARM so it would return -ENOSYS.
>> 
>> To solve above issue remove PCI I/O ranges property value from dom0
>> device tree node so that dom0 linux will not allocate I/O space for PCI
>> devices if pci-passthrough is enabled.
>> 
>> Another valid reason to remove I/O ranges is that PCI I/O space are not
>> mapped to dom0 when PCI passthrough is enabled, also there is no vpci
>> trap handler register for IO bar.
>> 
>> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
>> ---
>> xen/arch/arm/domain_build.c   | 14 +++++++
>> xen/common/device_tree.c      | 72 +++++++++++++++++++++++++++++++++++
>> xen/include/xen/device_tree.h | 10 +++++
>> 3 files changed, 96 insertions(+)
>> 
>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>> index d02bacbcd1..60f6b2c73b 100644
>> --- a/xen/arch/arm/domain_build.c
>> +++ b/xen/arch/arm/domain_build.c
>> @@ -749,6 +749,11 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>>                 continue;
>>         }
>> 
>> +        if ( is_pci_passthrough_enabled() &&
>> +                dt_device_type_is_equal(node, "pci") )
>> +            if ( dt_property_name_is_equal(prop, "ranges") )
>> +                continue;
> 
> It looks like we are skipping the "ranges" property entirely for the PCI
> node, is that right? Wouldn't that also remove the other (not ioports)
> address ranges?

We are skipping the “ranges” property here to avoid setting the “ranges” property when
pci_passthrough is enabled. We will remove only the IO port ranges and set the other “ranges” property
values in dt_pci_remove_io_ranges() in the next if() condition.
 

>>         res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
>> 
>>         if ( res )
>> @@ -769,6 +774,15 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>>             if ( res )
>>                 return res;
>>         }
>> +
>> +        /*
>> +         * PCI IO bar are not mapped to dom0 when PCI passthrough is enabled,
>> +         * also there is no trap handler registered for IO bar therefor remove
>> +         * the IO range property from the device tree node for dom0.
>> +         */
>> +        res = dt_pci_remove_io_ranges(kinfo->fdt, node);
>> +        if ( res )
>> +            return res;
> 
> I tried to apply this patch to staging to make it easier to review but I
> think this chuck got applied wrongly: I can see
> dt_pci_remove_io_ranges() added to the function "handle_prop_pfdt" which
> is for guest partial DTBs and not for dom0.

Oleksandr’s patch series was merged yesterday; because of that, there is a conflict when applying
this patch. I will rebase the patch and send it again for review.

> 
> Is dt_pci_remove_io_ranges() meant to be called from write_properties
> instead? It looks like it would be best to call it from
> write_properties, maybe it could be combined with the other new check
> just above in this patch?

Yes, dt_pci_remove_io_ranges() is meant to be called from write_properties().

Regards,
Rahul
> 
> 
>>     /*
>> diff --git a/xen/common/device_tree.c b/xen/common/device_tree.c
>> index 4aae281e89..9fa25f6723 100644
>> --- a/xen/common/device_tree.c
>> +++ b/xen/common/device_tree.c
>> @@ -2195,6 +2195,78 @@ int dt_get_pci_domain_nr(struct dt_device_node *node)
>>     return (u16)domain;
>> }
>> 
>> +int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *dev)
>> +{
>> +    const struct dt_device_node *parent = NULL;
>> +    const struct dt_bus *bus, *pbus;
>> +    unsigned int rlen;
>> +    int na, ns, pna, pns, rone, ret;
>> +    const __be32 *ranges;
>> +    __be32 regs[((GUEST_ROOT_ADDRESS_CELLS * 2) + GUEST_ROOT_SIZE_CELLS + 1)
>> +               * 2];
>> +    __be32 *addr = &regs[0];
>> +
>> +    bus = dt_match_bus(dev);
>> +    if ( !bus )
>> +        return 0; /* device is not a bus */
>> +
>> +    parent = dt_get_parent(dev);
>> +    if ( parent == NULL )
>> +        return -EINVAL;
>> +
>> +    ranges = dt_get_property(dev, "ranges", &rlen);
>> +    if ( ranges == NULL )
>> +    {
>> +        printk(XENLOG_ERR "DT: no ranges; cannot enumerate %s\n",
>> +               dev->full_name);
>> +        return -EINVAL;
>> +    }
>> +    if ( rlen == 0 ) /* Nothing to do */
>> +        return 0;
>> +
>> +    bus->count_cells(dev, &na, &ns);
>> +    if ( !DT_CHECK_COUNTS(na, ns) )
>> +    {
>> +        printk(XENLOG_ERR "dt_parse: Bad cell count for device %s\n",
>> +                  dev->full_name);
>> +        return -EINVAL;
>> +    }
>> +    pbus = dt_match_bus(parent);
>> +    if ( pbus == NULL )
>> +    {
>> +        printk("DT: %s is not a valid bus\n", parent->full_name);
>> +        return -EINVAL;
>> +    }
>> +
>> +    pbus->count_cells(dev, &pna, &pns);
>> +    if ( !DT_CHECK_COUNTS(pna, pns) )
>> +    {
>> +        printk(XENLOG_ERR "dt_parse: Bad cell count for parent %s\n",
>> +               dev->full_name);
>> +        return -EINVAL;
>> +    }
>> +    /* Now walk through the ranges */
>> +    rlen /= 4;
>> +    rone = na + pna + ns;
>> +
>> +    for ( ; rlen >= rone; rlen -= rone, ranges += rone )
>> +    {
>> +        unsigned int flags = bus->get_flags(ranges);
>> +        if ( flags & IORESOURCE_IO )
>> +            continue;
>> +
>> +        memcpy(addr, ranges, 4 * rone);
>> +
>> +        addr += rone;
>> +    }
>> +
>> +    ret = fdt_property(fdt, "ranges", regs, sizeof(regs));
>> +    if ( ret )
>> +        return ret;
>> +
>> +    return 0;
>> +}
>> +
>> /*
>>  * Local variables:
>>  * mode: C
>> diff --git a/xen/include/xen/device_tree.h b/xen/include/xen/device_tree.h
>> index fd6cd00b43..ad2e905595 100644
>> --- a/xen/include/xen/device_tree.h
>> +++ b/xen/include/xen/device_tree.h
>> @@ -849,6 +849,16 @@ int dt_count_phandle_with_args(const struct dt_device_node *np,
>>  */
>> int dt_get_pci_domain_nr(struct dt_device_node *node);
>> 
>> +/**
>> + * dt_get_remove_io_range - Remove the PCI I/O range property value.
>> + * @fdt: Pointer to the file descriptor tree.
>> + * @node: Device tree node.
>> + *
>> + * This function will remove the PCI IO range property from the PCI device tree
>> + * node.
>> + */
>> +int dt_pci_remove_io_ranges(void *fdt, const struct dt_device_node *node);
>> +
>> struct dt_device_node *dt_find_node_by_phandle(dt_phandle handle);
>> 
>> #ifdef CONFIG_DEVICE_TREE_DEBUG
>> -- 
>> 2.25.1

Re: [PATCH] xen/arm: vpci: Remove PCI I/O ranges property value

Posted by Stefano Stabellini 4 years, 1 month ago

On Thu, 16 Dec 2021, Rahul Singh wrote:
> Hi Stefano,
> 
> > On 16 Dec 2021, at 2:33 am, Stefano Stabellini <sstabellini@kernel.org> wrote:
> > 
> > On Tue, 14 Dec 2021, Rahul Singh wrote:
> >> IO ports on ARM don't exist so all IO ports related hypercalls are going
> >> to fail on ARM when we passthrough a PCI device.
> >> Failure of xc_domain_ioport_permission(..) would turn into a critical
> >> failure at domain creation. We need to avoid this outcome, instead we
> >> want to continue with domain creation as normal even if
> >> xc_domain_ioport_permission(..) fails. XEN_DOMCTL_ioport_permission
> >> is not implemented on ARM so it would return -ENOSYS.
> >> 
> >> To solve above issue remove PCI I/O ranges property value from dom0
> >> device tree node so that dom0 linux will not allocate I/O space for PCI
> >> devices if pci-passthrough is enabled.
> >> 
> >> Another valid reason to remove I/O ranges is that PCI I/O space are not
> >> mapped to dom0 when PCI passthrough is enabled, also there is no vpci
> >> trap handler register for IO bar.
> >> 
> >> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
> >> ---
> >> xen/arch/arm/domain_build.c   | 14 +++++++
> >> xen/common/device_tree.c      | 72 +++++++++++++++++++++++++++++++++++
> >> xen/include/xen/device_tree.h | 10 +++++
> >> 3 files changed, 96 insertions(+)
> >> 
> >> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
> >> index d02bacbcd1..60f6b2c73b 100644
> >> --- a/xen/arch/arm/domain_build.c
> >> +++ b/xen/arch/arm/domain_build.c
> >> @@ -749,6 +749,11 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
> >>                 continue;
> >>         }
> >> 
> >> +        if ( is_pci_passthrough_enabled() &&
> >> +                dt_device_type_is_equal(node, "pci") )
> >> +            if ( dt_property_name_is_equal(prop, "ranges") )
> >> +                continue;
> > 
> > It looks like we are skipping the "ranges" property entirely for the PCI
> > node, is that right? Wouldn't that also remove the other (not ioports)
> > address ranges?
> 
> We are skipping the “ranges” property here to avoid setting the “ranges” property when
> pci_passthrough is enabled. We will remove only remove IO port and set the other ‘ranges” property 
> value in dt_pci_remove_io_ranges() in next if() condition.
>  
> 
> >>         res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
> >> 
> >>         if ( res )
> >> @@ -769,6 +774,15 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
> >>             if ( res )
> >>                 return res;
> >>         }
> >> +
> >> +        /*
> >> +         * PCI IO bar are not mapped to dom0 when PCI passthrough is enabled,
> >> +         * also there is no trap handler registered for IO bar therefor remove
> >> +         * the IO range property from the device tree node for dom0.
> >> +         */
> >> +        res = dt_pci_remove_io_ranges(kinfo->fdt, node);
> >> +        if ( res )
> >> +            return res;
> > 
> > I tried to apply this patch to staging to make it easier to review but I
> > think this chuck got applied wrongly: I can see
> > dt_pci_remove_io_ranges() added to the function "handle_prop_pfdt" which
> > is for guest partial DTBs and not for dom0.
> 
> Oleksandr’s patch series was merged yesterday because of that there is conflict in applying 
> this patch. I will rebase the patch and will send it again for review.
> 
> > 
> > Is dt_pci_remove_io_ranges() meant to be called from write_properties
> > instead? It looks like it would be best to call it from
> > write_properties, maybe it could be combined with the other new check
> > just above in this patch?
> 
> Yes dt_pci_remove_io_ranges() is to be called from write_properties().

OK. In that case, the only feedback I have is that it might be
possible to avoid the first change of this patch (skipping "ranges") by
moving the call to dt_pci_remove_io_ranges() earlier in the
write_properties function.

Re: [PATCH] xen/arm: vpci: Remove PCI I/O ranges property value

Posted by Rahul Singh 4 years, 1 month ago

Hi Stefano,

> On 16 Dec 2021, at 9:48 pm, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Thu, 16 Dec 2021, Rahul Singh wrote:
>> Hi Stefano,
>> 
>>> On 16 Dec 2021, at 2:33 am, Stefano Stabellini <sstabellini@kernel.org> wrote:
>>> 
>>> On Tue, 14 Dec 2021, Rahul Singh wrote:
>>>> IO ports on ARM don't exist so all IO ports related hypercalls are going
>>>> to fail on ARM when we passthrough a PCI device.
>>>> Failure of xc_domain_ioport_permission(..) would turn into a critical
>>>> failure at domain creation. We need to avoid this outcome, instead we
>>>> want to continue with domain creation as normal even if
>>>> xc_domain_ioport_permission(..) fails. XEN_DOMCTL_ioport_permission
>>>> is not implemented on ARM so it would return -ENOSYS.
>>>> 
>>>> To solve above issue remove PCI I/O ranges property value from dom0
>>>> device tree node so that dom0 linux will not allocate I/O space for PCI
>>>> devices if pci-passthrough is enabled.
>>>> 
>>>> Another valid reason to remove I/O ranges is that PCI I/O space are not
>>>> mapped to dom0 when PCI passthrough is enabled, also there is no vpci
>>>> trap handler register for IO bar.
>>>> 
>>>> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
>>>> ---
>>>> xen/arch/arm/domain_build.c   | 14 +++++++
>>>> xen/common/device_tree.c      | 72 +++++++++++++++++++++++++++++++++++
>>>> xen/include/xen/device_tree.h | 10 +++++
>>>> 3 files changed, 96 insertions(+)
>>>> 
>>>> diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
>>>> index d02bacbcd1..60f6b2c73b 100644
>>>> --- a/xen/arch/arm/domain_build.c
>>>> +++ b/xen/arch/arm/domain_build.c
>>>> @@ -749,6 +749,11 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>>>>                continue;
>>>>        }
>>>> 
>>>> +        if ( is_pci_passthrough_enabled() &&
>>>> +                dt_device_type_is_equal(node, "pci") )
>>>> +            if ( dt_property_name_is_equal(prop, "ranges") )
>>>> +                continue;
>>> 
>>> It looks like we are skipping the "ranges" property entirely for the PCI
>>> node, is that right? Wouldn't that also remove the other (not ioports)
>>> address ranges?
>> 
>> We are skipping the “ranges” property here to avoid setting the “ranges” property when
>> pci_passthrough is enabled. We will remove only remove IO port and set the other ‘ranges” property 
>> value in dt_pci_remove_io_ranges() in next if() condition.
>> 
>> 
>>>>        res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len);
>>>> 
>>>>        if ( res )
>>>> @@ -769,6 +774,15 @@ static int __init write_properties(struct domain *d, struct kernel_info *kinfo,
>>>>            if ( res )
>>>>                return res;
>>>>        }
>>>> +
>>>> +        /*
>>>> +         * PCI IO bar are not mapped to dom0 when PCI passthrough is enabled,
>>>> +         * also there is no trap handler registered for IO bar therefor remove
>>>> +         * the IO range property from the device tree node for dom0.
>>>> +         */
>>>> +        res = dt_pci_remove_io_ranges(kinfo->fdt, node);
>>>> +        if ( res )
>>>> +            return res;
>>> 
>>> I tried to apply this patch to staging to make it easier to review but I
>>> think this chuck got applied wrongly: I can see
>>> dt_pci_remove_io_ranges() added to the function "handle_prop_pfdt" which
>>> is for guest partial DTBs and not for dom0.
>> 
>> Oleksandr’s patch series was merged yesterday because of that there is conflict in applying 
>> this patch. I will rebase the patch and will send it again for review.
>> 
>>> 
>>> Is dt_pci_remove_io_ranges() meant to be called from write_properties
>>> instead? It looks like it would be best to call it from
>>> write_properties, maybe it could be combined with the other new check
>>> just above in this patch?
>> 
>> Yes dt_pci_remove_io_ranges() is to be called from write_properties().
> 
> OK. In that case the only feedback that is I have is that it might be
> possible to avoid the first change of this patch to skip "ranges" by
> moving the call to dt_pci_remove_io_ranges() earlier in the
> write_properties function.

Ok. I will modify the code based on your comment.

Regards,
Rahul

[PATCH] xen/vpci: msix: move x86 specific code to x86 file

Posted by Rahul Singh 4 years, 1 month ago

The vpci/msix.c file will be used for the Arm architecture when vPCI MSI-X
support is added to Arm, but there is x86-specific code in this
file.

Move the x86-specific code to the x86_msix.c file to make sure the common
code can be used by other architectures.

No functional change intended.

Signed-off-by: Rahul Singh <rahul.singh@arm.com>
---
 xen/arch/x86/msi.c                       |   2 +-
 xen/drivers/passthrough/amd/iommu_init.c |   1 +
 xen/drivers/vpci/Makefile                |   1 +
 xen/drivers/vpci/msi.c                   |   3 +-
 xen/drivers/vpci/msix.c                  | 134 +++++---------------
 xen/drivers/vpci/x86_msix.c              | 155 +++++++++++++++++++++++
 xen/include/asm-x86/msi.h                |  28 ----
 xen/include/xen/msi.h                    |  28 ++++
 xen/include/xen/vpci.h                   |  21 +++
 9 files changed, 239 insertions(+), 134 deletions(-)
 create mode 100644 xen/drivers/vpci/x86_msix.c

diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 5febc0ea4b..2b120f897f 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -23,7 +23,7 @@
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
-#include <asm/msi.h>
+#include <xen/msi.h>
 #include <asm/fixmap.h>
 #include <asm/p2m.h>
 #include <mach_apic.h>
diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
index 559a734bda..fc385959c7 100644
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -20,6 +20,7 @@
 #include <xen/acpi.h>
 #include <xen/delay.h>
 #include <xen/keyhandler.h>
+#include <xen/msi.h>
 
 #include "iommu.h"
 
diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 1a1413b93e..543c265199 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1,2 +1,3 @@
 obj-y += vpci.o header.o
 obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
+obj-$(CONFIG_X86) += x86_msix.o
diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
index 5757a7aed2..8fc82a9b8d 100644
--- a/xen/drivers/vpci/msi.c
+++ b/xen/drivers/vpci/msi.c
@@ -16,12 +16,11 @@
  * License along with this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/msi.h>
 #include <xen/sched.h>
 #include <xen/softirq.h>
 #include <xen/vpci.h>
 
-#include <asm/msi.h>
-
 static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg,
                              void *data)
 {
diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
index 846f1b8d70..7a9b02f1a5 100644
--- a/xen/drivers/vpci/msix.c
+++ b/xen/drivers/vpci/msix.c
@@ -17,15 +17,24 @@
  * License along with this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/msi.h>
 #include <xen/sched.h>
 #include <xen/vpci.h>
 
-#include <asm/msi.h>
 #include <asm/p2m.h>
 
-#define VMSIX_ADDR_IN_RANGE(addr, vpci, nr)                               \
-    ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
-     (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
+/*
+ * The return value is different for the MMIO handler on ARM and x86
+ * architecture. To make the code common for both architectures create
+ * generic return code with architecture dependent values.
+ */
+#ifdef CONFIG_X86
+#define VPCI_EMUL_OKAY      X86EMUL_OKAY
+#define VPCI_EMUL_RETRY     X86EMUL_RETRY
+#else
+#define VPCI_EMUL_OKAY      1
+#define VPCI_EMUL_RETRY     VPCI_EMUL_OKAY
+#endif
 
 static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg,
                              void *data)
@@ -138,29 +147,6 @@ static void control_write(const struct pci_dev *pdev, unsigned int reg,
         pci_conf_write16(pdev->sbdf, reg, val);
 }
 
-static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
-{
-    struct vpci_msix *msix;
-
-    list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next )
-    {
-        const struct vpci_bar *bars = msix->pdev->vpci->header.bars;
-        unsigned int i;
-
-        for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
-            if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
-                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
-                return msix;
-    }
-
-    return NULL;
-}
-
-static int msix_accept(struct vcpu *v, unsigned long addr)
-{
-    return !!msix_find(v->domain, addr);
-}
-
 static bool access_allowed(const struct pci_dev *pdev, unsigned long addr,
                            unsigned int len)
 {
@@ -182,21 +168,19 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
     return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
 }
 
-static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
-                     unsigned long *data)
+int msix_read(struct vpci_msix *msix, unsigned long addr, unsigned int len,
+              unsigned long *data)
 {
-    const struct domain *d = v->domain;
-    struct vpci_msix *msix = msix_find(d, addr);
     const struct vpci_msix_entry *entry;
     unsigned int offset;
 
     *data = ~0ul;
 
     if ( !msix )
-        return X86EMUL_RETRY;
+        return VPCI_EMUL_RETRY;
 
     if ( !access_allowed(msix->pdev, addr, len) )
-        return X86EMUL_OKAY;
+        return VPCI_EMUL_OKAY;
 
     if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
     {
@@ -210,11 +194,11 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
         switch ( len )
         {
         case 4:
-            *data = readl(addr);
+            *data = vpci_arch_readl(addr);
             break;
 
         case 8:
-            *data = readq(addr);
+            *data = vpci_arch_readq(addr);
             break;
 
         default:
@@ -222,7 +206,7 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
             break;
         }
 
-        return X86EMUL_OKAY;
+        return VPCI_EMUL_OKAY;
     }
 
     spin_lock(&msix->pdev->vpci->lock);
@@ -256,22 +240,20 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
     }
     spin_unlock(&msix->pdev->vpci->lock);
 
-    return X86EMUL_OKAY;
+    return VPCI_EMUL_OKAY;
 }
 
-static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
-                      unsigned long data)
+int msix_write(const struct domain *d, struct vpci_msix *msix,
+               unsigned long addr, unsigned int len, unsigned long data)
 {
-    const struct domain *d = v->domain;
-    struct vpci_msix *msix = msix_find(d, addr);
     struct vpci_msix_entry *entry;
     unsigned int offset;
 
     if ( !msix )
-        return X86EMUL_RETRY;
+        return VPCI_EMUL_RETRY;
 
     if ( !access_allowed(msix->pdev, addr, len) )
-        return X86EMUL_OKAY;
+        return VPCI_EMUL_OKAY;
 
     if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
     {
@@ -281,11 +263,11 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
             switch ( len )
             {
             case 4:
-                writel(data, addr);
+                vpci_arch_writel(data, addr);
                 break;
 
             case 8:
-                writeq(data, addr);
+                vpci_arch_writeq(data, addr);
                 break;
 
             default:
@@ -294,7 +276,7 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
             }
         }
 
-        return X86EMUL_OKAY;
+        return VPCI_EMUL_OKAY;
     }
 
     spin_lock(&msix->pdev->vpci->lock);
@@ -372,60 +354,7 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
     }
     spin_unlock(&msix->pdev->vpci->lock);
 
-    return X86EMUL_OKAY;
-}
-
-static const struct hvm_mmio_ops vpci_msix_table_ops = {
-    .check = msix_accept,
-    .read = msix_read,
-    .write = msix_write,
-};
-
-int vpci_make_msix_hole(const struct pci_dev *pdev)
-{
-    struct domain *d = pdev->domain;
-    unsigned int i;
-
-    if ( !pdev->vpci->msix )
-        return 0;
-
-    /* Make sure there's a hole for the MSIX table/PBA in the p2m. */
-    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->tables); i++ )
-    {
-        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
-        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
-                                     vmsix_table_size(pdev->vpci, i) - 1);
-
-        for ( ; start <= end; start++ )
-        {
-            p2m_type_t t;
-            mfn_t mfn = get_gfn_query(d, start, &t);
-
-            switch ( t )
-            {
-            case p2m_mmio_dm:
-            case p2m_invalid:
-                break;
-            case p2m_mmio_direct:
-                if ( mfn_x(mfn) == start )
-                {
-                    clear_identity_p2m_entry(d, start);
-                    break;
-                }
-                /* fallthrough. */
-            default:
-                put_gfn(d, start);
-                gprintk(XENLOG_WARNING,
-                        "%pp: existing mapping (mfn: %" PRI_mfn
-                        "type: %d) at %#lx clobbers MSIX MMIO area\n",
-                        &pdev->sbdf, mfn_x(mfn), t, start);
-                return -EEXIST;
-            }
-            put_gfn(d, start);
-        }
-    }
-
-    return 0;
+    return VPCI_EMUL_OKAY;
 }
 
 static int init_msix(struct pci_dev *pdev)
@@ -472,11 +401,10 @@ static int init_msix(struct pci_dev *pdev)
         vpci_msix_arch_init_entry(&msix->entries[i]);
     }
 
-    if ( list_empty(&d->arch.hvm.msix_tables) )
-        register_mmio_handler(d, &vpci_msix_table_ops);
+    register_msix_mmio_handler(d);
+    vpci_msix_add_to_msix_table(msix, d);
 
     pdev->vpci->msix = msix;
-    list_add(&msix->next, &d->arch.hvm.msix_tables);
 
     return 0;
 }
diff --git a/xen/drivers/vpci/x86_msix.c b/xen/drivers/vpci/x86_msix.c
new file mode 100644
index 0000000000..b38b52e410
--- /dev/null
+++ b/xen/drivers/vpci/x86_msix.c
@@ -0,0 +1,155 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/sched.h>
+#include <xen/vpci.h>
+
+#include <asm/msi.h>
+#include <asm/p2m.h>
+
+u32 vpci_arch_readl(unsigned long addr)
+{
+    return readl(addr);
+}
+
+u64 vpci_arch_readq(unsigned long addr)
+{
+    return readq(addr);
+}
+
+void vpci_arch_writel(u32 data, unsigned long addr)
+{
+    writel(data, addr);
+}
+
+void vpci_arch_writeq(u64 data, unsigned long addr)
+{
+    writeq(data, addr);
+}
+
+int vpci_make_msix_hole(const struct pci_dev *pdev)
+{
+    struct domain *d = pdev->domain;
+    unsigned int i;
+
+    if ( !pdev->vpci->msix )
+        return 0;
+
+    /* Make sure there's a hole for the MSIX table/PBA in the p2m. */
+    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->tables); i++ )
+    {
+        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
+        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
+                                     vmsix_table_size(pdev->vpci, i) - 1);
+
+        for ( ; start <= end; start++ )
+        {
+            p2m_type_t t;
+            mfn_t mfn = get_gfn_query(d, start, &t);
+
+            switch ( t )
+            {
+            case p2m_mmio_dm:
+            case p2m_invalid:
+                break;
+            case p2m_mmio_direct:
+                if ( mfn_x(mfn) == start )
+                {
+                    clear_identity_p2m_entry(d, start);
+                    break;
+                }
+                /* fallthrough. */
+            default:
+                put_gfn(d, start);
+                gprintk(XENLOG_WARNING,
+                        "%pp: existing mapping (mfn: %" PRI_mfn
+                        "type: %d) at %#lx clobbers MSIX MMIO area\n",
+                        &pdev->sbdf, mfn_x(mfn), t, start);
+                return -EEXIST;
+            }
+            put_gfn(d, start);
+        }
+    }
+
+    return 0;
+}
+
+struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
+{
+    struct vpci_msix *msix;
+
+    list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next )
+    {
+        const struct vpci_bar *bars = msix->pdev->vpci->header.bars;
+        unsigned int i;
+
+        for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
+            if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
+                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
+                return msix;
+    }
+
+    return NULL;
+}
+
+static int x86_msix_accept(struct vcpu *v, unsigned long addr)
+{
+    return !!msix_find(v->domain, addr);
+}
+
+static int x86_msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
+                          unsigned long data)
+{
+    const struct domain *d = v->domain;
+    struct vpci_msix *msix = msix_find(d, addr);
+
+    return msix_write(d, msix, addr, len, data);
+}
+
+static int x86_msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
+                         unsigned long *data)
+{
+    const struct domain *d = v->domain;
+    struct vpci_msix *msix = msix_find(d, addr);
+
+    return msix_read(msix, addr, len, data);
+}
+
+static const struct hvm_mmio_ops vpci_msix_table_ops = {
+    .check = x86_msix_accept,
+    .read = x86_msix_read,
+    .write = x86_msix_write,
+};
+
+void register_msix_mmio_handler(struct domain *d)
+{
+    if ( list_empty(&d->arch.hvm.msix_tables) )
+        register_mmio_handler(d, &vpci_msix_table_ops);
+}
+
+void vpci_msix_add_to_msix_table(struct vpci_msix *msix,
+                                 struct domain *d)
+{
+    list_add(&msix->next, &d->arch.hvm.msix_tables);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/msi.h b/xen/include/asm-x86/msi.h
index e228b0f3f3..0a7912e9be 100644
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -148,34 +148,6 @@ int msi_free_irq(struct msi_desc *entry);
  */
 #define NR_HP_RESERVED_VECTORS 	20
 
-#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
-#define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
-#define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
-#define msi_data_reg(base, is64bit)	\
-	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
-#define msi_mask_bits_reg(base, is64bit) \
-	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
-#define msi_pending_bits_reg(base, is64bit) \
-	((base) + PCI_MSI_MASK_BIT + ((is64bit) ? 4 : 0))
-#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
-#define multi_msi_capable(control) \
-	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
-#define multi_msi_enable(control, num) \
-	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
-#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
-#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
-#define msi_enable(control, num) multi_msi_enable(control, num); \
-	control |= PCI_MSI_FLAGS_ENABLE
-
-#define msix_control_reg(base)		(base + PCI_MSIX_FLAGS)
-#define msix_table_offset_reg(base)	(base + PCI_MSIX_TABLE)
-#define msix_pba_offset_reg(base)	(base + PCI_MSIX_PBA)
-#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
-#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
-#define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
-#define msix_unmask(address)	 	(address & ~PCI_MSIX_VECTOR_BITMASK)
-#define msix_mask(address)		(address | PCI_MSIX_VECTOR_BITMASK)
-
 /*
  * MSI Defined Data Structures
  */
diff --git a/xen/include/xen/msi.h b/xen/include/xen/msi.h
index c903d0050c..1c22c9a4a7 100644
--- a/xen/include/xen/msi.h
+++ b/xen/include/xen/msi.h
@@ -3,6 +3,34 @@
 
 #include <xen/pci.h>
 
+#define msi_control_reg(base)       (base + PCI_MSI_FLAGS)
+#define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO)
+#define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI)
+#define msi_data_reg(base, is64bit) \
+	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
+#define msi_mask_bits_reg(base, is64bit) \
+	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
+#define msi_pending_bits_reg(base, is64bit) \
+	((base) + PCI_MSI_MASK_BIT + ((is64bit) ? 4 : 0))
+#define msi_disable(control)        control &= ~PCI_MSI_FLAGS_ENABLE
+#define multi_msi_capable(control) \
+	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
+#define multi_msi_enable(control, num) \
+	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
+#define is_64bit_address(control)   (!!(control & PCI_MSI_FLAGS_64BIT))
+#define is_mask_bit_support(control)    (!!(control & PCI_MSI_FLAGS_MASKBIT))
+#define msi_enable(control, num) multi_msi_enable(control, num); \
+	control |= PCI_MSI_FLAGS_ENABLE
+
+#define msix_control_reg(base)      (base + PCI_MSIX_FLAGS)
+#define msix_table_offset_reg(base) (base + PCI_MSIX_TABLE)
+#define msix_pba_offset_reg(base)   (base + PCI_MSIX_PBA)
+#define msix_enable(control)        control |= PCI_MSIX_FLAGS_ENABLE
+#define msix_disable(control)       control &= ~PCI_MSIX_FLAGS_ENABLE
+#define msix_table_size(control)    ((control & PCI_MSIX_FLAGS_QSIZE)+1)
+#define msix_unmask(address)        (address & ~PCI_MSIX_VECTOR_BITMASK)
+#define msix_mask(address)          (address | PCI_MSIX_VECTOR_BITMASK)
+
 #ifdef CONFIG_HAS_PCI_MSI
 
 #include <asm/msi.h>
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 9ea66e033f..4cb7665d54 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -150,6 +150,11 @@ struct vpci_vcpu {
 };
 
 #ifdef __XEN__
+
+#define VMSIX_ADDR_IN_RANGE(addr, vpci, nr)                               \
+    ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
+     (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
+
 void vpci_dump_msi(void);
 
 /* Make sure there's a hole in the p2m for the MSIX mmio areas. */
@@ -220,6 +225,22 @@ bool vpci_ecam_write(pci_sbdf_t sbdf, unsigned int reg, unsigned int len,
 bool vpci_ecam_read(pci_sbdf_t sbdf, unsigned int reg, unsigned int len,
                     unsigned long *data);
 
+void register_msix_mmio_handler(struct domain *d);
+
+void vpci_msix_add_to_msix_table(struct vpci_msix *msix, struct domain *d);
+
+int msix_write(const struct domain *d, struct vpci_msix *msix,
+               unsigned long addr, unsigned int len, unsigned long data);
+
+int msix_read(struct vpci_msix *msix, unsigned long addr, unsigned int len,
+              unsigned long *data);
+
+u32 vpci_arch_readl(unsigned long addr);
+u64 vpci_arch_readq(unsigned long addr);
+
+void vpci_arch_writel(u32 data, unsigned long addr);
+void vpci_arch_writeq(u64 data, unsigned long addr);
+
 #endif /* __XEN__ */
 
 #else /* !CONFIG_HAS_VPCI */
-- 
2.25.1

Re: [PATCH] xen/vpci: msix: move x86 specific code to x86 file

Posted by Roger Pau Monné 4 years, 1 month ago

On Tue, Dec 14, 2021 at 10:45:17AM +0000, Rahul Singh wrote:
> vpci/msix.c file will be used for arm architecture when vpci msix
> support will be added to ARM, but there is x86 specific code in this
> file.
> 
> Move x86 specific code to the x86_msix.c file to make sure common code
> will be used for other architecture.
> 
> No functional change intended.
> 
> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
> ---
>  xen/arch/x86/msi.c                       |   2 +-
>  xen/drivers/passthrough/amd/iommu_init.c |   1 +
>  xen/drivers/vpci/Makefile                |   1 +
>  xen/drivers/vpci/msi.c                   |   3 +-
>  xen/drivers/vpci/msix.c                  | 134 +++++---------------
>  xen/drivers/vpci/x86_msix.c              | 155 +++++++++++++++++++++++

This should go into xen/arch/x86/hvm/vmsi.c; there's already vPCI MSI
specific code in there.

>  xen/include/asm-x86/msi.h                |  28 ----
>  xen/include/xen/msi.h                    |  28 ++++
>  xen/include/xen/vpci.h                   |  21 +++
>  9 files changed, 239 insertions(+), 134 deletions(-)
>  create mode 100644 xen/drivers/vpci/x86_msix.c
> 
> diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
> index 5febc0ea4b..2b120f897f 100644
> --- a/xen/arch/x86/msi.c
> +++ b/xen/arch/x86/msi.c
> @@ -23,7 +23,7 @@
>  #include <asm/io.h>
>  #include <asm/smp.h>
>  #include <asm/desc.h>
> -#include <asm/msi.h>
> +#include <xen/msi.h>

You likely need to move this up to the xen/ prefixed include block.

>  #include <asm/fixmap.h>
>  #include <asm/p2m.h>
>  #include <mach_apic.h>
> diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
> index 559a734bda..fc385959c7 100644
> --- a/xen/drivers/passthrough/amd/iommu_init.c
> +++ b/xen/drivers/passthrough/amd/iommu_init.c
> @@ -20,6 +20,7 @@
>  #include <xen/acpi.h>
>  #include <xen/delay.h>
>  #include <xen/keyhandler.h>
> +#include <xen/msi.h>
>  
>  #include "iommu.h"

Might be better to replace the asm/msi.h include in iommu.h with
xen/msi.h instead (or just add the xen/msi.h include instead of
replacing it).

>  
> diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
> index 1a1413b93e..543c265199 100644
> --- a/xen/drivers/vpci/Makefile
> +++ b/xen/drivers/vpci/Makefile
> @@ -1,2 +1,3 @@
>  obj-y += vpci.o header.o
>  obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
> +obj-$(CONFIG_X86) += x86_msix.o
> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
> index 5757a7aed2..8fc82a9b8d 100644
> --- a/xen/drivers/vpci/msi.c
> +++ b/xen/drivers/vpci/msi.c
> @@ -16,12 +16,11 @@
>   * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <xen/msi.h>
>  #include <xen/sched.h>
>  #include <xen/softirq.h>
>  #include <xen/vpci.h>
>  
> -#include <asm/msi.h>
> -
>  static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg,
>                               void *data)
>  {
> diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
> index 846f1b8d70..7a9b02f1a5 100644
> --- a/xen/drivers/vpci/msix.c
> +++ b/xen/drivers/vpci/msix.c
> @@ -17,15 +17,24 @@
>   * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <xen/msi.h>
>  #include <xen/sched.h>
>  #include <xen/vpci.h>
>  
> -#include <asm/msi.h>
>  #include <asm/p2m.h>
>  
> -#define VMSIX_ADDR_IN_RANGE(addr, vpci, nr)                               \
> -    ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
> -     (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
> +/*
> + * The return value is different for the MMIO handler on ARM and x86
> + * architecture. To make the code common for both architectures create
> + * generic return code with architecture dependent values.
> + */
> +#ifdef CONFIG_X86
> +#define VPCI_EMUL_OKAY      X86EMUL_OKAY
> +#define VPCI_EMUL_RETRY     X86EMUL_RETRY
> +#else
> +#define VPCI_EMUL_OKAY      1
> +#define VPCI_EMUL_RETRY     VPCI_EMUL_OKAY
> +#endif

Since msix_{read/write} are no longer directly used by the MMIO
handlers you might as well just return an error code (or a boolean)
and let the caller translate that into the per-arch return code.

>  
>  static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg,
>                               void *data)
> @@ -138,29 +147,6 @@ static void control_write(const struct pci_dev *pdev, unsigned int reg,
>          pci_conf_write16(pdev->sbdf, reg, val);
>  }
>  
> -static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
> -{
> -    struct vpci_msix *msix;
> -
> -    list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next )
> -    {
> -        const struct vpci_bar *bars = msix->pdev->vpci->header.bars;
> -        unsigned int i;
> -
> -        for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
> -            if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
> -                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
> -                return msix;
> -    }
> -
> -    return NULL;
> -}
> -
> -static int msix_accept(struct vcpu *v, unsigned long addr)
> -{
> -    return !!msix_find(v->domain, addr);
> -}
> -
>  static bool access_allowed(const struct pci_dev *pdev, unsigned long addr,
>                             unsigned int len)
>  {
> @@ -182,21 +168,19 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
>      return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
>  }
>  
> -static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
> -                     unsigned long *data)
> +int msix_read(struct vpci_msix *msix, unsigned long addr, unsigned int len,

This now requires a vpci_ prefix, since it's a global function.
Plain msix_{read,write} is way too generic.

> +              unsigned long *data)
>  {
> -    const struct domain *d = v->domain;
> -    struct vpci_msix *msix = msix_find(d, addr);
>      const struct vpci_msix_entry *entry;
>      unsigned int offset;
>  
>      *data = ~0ul;
>  
>      if ( !msix )
> -        return X86EMUL_RETRY;
> +        return VPCI_EMUL_RETRY;
>  
>      if ( !access_allowed(msix->pdev, addr, len) )
> -        return X86EMUL_OKAY;
> +        return VPCI_EMUL_OKAY;
>  
>      if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
>      {
> @@ -210,11 +194,11 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>          switch ( len )
>          {
>          case 4:
> -            *data = readl(addr);
> +            *data = vpci_arch_readl(addr);

Why do you need a vpci wrapper around the read/write handlers? AFAICT
arm64 also has {read,write}{l,q}. And you likely want to protect the
64bit read with CONFIG_64BIT if this code is to be made available to
arm32.

>              break;
>  
>          case 8:
> -            *data = readq(addr);
> +            *data = vpci_arch_readq(addr);
>              break;
>  
>          default:
> @@ -222,7 +206,7 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>              break;
>          }
>  
> -        return X86EMUL_OKAY;
> +        return VPCI_EMUL_OKAY;
>      }
>  
>      spin_lock(&msix->pdev->vpci->lock);
> @@ -256,22 +240,20 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>      }
>      spin_unlock(&msix->pdev->vpci->lock);
>  
> -    return X86EMUL_OKAY;
> +    return VPCI_EMUL_OKAY;
>  }
>  
> -static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
> -                      unsigned long data)
> +int msix_write(const struct domain *d, struct vpci_msix *msix,
> +               unsigned long addr, unsigned int len, unsigned long data)
>  {
> -    const struct domain *d = v->domain;
> -    struct vpci_msix *msix = msix_find(d, addr);
>      struct vpci_msix_entry *entry;
>      unsigned int offset;
>  
>      if ( !msix )
> -        return X86EMUL_RETRY;
> +        return VPCI_EMUL_RETRY;
>  
>      if ( !access_allowed(msix->pdev, addr, len) )
> -        return X86EMUL_OKAY;
> +        return VPCI_EMUL_OKAY;
>  
>      if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
>      {
> @@ -281,11 +263,11 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>              switch ( len )
>              {
>              case 4:
> -                writel(data, addr);
> +                vpci_arch_writel(data, addr);
>                  break;
>  
>              case 8:
> -                writeq(data, addr);
> +                vpci_arch_writeq(data, addr);
>                  break;
>  
>              default:
> @@ -294,7 +276,7 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>              }
>          }
>  
> -        return X86EMUL_OKAY;
> +        return VPCI_EMUL_OKAY;
>      }
>  
>      spin_lock(&msix->pdev->vpci->lock);
> @@ -372,60 +354,7 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>      }
>      spin_unlock(&msix->pdev->vpci->lock);
>  
> -    return X86EMUL_OKAY;
> -}
> -
> -static const struct hvm_mmio_ops vpci_msix_table_ops = {
> -    .check = msix_accept,
> -    .read = msix_read,
> -    .write = msix_write,
> -};
> -
> -int vpci_make_msix_hole(const struct pci_dev *pdev)
> -{
> -    struct domain *d = pdev->domain;
> -    unsigned int i;
> -
> -    if ( !pdev->vpci->msix )
> -        return 0;
> -
> -    /* Make sure there's a hole for the MSIX table/PBA in the p2m. */
> -    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->tables); i++ )
> -    {
> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
> -                                     vmsix_table_size(pdev->vpci, i) - 1);
> -
> -        for ( ; start <= end; start++ )
> -        {
> -            p2m_type_t t;
> -            mfn_t mfn = get_gfn_query(d, start, &t);
> -
> -            switch ( t )
> -            {
> -            case p2m_mmio_dm:
> -            case p2m_invalid:
> -                break;
> -            case p2m_mmio_direct:
> -                if ( mfn_x(mfn) == start )
> -                {
> -                    clear_identity_p2m_entry(d, start);
> -                    break;
> -                }
> -                /* fallthrough. */
> -            default:
> -                put_gfn(d, start);
> -                gprintk(XENLOG_WARNING,
> -                        "%pp: existing mapping (mfn: %" PRI_mfn
> -                        "type: %d) at %#lx clobbers MSIX MMIO area\n",
> -                        &pdev->sbdf, mfn_x(mfn), t, start);
> -                return -EEXIST;
> -            }
> -            put_gfn(d, start);
> -        }
> -    }
> -
> -    return 0;
> +    return VPCI_EMUL_OKAY;
>  }
>  
>  static int init_msix(struct pci_dev *pdev)
> @@ -472,11 +401,10 @@ static int init_msix(struct pci_dev *pdev)
>          vpci_msix_arch_init_entry(&msix->entries[i]);
>      }
>  
> -    if ( list_empty(&d->arch.hvm.msix_tables) )
> -        register_mmio_handler(d, &vpci_msix_table_ops);
> +    register_msix_mmio_handler(d);
> +    vpci_msix_add_to_msix_table(msix, d);
>  
>      pdev->vpci->msix = msix;
> -    list_add(&msix->next, &d->arch.hvm.msix_tables);

You could likely do the registering of the handler and the addition of
the table in the same handler: vpci_msix_arch_register or similar.

Thanks, Roger.

Re: [PATCH] xen/vpci: msix: move x86 specific code to x86 file

Posted by Rahul Singh 4 years, 1 month ago

Hi Roger,

Thanks for reviewing the code.

> On 14 Dec 2021, at 12:37 pm, Roger Pau Monné <roger.pau@citrix.com> wrote:
> 
> On Tue, Dec 14, 2021 at 10:45:17AM +0000, Rahul Singh wrote:
>> vpci/msix.c file will be used for arm architecture when vpci msix
>> support will be added to ARM, but there is x86 specific code in this
>> file.
>> 
>> Move x86 specific code to the x86_msix.c file to make sure common code
>> will be used for other architecture.
>> 
>> No functional change intended.
>> 
>> Signed-off-by: Rahul Singh <rahul.singh@arm.com>
>> ---
>> xen/arch/x86/msi.c                       |   2 +-
>> xen/drivers/passthrough/amd/iommu_init.c |   1 +
>> xen/drivers/vpci/Makefile                |   1 +
>> xen/drivers/vpci/msi.c                   |   3 +-
>> xen/drivers/vpci/msix.c                  | 134 +++++---------------
>> xen/drivers/vpci/x86_msix.c              | 155 +++++++++++++++++++++++
> 
> This should go into xen/arch/x86/hvm/vmsi.c; there's already vPCI MSI
> specific code in there.
Ok.
>> xen/include/asm-x86/msi.h                |  28 ----
>> xen/include/xen/msi.h                    |  28 ++++
>> xen/include/xen/vpci.h                   |  21 +++
>> 9 files changed, 239 insertions(+), 134 deletions(-)
>> create mode 100644 xen/drivers/vpci/x86_msix.c
>> 
>> diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
>> index 5febc0ea4b..2b120f897f 100644
>> --- a/xen/arch/x86/msi.c
>> +++ b/xen/arch/x86/msi.c
>> @@ -23,7 +23,7 @@
>> #include <asm/io.h>
>> #include <asm/smp.h>
>> #include <asm/desc.h>
>> -#include <asm/msi.h>
>> +#include <xen/msi.h>
> 
> You likely need to move this up to the xen/ prefixed include block.
Ok.
> 
>> #include <asm/fixmap.h>
>> #include <asm/p2m.h>
>> #include <mach_apic.h>
>> diff --git a/xen/drivers/passthrough/amd/iommu_init.c b/xen/drivers/passthrough/amd/iommu_init.c
>> index 559a734bda..fc385959c7 100644
>> --- a/xen/drivers/passthrough/amd/iommu_init.c
>> +++ b/xen/drivers/passthrough/amd/iommu_init.c
>> @@ -20,6 +20,7 @@
>> #include <xen/acpi.h>
>> #include <xen/delay.h>
>> #include <xen/keyhandler.h>
>> +#include <xen/msi.h>
>> 
>> #include "iommu.h"
> 
> Might be better to replace the asm/msi.h include in iommu.h with
> xen/msi.h instead (or just add the xen/msi.h include instead of
> replacing it).

Ok.
> 
>> 
>> diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
>> index 1a1413b93e..543c265199 100644
>> --- a/xen/drivers/vpci/Makefile
>> +++ b/xen/drivers/vpci/Makefile
>> @@ -1,2 +1,3 @@
>> obj-y += vpci.o header.o
>> obj-$(CONFIG_HAS_PCI_MSI) += msi.o msix.o
>> +obj-$(CONFIG_X86) += x86_msix.o
>> diff --git a/xen/drivers/vpci/msi.c b/xen/drivers/vpci/msi.c
>> index 5757a7aed2..8fc82a9b8d 100644
>> --- a/xen/drivers/vpci/msi.c
>> +++ b/xen/drivers/vpci/msi.c
>> @@ -16,12 +16,11 @@
>>  * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>>  */
>> 
>> +#include <xen/msi.h>
>> #include <xen/sched.h>
>> #include <xen/softirq.h>
>> #include <xen/vpci.h>
>> 
>> -#include <asm/msi.h>
>> -
>> static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg,
>>                              void *data)
>> {
>> diff --git a/xen/drivers/vpci/msix.c b/xen/drivers/vpci/msix.c
>> index 846f1b8d70..7a9b02f1a5 100644
>> --- a/xen/drivers/vpci/msix.c
>> +++ b/xen/drivers/vpci/msix.c
>> @@ -17,15 +17,24 @@
>>  * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>>  */
>> 
>> +#include <xen/msi.h>
>> #include <xen/sched.h>
>> #include <xen/vpci.h>
>> 
>> -#include <asm/msi.h>
>> #include <asm/p2m.h>
>> 
>> -#define VMSIX_ADDR_IN_RANGE(addr, vpci, nr)                               \
>> -    ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
>> -     (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
>> +/*
>> + * The return value is different for the MMIO handler on ARM and x86
>> + * architecture. To make the code common for both architectures create
>> + * generic return code with architecture dependent values.
>> + */
>> +#ifdef CONFIG_X86
>> +#define VPCI_EMUL_OKAY      X86EMUL_OKAY
>> +#define VPCI_EMUL_RETRY     X86EMUL_RETRY
>> +#else
>> +#define VPCI_EMUL_OKAY      1
>> +#define VPCI_EMUL_RETRY     VPCI_EMUL_OKAY
>> +#endif
> 
> Since msix_{read/write} are no longer directly used by the MMIO
> handlers you might as well just return an error code (or a boolean)
> and let the caller translate that into the per-arch return code.

Ok.
> 
>> 
>> static uint32_t control_read(const struct pci_dev *pdev, unsigned int reg,
>>                              void *data)
>> @@ -138,29 +147,6 @@ static void control_write(const struct pci_dev *pdev, unsigned int reg,
>>         pci_conf_write16(pdev->sbdf, reg, val);
>> }
>> 
>> -static struct vpci_msix *msix_find(const struct domain *d, unsigned long addr)
>> -{
>> -    struct vpci_msix *msix;
>> -
>> -    list_for_each_entry ( msix, &d->arch.hvm.msix_tables, next )
>> -    {
>> -        const struct vpci_bar *bars = msix->pdev->vpci->header.bars;
>> -        unsigned int i;
>> -
>> -        for ( i = 0; i < ARRAY_SIZE(msix->tables); i++ )
>> -            if ( bars[msix->tables[i] & PCI_MSIX_BIRMASK].enabled &&
>> -                 VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, i) )
>> -                return msix;
>> -    }
>> -
>> -    return NULL;
>> -}
>> -
>> -static int msix_accept(struct vcpu *v, unsigned long addr)
>> -{
>> -    return !!msix_find(v->domain, addr);
>> -}
>> -
>> static bool access_allowed(const struct pci_dev *pdev, unsigned long addr,
>>                            unsigned int len)
>> {
>> @@ -182,21 +168,19 @@ static struct vpci_msix_entry *get_entry(struct vpci_msix *msix,
>>     return &msix->entries[(addr - start) / PCI_MSIX_ENTRY_SIZE];
>> }
>> 
>> -static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>> -                     unsigned long *data)
>> +int msix_read(struct vpci_msix *msix, unsigned long addr, unsigned int len,
> 
> This now requires a vpci_ prefix, since it's a global function.
> Plain msix_{read,write} is way too generic.
Ack. 
> 
>> +              unsigned long *data)
>> {
>> -    const struct domain *d = v->domain;
>> -    struct vpci_msix *msix = msix_find(d, addr);
>>     const struct vpci_msix_entry *entry;
>>     unsigned int offset;
>> 
>>     *data = ~0ul;
>> 
>>     if ( !msix )
>> -        return X86EMUL_RETRY;
>> +        return VPCI_EMUL_RETRY;
>> 
>>     if ( !access_allowed(msix->pdev, addr, len) )
>> -        return X86EMUL_OKAY;
>> +        return VPCI_EMUL_OKAY;
>> 
>>     if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
>>     {
>> @@ -210,11 +194,11 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>>         switch ( len )
>>         {
>>         case 4:
>> -            *data = readl(addr);
>> +            *data = vpci_arch_readl(addr);
> 
> Why do you need a vpci wrapper around the read/write handlers? AFAICT
> arm64 also has {read,write}{l,q}. And you likely want to protect the
> 64bit read with CONFIG_64BIT if this code is to be made available to
> arm32.

I need the wrapper because the {read,write}{l,q} function argument is different for ARM and x86.
On ARM the {read,write}{l,q} function argument is a pointer to the address, whereas on x86 the
{read,write}{l,q} function argument is the address itself.

> 
>>             break;
>> 
>>         case 8:
>> -            *data = readq(addr);
>> +            *data = vpci_arch_readq(addr);
>>             break;
>> 
>>         default:
>> @@ -222,7 +206,7 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>>             break;
>>         }
>> 
>> -        return X86EMUL_OKAY;
>> +        return VPCI_EMUL_OKAY;
>>     }
>> 
>>     spin_lock(&msix->pdev->vpci->lock);
>> @@ -256,22 +240,20 @@ static int msix_read(struct vcpu *v, unsigned long addr, unsigned int len,
>>     }
>>     spin_unlock(&msix->pdev->vpci->lock);
>> 
>> -    return X86EMUL_OKAY;
>> +    return VPCI_EMUL_OKAY;
>> }
>> 
>> -static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>> -                      unsigned long data)
>> +int msix_write(const struct domain *d, struct vpci_msix *msix,
>> +               unsigned long addr, unsigned int len, unsigned long data)
>> {
>> -    const struct domain *d = v->domain;
>> -    struct vpci_msix *msix = msix_find(d, addr);
>>     struct vpci_msix_entry *entry;
>>     unsigned int offset;
>> 
>>     if ( !msix )
>> -        return X86EMUL_RETRY;
>> +        return VPCI_EMUL_RETRY;
>> 
>>     if ( !access_allowed(msix->pdev, addr, len) )
>> -        return X86EMUL_OKAY;
>> +        return VPCI_EMUL_OKAY;
>> 
>>     if ( VMSIX_ADDR_IN_RANGE(addr, msix->pdev->vpci, VPCI_MSIX_PBA) )
>>     {
>> @@ -281,11 +263,11 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>>             switch ( len )
>>             {
>>             case 4:
>> -                writel(data, addr);
>> +                vpci_arch_writel(data, addr);
>>                 break;
>> 
>>             case 8:
>> -                writeq(data, addr);
>> +                vpci_arch_writeq(data, addr);
>>                 break;
>> 
>>             default:
>> @@ -294,7 +276,7 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>>             }
>>         }
>> 
>> -        return X86EMUL_OKAY;
>> +        return VPCI_EMUL_OKAY;
>>     }
>> 
>>     spin_lock(&msix->pdev->vpci->lock);
>> @@ -372,60 +354,7 @@ static int msix_write(struct vcpu *v, unsigned long addr, unsigned int len,
>>     }
>>     spin_unlock(&msix->pdev->vpci->lock);
>> 
>> -    return X86EMUL_OKAY;
>> -}
>> -
>> -static const struct hvm_mmio_ops vpci_msix_table_ops = {
>> -    .check = msix_accept,
>> -    .read = msix_read,
>> -    .write = msix_write,
>> -};
>> -
>> -int vpci_make_msix_hole(const struct pci_dev *pdev)
>> -{
>> -    struct domain *d = pdev->domain;
>> -    unsigned int i;
>> -
>> -    if ( !pdev->vpci->msix )
>> -        return 0;
>> -
>> -    /* Make sure there's a hole for the MSIX table/PBA in the p2m. */
>> -    for ( i = 0; i < ARRAY_SIZE(pdev->vpci->msix->tables); i++ )
>> -    {
>> -        unsigned long start = PFN_DOWN(vmsix_table_addr(pdev->vpci, i));
>> -        unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
>> -                                     vmsix_table_size(pdev->vpci, i) - 1);
>> -
>> -        for ( ; start <= end; start++ )
>> -        {
>> -            p2m_type_t t;
>> -            mfn_t mfn = get_gfn_query(d, start, &t);
>> -
>> -            switch ( t )
>> -            {
>> -            case p2m_mmio_dm:
>> -            case p2m_invalid:
>> -                break;
>> -            case p2m_mmio_direct:
>> -                if ( mfn_x(mfn) == start )
>> -                {
>> -                    clear_identity_p2m_entry(d, start);
>> -                    break;
>> -                }
>> -                /* fallthrough. */
>> -            default:
>> -                put_gfn(d, start);
>> -                gprintk(XENLOG_WARNING,
>> -                        "%pp: existing mapping (mfn: %" PRI_mfn
>> -                        "type: %d) at %#lx clobbers MSIX MMIO area\n",
>> -                        &pdev->sbdf, mfn_x(mfn), t, start);
>> -                return -EEXIST;
>> -            }
>> -            put_gfn(d, start);
>> -        }
>> -    }
>> -
>> -    return 0;
>> +    return VPCI_EMUL_OKAY;
>> }
>> 
>> static int init_msix(struct pci_dev *pdev)
>> @@ -472,11 +401,10 @@ static int init_msix(struct pci_dev *pdev)
>>         vpci_msix_arch_init_entry(&msix->entries[i]);
>>     }
>> 
>> -    if ( list_empty(&d->arch.hvm.msix_tables) )
>> -        register_mmio_handler(d, &vpci_msix_table_ops);
>> +    register_msix_mmio_handler(d);
>> +    vpci_msix_add_to_msix_table(msix, d);
>> 
>>     pdev->vpci->msix = msix;
>> -    list_add(&msix->next, &d->arch.hvm.msix_tables);
> 
> You could likely do the registering of the handler and the addition of
> the table in the same handler: vpci_msix_arch_register or similar.

Ok.

Regards,
Rahul
> 
> Thanks, Roger.

Re: [PATCH] xen/vpci: msix: move x86 specific code to x86 file

Posted by Jan Beulich 4 years, 1 month ago

On 14.12.2021 11:45, Rahul Singh wrote:
> --- a/xen/drivers/vpci/msix.c
> +++ b/xen/drivers/vpci/msix.c
> @@ -17,15 +17,24 @@
>   * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>   */
>  
> +#include <xen/msi.h>
>  #include <xen/sched.h>
>  #include <xen/vpci.h>
>  
> -#include <asm/msi.h>
>  #include <asm/p2m.h>
>  
> -#define VMSIX_ADDR_IN_RANGE(addr, vpci, nr)                               \
> -    ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
> -     (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
> +/*
> + * The return value is different for the MMIO handler on ARM and x86
> + * architecture. To make the code common for both architectures create
> + * generic return code with architecture dependent values.
> + */
> +#ifdef CONFIG_X86
> +#define VPCI_EMUL_OKAY      X86EMUL_OKAY
> +#define VPCI_EMUL_RETRY     X86EMUL_RETRY
> +#else
> +#define VPCI_EMUL_OKAY      1
> +#define VPCI_EMUL_RETRY     VPCI_EMUL_OKAY
> +#endif

In addition to what Roger has said, at the example of the above I think
you want to split this change. The change in return value naming could
likely quite well be a separate thing. And then it'll be easier to see
which other suggested changes are really movement of x86-specific stuff
(looking over it I wasn't convinced everything you move really is).

> @@ -472,11 +401,10 @@ static int init_msix(struct pci_dev *pdev)
>          vpci_msix_arch_init_entry(&msix->entries[i]);
>      }
>  
> -    if ( list_empty(&d->arch.hvm.msix_tables) )
> -        register_mmio_handler(d, &vpci_msix_table_ops);
> +    register_msix_mmio_handler(d);
> +    vpci_msix_add_to_msix_table(msix, d);
>  
>      pdev->vpci->msix = msix;
> -    list_add(&msix->next, &d->arch.hvm.msix_tables);
>  
>      return 0;

May I ask that you don't alter the order of operations? I take it that
vpci_msix_add_to_msix_table() is the replacement of the list_add().
That should occur only after pdev->vpci has been updated. I could in
fact imagine that in cases like this one for Arm barriers may need
adding.

> --- /dev/null
> +++ b/xen/drivers/vpci/x86_msix.c
> @@ -0,0 +1,155 @@
> +/*
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms and conditions of the GNU General Public
> + * License, version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include <xen/sched.h>
> +#include <xen/vpci.h>
> +
> +#include <asm/msi.h>
> +#include <asm/p2m.h>
> +
> +u32 vpci_arch_readl(unsigned long addr)

Nit: No new uses of u<N> please; these are being phased out, with
uint<N>_t being the intended types.

> +{
> +    return readl(addr);
> +}
> +
> +u64 vpci_arch_readq(unsigned long addr)
> +{
> +    return readq(addr);
> +}
> +
> +void vpci_arch_writel(u32 data, unsigned long addr)
> +{
> +    writel(data, addr);
> +}
> +
> +void vpci_arch_writeq(u64 data, unsigned long addr)
> +{
> +    writeq(data, addr);
> +}

Functions like these (if, as Roger said, they need abstracting in the
first place) or ...

> +void register_msix_mmio_handler(struct domain *d)
> +{
> +    if ( list_empty(&d->arch.hvm.msix_tables) )
> +        register_mmio_handler(d, &vpci_msix_table_ops);
> +}
> +
> +void vpci_msix_add_to_msix_table(struct vpci_msix *msix,
> +                                 struct domain *d)
> +{
> +    list_add(&msix->next, &d->arch.hvm.msix_tables);
> +}

... these would imo better be inline helpers.

> --- a/xen/include/asm-x86/msi.h
> +++ b/xen/include/asm-x86/msi.h
> @@ -148,34 +148,6 @@ int msi_free_irq(struct msi_desc *entry);
>   */
>  #define NR_HP_RESERVED_VECTORS 	20
>  
> -#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
> -#define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
> -#define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
> -#define msi_data_reg(base, is64bit)	\
> -	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
> -#define msi_mask_bits_reg(base, is64bit) \
> -	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
> -#define msi_pending_bits_reg(base, is64bit) \
> -	((base) + PCI_MSI_MASK_BIT + ((is64bit) ? 4 : 0))
> -#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
> -#define multi_msi_capable(control) \
> -	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
> -#define multi_msi_enable(control, num) \
> -	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
> -#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
> -#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
> -#define msi_enable(control, num) multi_msi_enable(control, num); \
> -	control |= PCI_MSI_FLAGS_ENABLE
> -
> -#define msix_control_reg(base)		(base + PCI_MSIX_FLAGS)
> -#define msix_table_offset_reg(base)	(base + PCI_MSIX_TABLE)
> -#define msix_pba_offset_reg(base)	(base + PCI_MSIX_PBA)
> -#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
> -#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
> -#define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
> -#define msix_unmask(address)	 	(address & ~PCI_MSIX_VECTOR_BITMASK)
> -#define msix_mask(address)		(address | PCI_MSIX_VECTOR_BITMASK)
> -
>  /*
>   * MSI Defined Data Structures
>   */
> diff --git a/xen/include/xen/msi.h b/xen/include/xen/msi.h
> index c903d0050c..1c22c9a4a7 100644
> --- a/xen/include/xen/msi.h
> +++ b/xen/include/xen/msi.h
> @@ -3,6 +3,34 @@
>  
>  #include <xen/pci.h>
>  
> +#define msi_control_reg(base)       (base + PCI_MSI_FLAGS)
> +#define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO)
> +#define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI)
> +#define msi_data_reg(base, is64bit) \
> +	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )

As you move this code, please tidy it style-wise. For the construct
here, for example this would mean

#define msi_data_reg(base, is64bit) \
    ((is64bit) ? (base) + PCI_MSI_DATA_64 : (base) + PCI_MSI_DATA_32)

or perhaps even

#define msi_data_reg(base, is64bit) \
    ((base) + ((is64bit) ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32))

Further items would want similar adjustments.

Jan

Re: [PATCH] xen/vpci: msix: move x86 specific code to x86 file

Posted by Rahul Singh 4 years, 1 month ago

Hi Jan,

Thanks for reviewing the code.

> On 14 Dec 2021, at 2:15 pm, Jan Beulich <jbeulich@suse.com> wrote:
> 
> On 14.12.2021 11:45, Rahul Singh wrote:
>> --- a/xen/drivers/vpci/msix.c
>> +++ b/xen/drivers/vpci/msix.c
>> @@ -17,15 +17,24 @@
>>  * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>>  */
>> 
>> +#include <xen/msi.h>
>> #include <xen/sched.h>
>> #include <xen/vpci.h>
>> 
>> -#include <asm/msi.h>
>> #include <asm/p2m.h>
>> 
>> -#define VMSIX_ADDR_IN_RANGE(addr, vpci, nr)                               \
>> -    ((addr) >= vmsix_table_addr(vpci, nr) &&                              \
>> -     (addr) < vmsix_table_addr(vpci, nr) + vmsix_table_size(vpci, nr))
>> +/*
>> + * The return value is different for the MMIO handler on ARM and x86
>> + * architecture. To make the code common for both architectures create
>> + * generic return code with architecture dependent values.
>> + */
>> +#ifdef CONFIG_X86
>> +#define VPCI_EMUL_OKAY      X86EMUL_OKAY
>> +#define VPCI_EMUL_RETRY     X86EMUL_RETRY
>> +#else
>> +#define VPCI_EMUL_OKAY      1
>> +#define VPCI_EMUL_RETRY     VPCI_EMUL_OKAY
>> +#endif
> 
> In addition to what Roger has said, at the example of the above I think
> you want to split this change. The change in return value naming could
> likely quite well be a separate thing. And then it'll be easier to see
> which other suggested changes are really movement of x86-specific stuff
> (looking over it I wasn't convinced everything you move really is).
> 

Ack. I will split the changes in the next version.

>> @@ -472,11 +401,10 @@ static int init_msix(struct pci_dev *pdev)
>>         vpci_msix_arch_init_entry(&msix->entries[i]);
>>     }
>> 
>> -    if ( list_empty(&d->arch.hvm.msix_tables) )
>> -        register_mmio_handler(d, &vpci_msix_table_ops);
>> +    register_msix_mmio_handler(d);
>> +    vpci_msix_add_to_msix_table(msix, d);
>> 
>>     pdev->vpci->msix = msix;
>> -    list_add(&msix->next, &d->arch.hvm.msix_tables);
>> 
>>     return 0;
> 
> May I ask that you don't alter the order of operations? I take it that
> vpci_msix_add_to_msix_table() is the replacement of the list_add().
> That should occur only after pdev->vpci has been updated. I could in
> fact imagine that in cases like this one for Arm barriers may need
> adding.

Yes, I will not change the order; I will fix this in the next version.
> 
>> --- /dev/null
>> +++ b/xen/drivers/vpci/x86_msix.c
>> @@ -0,0 +1,155 @@
>> +/*
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms and conditions of the GNU General Public
>> + * License, version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public
>> + * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include <xen/sched.h>
>> +#include <xen/vpci.h>
>> +
>> +#include <asm/msi.h>
>> +#include <asm/p2m.h>
>> +
>> +u32 vpci_arch_readl(unsigned long addr)
> 
> Nit: No new uses of u<N> please; these are being phased out, with
> uint<N>_t being the intended types.

Ack .
> 
>> +{
>> +    return readl(addr);
>> +}
>> +
>> +u64 vpci_arch_readq(unsigned long addr)
>> +{
>> +    return readq(addr);
>> +}
>> +
>> +void vpci_arch_writel(u32 data, unsigned long addr)
>> +{
>> +    writel(data, addr);
>> +}
>> +
>> +void vpci_arch_writeq(u64 data, unsigned long addr)
>> +{
>> +    writeq(data, addr);
>> +}
> 
> Functions like these (if, as Roger said, they need abstracting in the
> first place) or ...
> 
>> +void register_msix_mmio_handler(struct domain *d)
>> +{
>> +    if ( list_empty(&d->arch.hvm.msix_tables) )
>> +        register_mmio_handler(d, &vpci_msix_table_ops);
>> +}
>> +
>> +void vpci_msix_add_to_msix_table(struct vpci_msix *msix,
>> +                                 struct domain *d)
>> +{
>> +    list_add(&msix->next, &d->arch.hvm.msix_tables);
>> +}
> 
> ... these would imo better be inline helpers.

Ack.
> 
>> --- a/xen/include/asm-x86/msi.h
>> +++ b/xen/include/asm-x86/msi.h
>> @@ -148,34 +148,6 @@ int msi_free_irq(struct msi_desc *entry);
>>  */
>> #define NR_HP_RESERVED_VECTORS 	20
>> 
>> -#define msi_control_reg(base)		(base + PCI_MSI_FLAGS)
>> -#define msi_lower_address_reg(base)	(base + PCI_MSI_ADDRESS_LO)
>> -#define msi_upper_address_reg(base)	(base + PCI_MSI_ADDRESS_HI)
>> -#define msi_data_reg(base, is64bit)	\
>> -	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
>> -#define msi_mask_bits_reg(base, is64bit) \
>> -	( (is64bit == 1) ? base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4)
>> -#define msi_pending_bits_reg(base, is64bit) \
>> -	((base) + PCI_MSI_MASK_BIT + ((is64bit) ? 4 : 0))
>> -#define msi_disable(control)		control &= ~PCI_MSI_FLAGS_ENABLE
>> -#define multi_msi_capable(control) \
>> -	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
>> -#define multi_msi_enable(control, num) \
>> -	control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE);
>> -#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
>> -#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
>> -#define msi_enable(control, num) multi_msi_enable(control, num); \
>> -	control |= PCI_MSI_FLAGS_ENABLE
>> -
>> -#define msix_control_reg(base)		(base + PCI_MSIX_FLAGS)
>> -#define msix_table_offset_reg(base)	(base + PCI_MSIX_TABLE)
>> -#define msix_pba_offset_reg(base)	(base + PCI_MSIX_PBA)
>> -#define msix_enable(control)	 	control |= PCI_MSIX_FLAGS_ENABLE
>> -#define msix_disable(control)	 	control &= ~PCI_MSIX_FLAGS_ENABLE
>> -#define msix_table_size(control) 	((control & PCI_MSIX_FLAGS_QSIZE)+1)
>> -#define msix_unmask(address)	 	(address & ~PCI_MSIX_VECTOR_BITMASK)
>> -#define msix_mask(address)		(address | PCI_MSIX_VECTOR_BITMASK)
>> -
>> /*
>>  * MSI Defined Data Structures
>>  */
>> diff --git a/xen/include/xen/msi.h b/xen/include/xen/msi.h
>> index c903d0050c..1c22c9a4a7 100644
>> --- a/xen/include/xen/msi.h
>> +++ b/xen/include/xen/msi.h
>> @@ -3,6 +3,34 @@
>> 
>> #include <xen/pci.h>
>> 
>> +#define msi_control_reg(base)       (base + PCI_MSI_FLAGS)
>> +#define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO)
>> +#define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI)
>> +#define msi_data_reg(base, is64bit) \
>> +	( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 )
> 
> As you move this code, please tidy it style-wise. For the construct
> here, for example this would mean
> 
> #define msi_data_reg(base, is64bit) \
>    ((is64bit) ? (base) + PCI_MSI_DATA_64 : (base) + PCI_MSI_DATA_32)
> 
> or perhaps even
> 
> #define msi_data_reg(base, is64bit) \
>    ((base) + ((is64bit) ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32))
> 
> Further items would want similar adjustments.

Ok. Let me try to fix this in the next version.

Regards,
Rahul
> 
> Jan
>