From: Jason Gunthorpe <jgg@nvidia.com>
All the iommu cases simply want to override the MSI page's address with
the IOVA that was mapped through the iommu. This doesn't need a cookie
pointer, we just need to store the IOVA and its page size in the msi_desc.
Instead, provide msi_desc_set_iommu_msi_iova(), which allows the IOMMU side
to specify the IOVA at which the MSI page is placed during
iommu_dma_prepare_msi(). This is stored in the msi_desc, and
iommu_dma_compose_msi_msg() then becomes a simple inline that sets
address_hi/lo.
The next patch will correct the naming.
This is done because we cannot correctly lock access to group->domain in
the atomic context that iommu_dma_compose_msi_msg() is called under. Today
the missing locking is tolerable because dma-iommu.c operates under the
assumption that the domain does not change while a driver is probed.
However, iommufd now permits the domain to change while the driver is
probed, and VFIO userspace can create races between IRQ changes calling
iommu_dma_prepare_msi()/iommu_dma_compose_msi_msg() and changing/freeing
the iommu_domain.
Removing the pointer, and critically, the call to
iommu_get_domain_for_dev() during compose resolves this race.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
include/linux/iommu.h | 6 ------
include/linux/msi.h | 45 +++++++++++++++++++++++----------------
drivers/iommu/dma-iommu.c | 30 +++++---------------------
3 files changed, 32 insertions(+), 49 deletions(-)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 318d27841130..3a4215966c1b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1513,7 +1513,6 @@ static inline void iommu_debugfs_setup(void) {}
int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
-void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg);
#else /* CONFIG_IOMMU_DMA */
@@ -1529,11 +1528,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a
{
return 0;
}
-
-static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
-{
-}
-
#endif /* CONFIG_IOMMU_DMA */
/*
diff --git a/include/linux/msi.h b/include/linux/msi.h
index b10093c4d00e..d442b4a69d56 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -184,7 +184,8 @@ struct msi_desc {
struct msi_msg msg;
struct irq_affinity_desc *affinity;
#ifdef CONFIG_IRQ_MSI_IOMMU
- const void *iommu_cookie;
+ u64 iommu_msi_iova : 58;
+ u64 iommu_msi_page_shift : 6;
#endif
#ifdef CONFIG_SYSFS
struct device_attribute *sysfs_attrs;
@@ -285,28 +286,36 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
#define msi_desc_to_dev(desc) ((desc)->dev)
-#ifdef CONFIG_IRQ_MSI_IOMMU
-static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc)
-{
- return desc->iommu_cookie;
-}
-
-static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
- const void *iommu_cookie)
+static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc,
+ u64 msi_iova,
+ unsigned int page_shift)
{
- desc->iommu_cookie = iommu_cookie;
-}
-#else
-static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc)
-{
- return NULL;
+#ifdef CONFIG_IRQ_MSI_IOMMU
+ desc->iommu_msi_iova = msi_iova >> page_shift;
+ desc->iommu_msi_page_shift = page_shift;
+#endif
}
-static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
- const void *iommu_cookie)
+/**
+ * iommu_dma_compose_msi_msg() - Apply translation to an MSI message
+ * @desc: MSI descriptor prepared by iommu_dma_prepare_msi()
+ * @msg: MSI message containing target physical address
+ */
+static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc,
+ struct msi_msg *msg)
{
-}
+#ifdef CONFIG_IRQ_MSI_IOMMU
+ if (desc->iommu_msi_page_shift) {
+ u64 msi_iova = desc->iommu_msi_iova
+ << desc->iommu_msi_page_shift;
+
+ msg->address_hi = upper_32_bits(msi_iova);
+ msg->address_lo = lower_32_bits(msi_iova) |
+ (msg->address_lo &
+ ((1 << desc->iommu_msi_page_shift) - 1));
+ }
#endif
+}
int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
struct msi_desc *init_desc);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 2a9fa0c8cc00..bf91e014d179 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1815,7 +1815,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
static DEFINE_MUTEX(msi_prepare_lock); /* see below */
if (!domain || !domain->iova_cookie) {
- desc->iommu_cookie = NULL;
+ msi_desc_set_iommu_msi_iova(desc, 0, 0);
return 0;
}
@@ -1827,33 +1827,13 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
mutex_lock(&msi_prepare_lock);
msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
mutex_unlock(&msi_prepare_lock);
-
- msi_desc_set_iommu_cookie(desc, msi_page);
-
if (!msi_page)
return -ENOMEM;
- return 0;
-}
-/**
- * iommu_dma_compose_msi_msg() - Apply translation to an MSI message
- * @desc: MSI descriptor prepared by iommu_dma_prepare_msi()
- * @msg: MSI message containing target physical address
- */
-void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
-{
- struct device *dev = msi_desc_to_dev(desc);
- const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
- const struct iommu_dma_msi_page *msi_page;
-
- msi_page = msi_desc_get_iommu_cookie(desc);
-
- if (!domain || !domain->iova_cookie || WARN_ON(!msi_page))
- return;
-
- msg->address_hi = upper_32_bits(msi_page->iova);
- msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1;
- msg->address_lo += lower_32_bits(msi_page->iova);
+ msi_desc_set_iommu_msi_iova(
+ desc, msi_page->iova,
+ ilog2(cookie_msi_granule(domain->iova_cookie)));
+ return 0;
}
static int iommu_dma_init(void)
--
2.43.0
Hi Nicolin,
On 1/11/25 4:32 AM, Nicolin Chen wrote:
> From: Jason Gunthorpe <jgg@nvidia.com>
>
> All the iommu cases simply want to override the MSI page's address with
those which translate MSIs
> the IOVA that was mapped through the iommu. This doesn't need a cookie
> pointer, we just need to store the IOVA and its page size in the msi_desc.
>
> Instead provide msi_desc_set_iommu_msi_iova() which allows the IOMMU side
> to specify the IOVA that the MSI page is placed during
> iommu_dma_prepare(). This is stored in the msi_desc and then
iommu_dma_prepare_msi()
> iommu_dma_compose_msi_msg() is a simple inline that sets address_hi/lo.
>
> The next patch will correct the naming.
>
> This is done because we cannot correctly lock access to group->domain in
> the atomic context that iommu_dma_compose_msi_msg() is called under. Today
> the locking miss is tolerable because dma_iommu.c operates under an
> assumption that the domain does not change while a driver is probed.
>
> However iommufd now permits the domain to change while the driver is
> probed and VFIO userspace can create races with IRQ changes calling
> iommu_dma_prepare/compose_msi_msg() and changing/freeing the iommu_domain.
and is it safe in iommu_dma_prepare_msi()?
>
> Removing the pointer, and critically, the call to
> iommu_get_domain_for_dev() during compose resolves this race.
>
> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
> ---
> include/linux/iommu.h | 6 ------
> include/linux/msi.h | 45 +++++++++++++++++++++++----------------
> drivers/iommu/dma-iommu.c | 30 +++++---------------------
> 3 files changed, 32 insertions(+), 49 deletions(-)
>
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index 318d27841130..3a4215966c1b 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -1513,7 +1513,6 @@ static inline void iommu_debugfs_setup(void) {}
> int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
>
> int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
> -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg);
>
> #else /* CONFIG_IOMMU_DMA */
>
> @@ -1529,11 +1528,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a
> {
> return 0;
> }
> -
> -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> -{
> -}
> -
> #endif /* CONFIG_IOMMU_DMA */
>
> /*
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index b10093c4d00e..d442b4a69d56 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -184,7 +184,8 @@ struct msi_desc {
> struct msi_msg msg;
> struct irq_affinity_desc *affinity;
> #ifdef CONFIG_IRQ_MSI_IOMMU
> - const void *iommu_cookie;
you may add kernel doc comments above
> + u64 iommu_msi_iova : 58;
> + u64 iommu_msi_page_shift : 6;
> #endif
> #ifdef CONFIG_SYSFS
> struct device_attribute *sysfs_attrs;
> @@ -285,28 +286,36 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
>
> #define msi_desc_to_dev(desc) ((desc)->dev)
>
> -#ifdef CONFIG_IRQ_MSI_IOMMU
> -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc)
> -{
> - return desc->iommu_cookie;
> -}
> -
> -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
> - const void *iommu_cookie)
> +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc,
> + u64 msi_iova,
> + unsigned int page_shift)
> {
> - desc->iommu_cookie = iommu_cookie;
> -}
> -#else
> -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc)
> -{
> - return NULL;
> +#ifdef CONFIG_IRQ_MSI_IOMMU
> + desc->iommu_msi_iova = msi_iova >> page_shift;
> + desc->iommu_msi_page_shift = page_shift;
> +#endif
> }
>
> -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc,
> - const void *iommu_cookie)
> +/**
> + * iommu_dma_compose_msi_msg() - Apply translation to an MSI message
> + * @desc: MSI descriptor prepared by iommu_dma_prepare_msi()
> + * @msg: MSI message containing target physical address
> + */
> +static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc,
> + struct msi_msg *msg)
> {
> -}
> +#ifdef CONFIG_IRQ_MSI_IOMMU
> + if (desc->iommu_msi_page_shift) {
> + u64 msi_iova = desc->iommu_msi_iova
> + << desc->iommu_msi_page_shift;
> +
> + msg->address_hi = upper_32_bits(msi_iova);
> + msg->address_lo = lower_32_bits(msi_iova) |
> + (msg->address_lo &
> + ((1 << desc->iommu_msi_page_shift) - 1));
> + }
> #endif
> +}
>
> int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
> struct msi_desc *init_desc);
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 2a9fa0c8cc00..bf91e014d179 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1815,7 +1815,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> static DEFINE_MUTEX(msi_prepare_lock); /* see below */
>
> if (!domain || !domain->iova_cookie) {
> - desc->iommu_cookie = NULL;
> + msi_desc_set_iommu_msi_iova(desc, 0, 0);
> return 0;
> }
>
> @@ -1827,33 +1827,13 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> mutex_lock(&msi_prepare_lock);
> msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain);
> mutex_unlock(&msi_prepare_lock);
> -
> - msi_desc_set_iommu_cookie(desc, msi_page);
> -
> if (!msi_page)
> return -ENOMEM;
> - return 0;
> -}
>
> -/**
> - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message
> - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi()
> - * @msg: MSI message containing target physical address
> - */
> -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
> -{
> - struct device *dev = msi_desc_to_dev(desc);
> - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> - const struct iommu_dma_msi_page *msi_page;
> -
> - msi_page = msi_desc_get_iommu_cookie(desc);
> -
> - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page))
> - return;
> -
> - msg->address_hi = upper_32_bits(msi_page->iova);
> - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1;
> - msg->address_lo += lower_32_bits(msi_page->iova);
> + msi_desc_set_iommu_msi_iova(
> + desc, msi_page->iova,
> + ilog2(cookie_msi_granule(domain->iova_cookie)));
> + return 0;
> }
>
> static int iommu_dma_init(void)
On Thu, Jan 23, 2025 at 06:10:48PM +0100, Eric Auger wrote:
> > However iommufd now permits the domain to change while the driver is
> > probed and VFIO userspace can create races with IRQ changes calling
> > iommu_dma_prepare/compose_msi_msg() and changing/freeing the iommu_domain.
> and is it safe in iommu_dma_prepare_msi()?
iommu_dma_prepare_msi() takes the group mutex:
int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
{
struct device *dev = msi_desc_to_dev(desc);
struct iommu_group *group = dev->iommu_group;
mutex_lock(&group->mutex);
if (group->domain && group->domain->sw_msi)
ret = group->domain->sw_msi(group->domain, desc, msi_addr);
Which prevents changing domain attachments during execution.
For iommufd, if the domain attachment changes immediately after
iommu_dma_prepare_msi() unlocks, then the information given to
msi_desc_set_iommu_msi_iova() is still valid on the new domain.
This is because the iommufd implementation of sw_msi keeps the same
IOVA for the same ITS page globally across all domains. Any racing
change of domain will attach a new domain with the right ITS IOVA
already mapped and populated.
This is why this series stops using the domain pointer as a cookie
inside the msi_desc: immediately after the group->mutex is unlocked,
a new domain can be attached and the old domain can be freed, which
would UAF the domain pointer in the cookie.
> > diff --git a/include/linux/msi.h b/include/linux/msi.h
> > index b10093c4d00e..d442b4a69d56 100644
> > --- a/include/linux/msi.h
> > +++ b/include/linux/msi.h
> > @@ -184,7 +184,8 @@ struct msi_desc {
> > struct msi_msg msg;
> > struct irq_affinity_desc *affinity;
> > #ifdef CONFIG_IRQ_MSI_IOMMU
> > - const void *iommu_cookie;
> you may add kernel doc comments above
I wondered if internal stuff was not being documented, as the old
iommu_cookie didn't have a comment.
But sure:
* @iommu_msi_iova: Optional IOVA from the IOMMU to override the msi_addr.
* Only used if iommu_msi_page_shift != 0
* @iommu_msi_page_shift: Indicates how many bits of the original address
* should be preserved when using iommu_msi_iova.
Jason
Hi,
On 1/23/25 7:48 PM, Jason Gunthorpe wrote:
> On Thu, Jan 23, 2025 at 06:10:48PM +0100, Eric Auger wrote:
>
>>> However iommufd now permits the domain to change while the driver is
>>> probed and VFIO userspace can create races with IRQ changes calling
>>> iommu_dma_prepare/compose_msi_msg() and changing/freeing the iommu_domain.
>> and is it safe in iommu_dma_prepare_msi()?
> iommu_dma_prepare_msi() takes the group mutex:
>
> int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
> {
> struct device *dev = msi_desc_to_dev(desc);
> struct iommu_group *group = dev->iommu_group;
>
> mutex_lock(&group->mutex);
> if (group->domain && group->domain->sw_msi)
> ret = group->domain->sw_msi(group->domain, desc, msi_addr);
>
> Which prevents changing domain attachments during execution.
>
> For iommufd, if the domain attachment changes immediately after
> iommu_dma_prepare_msi() unlocks, then the information given to
> msi_desc_set_iommu_msi_iova() is still valid on the new domain.
>
> This is because the iommufd implementation of sw_msi keeps the same
> IOVA for the same ITS page globally across all domains. Any racing
> change of domain will attach a new domain with the right ITS IOVA
> already mapped and populated.
> It is why this series stops using the domain pointer as a cookie
> inside the msi_desc, immediately after the group->mutex is unlocked
> a new domain can be attached and the old domain can be freed, which
> would UAF the domain pointer in the cookie.
OK thank you for the clarification
>
>>> diff --git a/include/linux/msi.h b/include/linux/msi.h
>>> index b10093c4d00e..d442b4a69d56 100644
>>> --- a/include/linux/msi.h
>>> +++ b/include/linux/msi.h
>>> @@ -184,7 +184,8 @@ struct msi_desc {
>>> struct msi_msg msg;
>>> struct irq_affinity_desc *affinity;
>>> #ifdef CONFIG_IRQ_MSI_IOMMU
>>> - const void *iommu_cookie;
>> you may add kernel doc comments above
> I wondered if internal stuff was not being documented as the old
> iommu_cookie didn't have a comment..
>
> But sure:
>
> * @iommu_msi_iova: Optional IOVA from the IOMMU to overide the msi_addr.
> * Only used if iommu_msi_page_shift != 0
> * @iommu_msi_page_shift: Indicates how many bits of the original address
> * should be preserved when using iommu_msi_iova.
Sounds good
Eric
>
> Jason
>
© 2016 - 2026 Red Hat, Inc.