[v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size

Yang Shi posted 1 patch 1 month, 3 weeks ago
There is a newer version of this series
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +++++---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +++++
2 files changed, 10 insertions(+), 3 deletions(-)
[v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago
The commit ce410410f1a7 ("iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx()")
calculated the last index of L1 stream table by 1 << smmu->sid_bits. 1
is 32 bit value.
However some platforms, for example, AmpereOne, have 32-bit stream id size.
This resulted in ouf-of-bound shift.  The disassembly of shift is:

    ldr     w2, [x19, 828]  //, smmu_7(D)->sid_bits
    mov     w20, 1
    lsl     w20, w20, w2

According to ARM spec, if the registers are 32 bit, the instruction actually
does:
    dest = src << (shift % 32)

So it actually shifted by zero bit.

This caused v6.12-rc1 failed to boot on AmpereOne and other platform [1].

UBSAN also reported:

UBSAN: shift-out-of-bounds in drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:3628:29
shift exponent 32 is too large for 32-bit type 'int'
CPU: 70 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc1 #4
Hardware name: ZOLLNER SUNMOONLAKE/SunMoon Lake, BIOS 00.00. 2024-08-28 18:42:45 08/28/2024
Call trace:
 dump_backtrace+0xdc/0x140
 show_stack+0x20/0x40
 dump_stack_lvl+0x60/0x80
 dump_stack+0x18/0x28
 ubsan_epilogue+0x10/0x48
 __ubsan_handle_shift_out_of_bounds+0xd8/0x1a0
 arm_smmu_init_structures+0x374/0x3c8
 arm_smmu_device_probe+0x208/0x600
 platform_probe+0x70/0xe8
 really_probe+0xc8/0x3a0
 __driver_probe_device+0x84/0x160
 driver_probe_device+0x44/0x130
 __driver_attach+0xcc/0x208
 bus_for_each_dev+0x84/0x100
 driver_attach+0x2c/0x40
 bus_add_driver+0x158/0x290
 driver_register+0x70/0x138
 __platform_driver_register+0x2c/0x40
 arm_smmu_driver_init+0x28/0x40
 do_one_initcall+0x60/0x318
 do_initcalls+0x198/0x1e0
 kernel_init_freeable+0x18c/0x1e8
 kernel_init+0x28/0x160
 ret_from_fork+0x10/0x20

Using 64 bit immediate when doing shift can solve the problem.  The
disassembly after the fix looks like:
    ldr     w20, [x19, 828] //, smmu_7(D)->sid_bits
    mov     x0, 1
    lsl     x0, x0, x20

There are a couple of problematic places, extracted the shift into a helper.

[1] https://lore.kernel.org/lkml/d4b53bbb-333a-45b9-9eb0-23ddd0820a14@arm.com/
Fixes: ce410410f1a7 ("iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx()")
Tested-by: James Morse <james.morse@arm.com>
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +++++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +++++
 2 files changed, 10 insertions(+), 3 deletions(-)

v2: * Extracted the shift into a helper per Jason Gunthorpe.
    * Covered more places per Nicolin Chen and Jason Gunthorpe.
    * Used 1ULL instead of 1UL to guarantee 64 bit per Robin Murphy.
    * Made the subject more general since this is not AmpereOne specific
      problem per the report from James Morse.
    * Collected t-b tag from James Morse.
    * Added Fixes tag in commit log.

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 737c5b882355..4eafd9f04808 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3624,8 +3624,9 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
 {
 	u32 l1size;
 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	unsigned int max_sid = arm_smmu_strtab_max_sid(smmu);
 	unsigned int last_sid_idx =
-		arm_smmu_strtab_l1_idx((1 << smmu->sid_bits) - 1);
+		arm_smmu_strtab_l1_idx(max_sid - 1);
 
 	/* Calculate the L1 size, capped to the SIDSIZE. */
 	cfg->l2.num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES);
@@ -3657,8 +3658,9 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
 {
 	u32 size;
 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
+	unsigned int max_sid = arm_smmu_strtab_max_sid(smmu);
 
-	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
+	size = max_sid * sizeof(struct arm_smmu_ste);
 	cfg->linear.table = dmam_alloc_coherent(smmu->dev, size,
 						&cfg->linear.ste_dma,
 						GFP_KERNEL);
@@ -3668,7 +3670,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
 			size);
 		return -ENOMEM;
 	}
-	cfg->linear.num_ents = 1 << smmu->sid_bits;
+	cfg->linear.num_ents = max_sid;
 
 	arm_smmu_init_initial_stes(cfg->linear.table, cfg->linear.num_ents);
 	return 0;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 1e9952ca989f..f7e8465c629a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -853,6 +853,11 @@ struct arm_smmu_master_domain {
 	ioasid_t ssid;
 };
 
+static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
+{
+	return (1ULL << smmu->sid_bits);
+}
+
 static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
 {
 	return container_of(dom, struct arm_smmu_domain, domain);
-- 
2.41.0
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:

> Using 64 bit immediate when doing shift can solve the problem.  The
> disassembly after the fix looks like:
>     ldr     w20, [x19, 828] //, smmu_7(D)->sid_bits
>     mov     x0, 1
>     lsl     x0, x0, x20
> 
> There are a couple of problematic places, extracted the shift into a helper.
> 
> [1] https://lore.kernel.org/lkml/d4b53bbb-333a-45b9-9eb0-23ddd0820a14@arm.com/
> Fixes: ce410410f1a7 ("iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx()")
> Tested-by: James Morse <james.morse@arm.com>
> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
> ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +++++---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +++++
>  2 files changed, 10 insertions(+), 3 deletions(-)

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Nicolin Chen 1 month, 3 weeks ago
I think both v1 and v2 are missing iommu@lists.linux.dev in CC.

On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
> +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
> +{
> +       return (1ULL << smmu->sid_bits);
> +}
> +

Hmm, why ULL gets truncated to unsigned int here?

Thanks
Nicolin
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/2/24 11:17 AM, Nicolin Chen wrote:
> I think both v1 and v2 are missing iommu@lists.linux.dev in CC.

Will do it for the later revision.

>
> On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
>> +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
>> +{
>> +       return (1ULL << smmu->sid_bits);
>> +}
>> +
> Hmm, why ULL gets truncated to unsigned int here?

No particular reason, but it should be better to not truncate here. Will 
fix it.

>
> Thanks
> Nicolin
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Nicolin Chen 1 month, 3 weeks ago
On Wed, Oct 02, 2024 at 12:04:32PM -0700, Yang Shi wrote:
> > On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
> > > +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
> > > +{
> > > +       return (1ULL << smmu->sid_bits);
> > > +}
> > > +
> > Hmm, why ULL gets truncated to unsigned int here?
> 
> No particular reason, but it should be better to not truncate here. Will
> fix it.

Yea, and looks like we are going to do with:
static inline u64 arm_smmu_strtab_num_sids(struct arm_smmu_device *smmu);

Then let's be careful at those return-value holders too:
-----------------------------------------------------------
static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
{
	u32 size;
	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;

	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
        ^^^^
        overflow?
[...]
	cfg->linear.num_ents = 1 << smmu->sid_bits;
                    ^^^^^^^^
                    This is u32
-----------------------------------------------------------

Thanks
Nicolin
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/2/24 12:22 PM, Nicolin Chen wrote:
> On Wed, Oct 02, 2024 at 12:04:32PM -0700, Yang Shi wrote:
>>> On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
>>>> +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
>>>> +{
>>>> +       return (1ULL << smmu->sid_bits);
>>>> +}
>>>> +
>>> Hmm, why ULL gets truncated to unsigned int here?
>> No particular reason, but it should be better to not truncate here. Will
>> fix it.
> Yea, and looks like we are going to do with:
> static inline u64 arm_smmu_strtab_num_sids(struct arm_smmu_device *smmu);

Since this is an inline function, so the truncate should actually 
happens when it is used. But anyway using the correct type does make the 
code less confusing.

>
> Then let's be careful at those return-value holders too:
> -----------------------------------------------------------
> static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
> {
> 	u32 size;
> 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
>
> 	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
>          ^^^^
>          overflow?
> [...]
> 	cfg->linear.num_ents = 1 << smmu->sid_bits;
>                      ^^^^^^^^
>                      This is u32
> -----------------------------------------------------------
>
> Thanks
> Nicolin
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Wed, Oct 02, 2024 at 12:22:48PM -0700, Nicolin Chen wrote:
> On Wed, Oct 02, 2024 at 12:04:32PM -0700, Yang Shi wrote:
> > > On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
> > > > +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
> > > > +{
> > > > +       return (1ULL << smmu->sid_bits);
> > > > +}
> > > > +
> > > Hmm, why ULL gets truncated to unsigned int here?
> > 
> > No particular reason, but it should be better to not truncate here. Will
> > fix it.
> 
> Yea, and looks like we are going to do with:
> static inline u64 arm_smmu_strtab_num_sids(struct arm_smmu_device *smmu);
> 
> Then let's be careful at those return-value holders too:
> -----------------------------------------------------------
> static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
> {
> 	u32 size;
> 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
> 
> 	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
>         ^^^^
>         overflow?
> [...]
> 	cfg->linear.num_ents = 1 << smmu->sid_bits;
>                     ^^^^^^^^
>                     This is u32
> -----------------------------------------------------------

It would make some sense to have something like:

 u64 size = arm_smmu_strtab_max_sid()

 /* Would require too much memory */
 if (size > SZ_512M)
    return -EINVAL;

Just to reject bad configuration rather than truncate the allocation
and overflow STE array memory or something. Having drivers be robust
to this kind of stuff is a confidential compute topic :\

Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/2/24 12:40 PM, Jason Gunthorpe wrote:
> On Wed, Oct 02, 2024 at 12:22:48PM -0700, Nicolin Chen wrote:
>> On Wed, Oct 02, 2024 at 12:04:32PM -0700, Yang Shi wrote:
>>>> On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
>>>>> +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
>>>>> +{
>>>>> +       return (1ULL << smmu->sid_bits);
>>>>> +}
>>>>> +
>>>> Hmm, why ULL gets truncated to unsigned int here?
>>> No particular reason, but it should be better to not truncate here. Will
>>> fix it.
>> Yea, and looks like we are going to do with:
>> static inline u64 arm_smmu_strtab_num_sids(struct arm_smmu_device *smmu);
>>
>> Then let's be careful at those return-value holders too:
>> -----------------------------------------------------------
>> static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
>> {
>> 	u32 size;
>> 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
>>
>> 	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
>>          ^^^^
>>          overflow?
>> [...]
>> 	cfg->linear.num_ents = 1 << smmu->sid_bits;
>>                      ^^^^^^^^
>>                      This is u32
>> -----------------------------------------------------------
> It would make some sense to have something like:
>
>   u64 size = arm_smmu_strtab_max_sid()
>
>   /* Would require too much memory */
>   if (size > SZ_512M)
>      return -EINVAL;

Why not just check smmu->sid_bits?

For example,

if (smmu->sid_bits > 28)
     return -EINVAL;

The check can happen before the shift.

>
> Just to reject bad configuration rather than truncate the allocation
> and overflow STE array memory or something. Having drivers be robust
> to this kind of stuff is a confidential compute topic :\
>
> Jason

Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Wed, Oct 02, 2024 at 01:05:08PM -0700, Yang Shi wrote:
> > It would make some sense to have something like:
> > 
> >   u64 size = arm_smmu_strtab_max_sid()
> > 
> >   /* Would require too much memory */
> >   if (size > SZ_512M)
> >      return -EINVAL;
> 
> Why not just check smmu->sid_bits?
> 
> For example,
> 
> if (smmu->sid_bits > 28)
>     return -EINVAL;
> 
> The check can happen before the shift.

Sure, but IMHO it reads a bit better to check the size computed from
the helper

MAX_PAGE_ORDER is often 10, so kmalloc will always fail before we
reach 28 bits of sid space.

Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/3/24 4:16 AM, Jason Gunthorpe wrote:
> On Wed, Oct 02, 2024 at 01:05:08PM -0700, Yang Shi wrote:
>>> It would make some sense to have something like:
>>>
>>>    u64 size = arm_smmu_strtab_max_sid()
>>>
>>>    /* Would require too much memory */
>>>    if (size > SZ_512M)
>>>       return -EINVAL;
>> Why not just check smmu->sid_bits?
>>
>> For example,
>>
>> if (smmu->sid_bits > 28)
>>      return -EINVAL;
>>
>> The check can happen before the shift.
> Sure, but IMHO it reads a bit better to check the size computed from
> the helper
>
> MAX_PAGE_ORDER is often 10, so kmalloc will always fail before we
> reach 28 bits of sid space.

I'm wondering we may just use 31 instead of using some magic number:

if (smmu->sid_bits > 31)
     return -EINVAL;

If I understand correctly, the check is mainly used to avoid the u64 -> 
u32 overflow. This check guarantee no overflow. If some crazy hardware 
really requests that large memory, the allocation will fail.

>
> Jason

Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Thu, Oct 03, 2024 at 08:31:23AM -0700, Yang Shi wrote:
> If I understand correctly, the check is mainly used to avoid the u64 -> u32
> overflow. This check guarantee no overflow. If some crazy hardware really
> requests that large memory, the allocation will fail.

Sure, the kalloc will print a warn on anyhow if it is too big

Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/4/24 5:43 AM, Jason Gunthorpe wrote:
> On Thu, Oct 03, 2024 at 08:31:23AM -0700, Yang Shi wrote:
>> If I understand correctly, the check is mainly used to avoid the u64 -> u32
>> overflow. This check guarantee no overflow. If some crazy hardware really
>> requests that large memory, the allocation will fail.
> Sure, the kalloc will print a warn on anyhow if it is too big

Thank you. Will spin a new revision.

> Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Fri, Oct 04, 2024 at 09:05:46AM -0700, Yang Shi wrote:
> 
> 
> On 10/4/24 5:43 AM, Jason Gunthorpe wrote:
> > On Thu, Oct 03, 2024 at 08:31:23AM -0700, Yang Shi wrote:
> > > If I understand correctly, the check is mainly used to avoid the u64 -> u32
> > > overflow. This check guarantee no overflow. If some crazy hardware really
> > > requests that large memory, the allocation will fail.
> > Sure, the kalloc will print a warn on anyhow if it is too big
> 
> Thank you. Will spin a new revision.

Oh wait a sec, it is not so simple, the 31 is too big because the
multiply will overflow or truncate to size_t too. This is why I picked
something lower.

Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/4/24 9:14 AM, Jason Gunthorpe wrote:
> On Fri, Oct 04, 2024 at 09:05:46AM -0700, Yang Shi wrote:
>>
>> On 10/4/24 5:43 AM, Jason Gunthorpe wrote:
>>> On Thu, Oct 03, 2024 at 08:31:23AM -0700, Yang Shi wrote:
>>>> If I understand correctly, the check is mainly used to avoid the u64 -> u32
>>>> overflow. This check guarantee no overflow. If some crazy hardware really
>>>> requests that large memory, the allocation will fail.
>>> Sure, the kalloc will print a warn on anyhow if it is too big
>> Thank you. Will spin a new revision.
> Oh wait a sec, it is not so simple, the 31 is too big because the
> multiply will overflow or truncate to size_t too. This is why I picked
> something lower.

How about define the size as u64?

  static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
  {
-       u32 size;
+       u64 size;
         struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
         u32 num_sids;

It won't overflow and the large allocation will fail anyway.

>
> Jason

Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/4/24 9:29 AM, Yang Shi wrote:
>
>
> On 10/4/24 9:14 AM, Jason Gunthorpe wrote:
>> On Fri, Oct 04, 2024 at 09:05:46AM -0700, Yang Shi wrote:
>>>
>>> On 10/4/24 5:43 AM, Jason Gunthorpe wrote:
>>>> On Thu, Oct 03, 2024 at 08:31:23AM -0700, Yang Shi wrote:
>>>>> If I understand correctly, the check is mainly used to avoid the 
>>>>> u64 -> u32
>>>>> overflow. This check guarantee no overflow. If some crazy hardware 
>>>>> really
>>>>> requests that large memory, the allocation will fail.
>>>> Sure, the kalloc will print a warn on anyhow if it is too big
>>> Thank you. Will spin a new revision.
>> Oh wait a sec, it is not so simple, the 31 is too big because the
>> multiply will overflow or truncate to size_t too. This is why I picked
>> something lower.
>
> How about define the size as u64?
>
>  static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
>  {
> -       u32 size;
> +       u64 size;
>         struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
>         u32 num_sids;
>
> It won't overflow and the large allocation will fail anyway.

The size parameter passed to dmam_alloc_coherent() is size_t type. We 
may do:

+#define SIZE_4G (4 * 1024 * 1024 * 1024ULL)
+
  static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
  {
-       u32 size;
+       u64 size;
         struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
-       u32 num_sids;
+       u64 num_sids;

         /* It is not practical to have too large stream id size for 
linear */
         if (smmu->sid_bits > 31)
@@ -3667,6 +3669,9 @@ static int arm_smmu_init_strtab_linear(struct 
arm_smmu_device *smmu)
         num_sids = arm_smmu_strtab_num_sids(smmu);

         size = num_sids * sizeof(struct arm_smmu_ste);
+       if (size > SIZE_4G)
+               return -EINVAL;
+
         cfg->linear.table = dmam_alloc_coherent(smmu->dev, size,
&cfg->linear.ste_dma,
                                                 GFP_KERNEL);

>
>>
>> Jason
>

Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Fri, Oct 04, 2024 at 09:37:10AM -0700, Yang Shi wrote:
> 
>         size = num_sids * sizeof(struct arm_smmu_ste);
> +       if (size > SIZE_4G)
> +               return -EINVAL;

If you want to do it like that then 'size >= SIZE_MAX' is the right
expression

Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/4/24 9:41 AM, Jason Gunthorpe wrote:
> On Fri, Oct 04, 2024 at 09:37:10AM -0700, Yang Shi wrote:
>>          size = num_sids * sizeof(struct arm_smmu_ste);
>> +       if (size > SIZE_4G)
>> +               return -EINVAL;
> If you want to do it like that then 'size >= SIZE_MAX' is the right
> expression

Thank you. Will take it.

And with the size check we don't need to check smmu->sid_bits.

>
> Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/2/24 12:40 PM, Jason Gunthorpe wrote:
> On Wed, Oct 02, 2024 at 12:22:48PM -0700, Nicolin Chen wrote:
>> On Wed, Oct 02, 2024 at 12:04:32PM -0700, Yang Shi wrote:
>>>> On Wed, Oct 02, 2024 at 10:55:14AM -0700, Yang Shi wrote:
>>>>> +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)
>>>>> +{
>>>>> +       return (1ULL << smmu->sid_bits);
>>>>> +}
>>>>> +
>>>> Hmm, why ULL gets truncated to unsigned int here?
>>> No particular reason, but it should be better to not truncate here. Will
>>> fix it.
>> Yea, and looks like we are going to do with:
>> static inline u64 arm_smmu_strtab_num_sids(struct arm_smmu_device *smmu);
>>
>> Then let's be careful at those return-value holders too:
>> -----------------------------------------------------------
>> static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
>> {
>> 	u32 size;
>> 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
>>
>> 	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
>>          ^^^^
>>          overflow?
>> [...]
>> 	cfg->linear.num_ents = 1 << smmu->sid_bits;
>>                      ^^^^^^^^
>>                      This is u32
>> -----------------------------------------------------------

Instead of relying in separate emails, I'd prefer reply in one single place.

This is linear stream table code, IIUC nobody actually implements such 
large sid size with linear table per the earlier discussions.

> It would make some sense to have something like:
>
>   u64 size = arm_smmu_strtab_max_sid()
>
>   /* Would require too much memory */
>   if (size > SZ_512M)
>      return -EINVAL;

This extra check is definitely fine to me and makes sense. SZ_512M is 1 
<< 29, so it means no hardware actually has 29 bit sid size with linear 
stream table. It makes sense to me. But I'm not smmu expert, so just 
would like to double check such configuration is not existing.

>
> Just to reject bad configuration rather than truncate the allocation
> and overflow STE array memory or something. Having drivers be robust
> to this kind of stuff is a confidential compute topic :\
>
> Jason
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Robin Murphy 1 month, 3 weeks ago
On 2024-10-02 6:55 pm, Yang Shi wrote:
> The commit ce410410f1a7 ("iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx()")
> calculated the last index of L1 stream table by 1 << smmu->sid_bits. 1
> is 32 bit value.
> However some platforms, for example, AmpereOne, have 32-bit stream id size.
> This resulted in ouf-of-bound shift.  The disassembly of shift is:
> 
>      ldr     w2, [x19, 828]  //, smmu_7(D)->sid_bits
>      mov     w20, 1
>      lsl     w20, w20, w2
> 
> According to ARM spec, if the registers are 32 bit, the instruction actually
> does:
>      dest = src << (shift % 32)
> 
> So it actually shifted by zero bit.
> 
> This caused v6.12-rc1 failed to boot on AmpereOne and other platform [1].

FWIW it's going to be seen on any platform with Arm MMU-700 since that 
always advertises 32-bit StreamID support (as other SMMU implementations 
may do too).

> UBSAN also reported:
> 
> UBSAN: shift-out-of-bounds in drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:3628:29
> shift exponent 32 is too large for 32-bit type 'int'

At best, those two lines of actual UBSAN warning are the only part 
relevant to the point, the rest of the backtrace below is definitely 
not, please trim it.

> CPU: 70 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc1 #4
> Hardware name: ZOLLNER SUNMOONLAKE/SunMoon Lake, BIOS 00.00. 2024-08-28 18:42:45 08/28/2024
> Call trace:
>   dump_backtrace+0xdc/0x140
>   show_stack+0x20/0x40
>   dump_stack_lvl+0x60/0x80
>   dump_stack+0x18/0x28
>   ubsan_epilogue+0x10/0x48
>   __ubsan_handle_shift_out_of_bounds+0xd8/0x1a0
>   arm_smmu_init_structures+0x374/0x3c8
>   arm_smmu_device_probe+0x208/0x600
>   platform_probe+0x70/0xe8
>   really_probe+0xc8/0x3a0
>   __driver_probe_device+0x84/0x160
>   driver_probe_device+0x44/0x130
>   __driver_attach+0xcc/0x208
>   bus_for_each_dev+0x84/0x100
>   driver_attach+0x2c/0x40
>   bus_add_driver+0x158/0x290
>   driver_register+0x70/0x138
>   __platform_driver_register+0x2c/0x40
>   arm_smmu_driver_init+0x28/0x40
>   do_one_initcall+0x60/0x318
>   do_initcalls+0x198/0x1e0
>   kernel_init_freeable+0x18c/0x1e8
>   kernel_init+0x28/0x160
>   ret_from_fork+0x10/0x20
> 
> Using 64 bit immediate when doing shift can solve the problem.  The
> disassembly after the fix looks like:
>      ldr     w20, [x19, 828] //, smmu_7(D)->sid_bits
>      mov     x0, 1
>      lsl     x0, x0, x20
> 
> There are a couple of problematic places, extracted the shift into a helper.
> 
> [1] https://lore.kernel.org/lkml/d4b53bbb-333a-45b9-9eb0-23ddd0820a14@arm.com/
> Fixes: ce410410f1a7 ("iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx()")
> Tested-by: James Morse <james.morse@arm.com>
> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
> ---
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +++++---
>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +++++
>   2 files changed, 10 insertions(+), 3 deletions(-)
> 
> v2: * Extracted the shift into a helper per Jason Gunthorpe.
>      * Covered more places per Nicolin Chen and Jason Gunthorpe.
>      * Used 1ULL instead of 1UL to guarantee 64 bit per Robin Murphy.
>      * Made the subject more general since this is not AmpereOne specific
>        problem per the report from James Morse.
>      * Collected t-b tag from James Morse.
>      * Added Fixes tag in commit log.
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> index 737c5b882355..4eafd9f04808 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
> @@ -3624,8 +3624,9 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
>   {
>   	u32 l1size;
>   	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
> +	unsigned int max_sid = arm_smmu_strtab_max_sid(smmu);
>   	unsigned int last_sid_idx =
> -		arm_smmu_strtab_l1_idx((1 << smmu->sid_bits) - 1);
> +		arm_smmu_strtab_l1_idx(max_sid - 1);
>   
>   	/* Calculate the L1 size, capped to the SIDSIZE. */
>   	cfg->l2.num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES);
> @@ -3657,8 +3658,9 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
>   {
>   	u32 size;
>   	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
> +	unsigned int max_sid = arm_smmu_strtab_max_sid(smmu);
>   
> -	size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
> +	size = max_sid * sizeof(struct arm_smmu_ste);
>   	cfg->linear.table = dmam_alloc_coherent(smmu->dev, size,
>   						&cfg->linear.ste_dma,
>   						GFP_KERNEL);
> @@ -3668,7 +3670,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
>   			size);
>   		return -ENOMEM;
>   	}
> -	cfg->linear.num_ents = 1 << smmu->sid_bits;
> +	cfg->linear.num_ents = max_sid;
>   
>   	arm_smmu_init_initial_stes(cfg->linear.table, cfg->linear.num_ents);
>   	return 0;
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> index 1e9952ca989f..f7e8465c629a 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
> @@ -853,6 +853,11 @@ struct arm_smmu_master_domain {
>   	ioasid_t ssid;
>   };
>   
> +static inline unsigned int arm_smmu_strtab_max_sid(struct arm_smmu_device *smmu)

Nit: "max_sid" implies returning the largest supported StreamID value, 
so it would be logical to either include the "- 1" in here and adjust 
the other callers, or instead call this something like "num_sids".

Thanks,
Robin.

> +{
> +	return (1ULL << smmu->sid_bits);
> +}
> +
>   static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom)
>   {
>   	return container_of(dom, struct arm_smmu_domain, domain);
Re: [v2 PATCH] iommu/arm-smmu-v3: Fix L1 stream table index calculation for 32-bit sid size
Posted by Yang Shi 1 month, 3 weeks ago

On 10/2/24 11:21 AM, Robin Murphy wrote:
> On 2024-10-02 6:55 pm, Yang Shi wrote:
>> The commit ce410410f1a7 ("iommu/arm-smmu-v3: Add 
>> arm_smmu_strtab_l1/2_idx()")
>> calculated the last index of L1 stream table by 1 << smmu->sid_bits. 1
>> is 32 bit value.
>> However some platforms, for example, AmpereOne, have 32-bit stream id 
>> size.
>> This resulted in ouf-of-bound shift.  The disassembly of shift is:
>>
>>      ldr     w2, [x19, 828]  //, smmu_7(D)->sid_bits
>>      mov     w20, 1
>>      lsl     w20, w20, w2
>>
>> According to ARM spec, if the registers are 32 bit, the instruction 
>> actually
>> does:
>>      dest = src << (shift % 32)
>>
>> So it actually shifted by zero bit.
>>
>> This caused v6.12-rc1 failed to boot on AmpereOne and other platform 
>> [1].
>
> FWIW it's going to be seen on any platform with Arm MMU-700 since that 
> always advertises 32-bit StreamID support (as other SMMU 
> implementations may do too).

I see. Will add this info to the commit log.

>
>> UBSAN also reported:
>>
>> UBSAN: shift-out-of-bounds in 
>> drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c:3628:29
>> shift exponent 32 is too large for 32-bit type 'int'
>
> At best, those two lines of actual UBSAN warning are the only part 
> relevant to the point, the rest of the backtrace below is definitely 
> not, please trim it.

OK.

>
>> CPU: 70 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc1 #4
>> Hardware name: ZOLLNER SUNMOONLAKE/SunMoon Lake, BIOS 00.00. 
>> 2024-08-28 18:42:45 08/28/2024
>> Call trace:
>>   dump_backtrace+0xdc/0x140
>>   show_stack+0x20/0x40
>>   dump_stack_lvl+0x60/0x80
>>   dump_stack+0x18/0x28
>>   ubsan_epilogue+0x10/0x48
>>   __ubsan_handle_shift_out_of_bounds+0xd8/0x1a0
>>   arm_smmu_init_structures+0x374/0x3c8
>>   arm_smmu_device_probe+0x208/0x600
>>   platform_probe+0x70/0xe8
>>   really_probe+0xc8/0x3a0
>>   __driver_probe_device+0x84/0x160
>>   driver_probe_device+0x44/0x130
>>   __driver_attach+0xcc/0x208
>>   bus_for_each_dev+0x84/0x100
>>   driver_attach+0x2c/0x40
>>   bus_add_driver+0x158/0x290
>>   driver_register+0x70/0x138
>>   __platform_driver_register+0x2c/0x40
>>   arm_smmu_driver_init+0x28/0x40
>>   do_one_initcall+0x60/0x318
>>   do_initcalls+0x198/0x1e0
>>   kernel_init_freeable+0x18c/0x1e8
>>   kernel_init+0x28/0x160
>>   ret_from_fork+0x10/0x20
>>
>> Using 64 bit immediate when doing shift can solve the problem. The
>> disassembly after the fix looks like:
>>      ldr     w20, [x19, 828] //, smmu_7(D)->sid_bits
>>      mov     x0, 1
>>      lsl     x0, x0, x20
>>
>> There are a couple of problematic places, extracted the shift into a 
>> helper.
>>
>> [1] 
>> https://lore.kernel.org/lkml/d4b53bbb-333a-45b9-9eb0-23ddd0820a14@arm.com/
>> Fixes: ce410410f1a7 ("iommu/arm-smmu-v3: Add 
>> arm_smmu_strtab_l1/2_idx()")
>> Tested-by: James Morse <james.morse@arm.com>
>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>> ---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 +++++---
>>   drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +++++
>>   2 files changed, 10 insertions(+), 3 deletions(-)
>>
>> v2: * Extracted the shift into a helper per Jason Gunthorpe.
>>      * Covered more places per Nicolin Chen and Jason Gunthorpe.
>>      * Used 1ULL instead of 1UL to guarantee 64 bit per Robin Murphy.
>>      * Made the subject more general since this is not AmpereOne 
>> specific
>>        problem per the report from James Morse.
>>      * Collected t-b tag from James Morse.
>>      * Added Fixes tag in commit log.
>>
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> index 737c5b882355..4eafd9f04808 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
>> @@ -3624,8 +3624,9 @@ static int arm_smmu_init_strtab_2lvl(struct 
>> arm_smmu_device *smmu)
>>   {
>>       u32 l1size;
>>       struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
>> +    unsigned int max_sid = arm_smmu_strtab_max_sid(smmu);
>>       unsigned int last_sid_idx =
>> -        arm_smmu_strtab_l1_idx((1 << smmu->sid_bits) - 1);
>> +        arm_smmu_strtab_l1_idx(max_sid - 1);
>>         /* Calculate the L1 size, capped to the SIDSIZE. */
>>       cfg->l2.num_l1_ents = min(last_sid_idx + 1, 
>> STRTAB_MAX_L1_ENTRIES);
>> @@ -3657,8 +3658,9 @@ static int arm_smmu_init_strtab_linear(struct 
>> arm_smmu_device *smmu)
>>   {
>>       u32 size;
>>       struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
>> +    unsigned int max_sid = arm_smmu_strtab_max_sid(smmu);
>>   -    size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste);
>> +    size = max_sid * sizeof(struct arm_smmu_ste);
>>       cfg->linear.table = dmam_alloc_coherent(smmu->dev, size,
>>                           &cfg->linear.ste_dma,
>>                           GFP_KERNEL);
>> @@ -3668,7 +3670,7 @@ static int arm_smmu_init_strtab_linear(struct 
>> arm_smmu_device *smmu)
>>               size);
>>           return -ENOMEM;
>>       }
>> -    cfg->linear.num_ents = 1 << smmu->sid_bits;
>> +    cfg->linear.num_ents = max_sid;
>>         arm_smmu_init_initial_stes(cfg->linear.table, 
>> cfg->linear.num_ents);
>>       return 0;
>> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
>> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> index 1e9952ca989f..f7e8465c629a 100644
>> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
>> @@ -853,6 +853,11 @@ struct arm_smmu_master_domain {
>>       ioasid_t ssid;
>>   };
>>   +static inline unsigned int arm_smmu_strtab_max_sid(struct 
>> arm_smmu_device *smmu)
>
> Nit: "max_sid" implies returning the largest supported StreamID value, 
> so it would be logical to either include the "- 1" in here and adjust 
> the other callers, or instead call this something like "num_sids".

Will use "num_sids".

>
> Thanks,
> Robin.
>
>> +{
>> +    return (1ULL << smmu->sid_bits);
>> +}
>> +
>>   static inline struct arm_smmu_domain *to_smmu_domain(struct 
>> iommu_domain *dom)
>>   {
>>       return container_of(dom, struct arm_smmu_domain, domain);