[PATCH] iommu/s390: Implement blocking domain
Posted by Niklas Schnelle 1 year, 4 months ago
This fixes a crash when surprise hot-unplugging a PCI device. The crash
happens because during hot-unplug __iommu_group_set_domain_nofail()
fails to attach the default domain: the platform no longer recognizes
the device, as it has already been removed, and we end up with a NULL
domain pointer and a UAF. This is exactly the case referred to in the
second comment in __iommu_device_set_domain(): just as stated there, if
we can instead attach the blocking domain the UAF is prevented, as that
path can handle the already removed device. Implement the blocking
domain to make use of this handling. This would still leave us with a
warning for the failed attach. As failing to attach when the device is
no longer present is expected behavior, turn this into an explicit
-ENODEV error and don't warn for it. Also change the error return for a
NULL zdev to -EIO, as we don't want to ignore that case, which would be
a serious bug.

Fixes: c76c067e488c ("s390/pci: Use dma-iommu layer")
Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
---
Note: I somewhat suspect this to be related to the following discussion,
or at least we have seen the same backtraces in reports that we suspect
are caused by the issue fixed by this patch. In the case I was able to
reproduce with vfio-pci pass-through to a KVM guest I got a different
trace, though.

Organizational note: I'll be on vacation starting Thursday. Matt will
then take over and send new revisions as necessary.
---
 drivers/iommu/iommu.c      |  7 ++++--
 drivers/iommu/s390-iommu.c | 55 ++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ed6c5cb60c5a..91b3b23bf55c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -119,8 +119,11 @@ static int __iommu_group_set_domain(struct iommu_group *group,
 static void __iommu_group_set_domain_nofail(struct iommu_group *group,
 					    struct iommu_domain *new_domain)
 {
-	WARN_ON(__iommu_group_set_domain_internal(
-		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED));
+	int ret = __iommu_group_set_domain_internal(
+		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED);
+
+	/* Allow attach to fail when the device is gone */
+	WARN_ON(ret && ret != -ENODEV);
 }
 
 static int iommu_setup_default_domain(struct iommu_group *group,
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index d8eaa7ea380b..3d8b5fe9a555 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -33,6 +33,8 @@ struct s390_domain {
 	struct rcu_head		rcu;
 };
 
+static struct s390_domain s390_blocking_domain;
+
 static inline unsigned int calc_rtx(dma_addr_t ptr)
 {
 	return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK;
@@ -376,12 +378,21 @@ static void s390_iommu_detach_device(struct iommu_domain *domain,
 	struct zpci_dev *zdev = to_zpci_dev(dev);
 	unsigned long flags;
 
+	/*
+	 * The static blocking domain doesn't need to track devices nor
+	 * does it have an IOAT registered. As there is no harm in
+	 * keeping zdev->s390_domain set to the blocking domain until
+	 * it is overwritten, detach is a no-op.
+	 */
+	if (s390_domain->domain.type == IOMMU_DOMAIN_BLOCKED)
+		return;
+
 	spin_lock_irqsave(&s390_domain->list_lock, flags);
 	list_del_rcu(&zdev->iommu_list);
 	spin_unlock_irqrestore(&s390_domain->list_lock, flags);
 
 	zpci_unregister_ioat(zdev, 0);
-	zdev->s390_domain = NULL;
+	zdev->s390_domain = &s390_blocking_domain;
 	zdev->dma_table = NULL;
 }
 
@@ -395,7 +406,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
 	int cc;
 
 	if (!zdev)
-		return -ENODEV;
+		return -EIO;
 
 	if (WARN_ON(domain->geometry.aperture_start > zdev->end_dma ||
 		domain->geometry.aperture_end < zdev->start_dma))
@@ -403,16 +414,16 @@ static int s390_iommu_attach_device(struct iommu_domain *domain,
 
 	if (zdev->s390_domain)
 		s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
-
+	/*
+	 * Detach sets the blocking domain. If we fail now, DMA remains
+	 * blocked and the blocking domain stays attached.
+	 */
 	cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
 				virt_to_phys(s390_domain->dma_table), &status);
-	/*
-	 * If the device is undergoing error recovery the reset code
-	 * will re-establish the new domain.
-	 */
-	if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+	if (cc == 3 || status == ZPCI_PCI_ST_FUNC_NOT_AVAIL)
+		return -ENODEV;
+	else if (cc)
 		return -EIO;
-
 	zdev->dma_table = s390_domain->dma_table;
 	zdev->s390_domain = s390_domain;
 
@@ -702,6 +713,30 @@ struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
 	return &zdev->s390_domain->ctrs;
 }
 
+static int blocking_domain_attach_device(struct iommu_domain *domain,
+					 struct device *dev)
+{
+	struct s390_domain *s390_domain = to_s390_domain(domain);
+	struct zpci_dev *zdev = to_zpci_dev(dev);
+	unsigned long flags;
+
+	if (!zdev)
+		return 0;
+
+	/* Detach sets the blocking domain */
+	if (zdev->s390_domain)
+		s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
+	return 0;
+}
+
+static struct s390_domain s390_blocking_domain = {
+	.domain = {
+		.type = IOMMU_DOMAIN_BLOCKED,
+		.ops = &(const struct iommu_domain_ops) {
+			.attach_dev	= blocking_domain_attach_device,
+	}}
+};
+
 int zpci_init_iommu(struct zpci_dev *zdev)
 {
 	u64 aperture_size;
@@ -777,6 +812,8 @@ static int __init s390_iommu_init(void)
 subsys_initcall(s390_iommu_init);
 
 static const struct iommu_ops s390_iommu_ops = {
+	.blocked_domain		= &s390_blocking_domain.domain,
+	.release_domain		= &s390_blocking_domain.domain,
 	.capable = s390_iommu_capable,
 	.domain_alloc_paging = s390_domain_alloc_paging,
 	.probe_device = s390_iommu_probe_device,

---
base-commit: de9c2c66ad8e787abec7c9d7eff4f8c3cdd28aed
change-id: 20240806-blocking_domain-25b6c2fc6a62

Best regards,
-- 
Niklas Schnelle
Re: [PATCH] iommu/s390: Implement blocking domain
Posted by Jason Gunthorpe 1 year, 4 months ago
On Tue, Aug 06, 2024 at 03:45:15PM +0200, Niklas Schnelle wrote:
> This fixes a crash when surprise hot-unplugging a PCI device. The crash
> happens because during hot-unplug __iommu_group_set_domain_nofail()
> fails to attach the default domain: the platform no longer recognizes
> the device, as it has already been removed, and we end up with a NULL
> domain pointer and a UAF.

Huh?

A device can't be removed before telling the iommu subsystem about
it. How did the domain become NULL asynchronously while the iommu side
thought it was still alive?? That seems like the main bug here..

> Note: I somewhat suspect this to be related to the following discussion
> or at least we have seen the same backtraces in reports that we suspect
> to be caused by the issue fixed with this patch.

No, it shouldn't be. That bug is because the netstack is continuing to
use a struct device with the DMA API after the device driver has been
removed. That is just illegal.

> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index ed6c5cb60c5a..91b3b23bf55c 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -119,8 +119,11 @@ static int __iommu_group_set_domain(struct iommu_group *group,
>  static void __iommu_group_set_domain_nofail(struct iommu_group *group,
>  					    struct iommu_domain *new_domain)
>  {
> -	WARN_ON(__iommu_group_set_domain_internal(
> -		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED));
> +	int ret = __iommu_group_set_domain_internal(
> +		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED);
> +
> +	/* Allow attach to fail when the device is gone */
> +	WARN_ON(ret && ret != -ENODEV);
>  }

Like this doesn't really make sense. Until the iommu subsystem removes
the device from the group it really cannot be "gone".

The hypervisor could fail attachment because the hypervisor has
already fenced the device. Sure, that makes sense, but it is not really
that the device is gone from a Linux perspective, it is just now in a
forced-blocked state.

Like Lu says, if we need to add a new flow for devices that are now
force-blocking and cannot be changed then it will need its own error
code.

But what is the backtrace that runs into this warn on? VFIO exiting
and trying to put the device back to the DMA API?

Though I feel like more is needed here if you expect to allow the
nofail version of this to actually fail.. For instance a force-blocked
device should block driver binding through the dma_owner APIs.

> +static int blocking_domain_attach_device(struct iommu_domain *domain,
> +					 struct device *dev)
> +{
> +	struct s390_domain *s390_domain = to_s390_domain(domain);
> +	struct zpci_dev *zdev = to_zpci_dev(dev);
> +	unsigned long flags;
> +
> +	if (!zdev)
> +		return 0;
> +
> +	/* Detach sets the blocking domain */
> +	if (zdev->s390_domain)
> +		s390_iommu_detach_device(&zdev->s390_domain->domain, dev);

When I've done these conversions on other drivers it was the case that
zdev->s390_domain is never NULL. Instead probe_device immediately
starts it as a blocking_domain or fails to probe.

This way we don't ever have the ill defined notion of NULL here.
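
Roughly something like this in probe_device (untested sketch; it elides
the existing aperture checks and the field names are from memory):

static struct iommu_device *s390_iommu_probe_device(struct device *dev)
{
	struct zpci_dev *zdev = to_zpci_dev(dev);

	if (!zdev)
		return ERR_PTR(-ENODEV);

	/*
	 * Start out in the blocking domain so zdev->s390_domain is
	 * never NULL; the core attaches the real domain afterwards.
	 */
	zdev->s390_domain = &s390_blocking_domain;

	return &zdev->iommu_dev;
}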

> @@ -777,6 +812,8 @@ static int __init s390_iommu_init(void)
>  subsys_initcall(s390_iommu_init);
>  
>  static const struct iommu_ops s390_iommu_ops = {
> +	.blocked_domain		= &s390_blocking_domain.domain,
> +	.release_domain		= &s390_blocking_domain.domain,

If you set release_domain then remove s390_iommu_release_device(), it
is the same code.

Jason
Re: [PATCH] iommu/s390: Implement blocking domain
Posted by Matthew Rosato 1 year, 4 months ago
On 8/8/24 9:44 AM, Jason Gunthorpe wrote:
> On Tue, Aug 06, 2024 at 03:45:15PM +0200, Niklas Schnelle wrote:
>> This fixes a crash when surprise hot-unplugging a PCI device. The crash
>> happens because during hot-unplug __iommu_group_set_domain_nofail()
>> fails to attach the default domain: the platform no longer recognizes
>> the device, as it has already been removed, and we end up with a NULL
>> domain pointer and a UAF.
> 
> Huh?
> 
> A device can't be removed before telling the iommu subsystem about
> it. How did the domain become NULL asynchronously while the iommu side
> thought it was still alive?? That seems like the main bug here..
> 

Niklas is away for a while, so I'll take this over -- So, peeling back the onion on this patch, it's actually trying to solve a few things simultaneously.

Since c76c067e488c ("s390/pci: Use dma-iommu layer"), we stopped checking for a NULL zdev->s390_domain during s390_iommu_release_device.  This causes a crash specifically if the last attach_device call failed (we exit on -EIO with a NULL zdev->s390_domain).  I suppose we could also fix the specific crash by restoring the NULL check in s390_iommu_release_device, but that seems like a band-aid and I'm concerned there's more lurking once we get into this state.  So instead, implementing the blocking domain ensures we always fall back to it when attach fails; we can then no longer have a NULL pointer during iommu_release_device and always have, at a bare minimum, the blocked domain attached (once I address your probe_device comment below).
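
For reference, the restored NULL check would look roughly like this
(untested), which is why it feels like a band-aid rather than a fix:

static void s390_iommu_release_device(struct device *dev)
{
	struct zpci_dev *zdev = to_zpci_dev(dev);

	/*
	 * Band-aid: tolerate the NULL zdev->s390_domain left behind by
	 * a failed attach instead of dereferencing it unconditionally.
	 */
	if (zdev && zdev->s390_domain)
		s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
}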


>> Note: I somewhat suspect this to be related to the following discussion
>> or at least we have seen the same backtraces in reports that we suspect
>> to be caused by the issue fixed with this patch.
> 
> No, it shouldn't be. That bug is because the netstack is continuing to
> use a struct device with the DMA API after the device driver has been
removed. That is just illegal.

I wonder if the presence of a blocking domain masks it somehow.  We just happened to notice that we were unable to reproduce a similar backtrace once a blocking domain was implemented;  I'll have to go back to the repro scenario and have a closer look.

>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>> index ed6c5cb60c5a..91b3b23bf55c 100644
>> --- a/drivers/iommu/iommu.c
>> +++ b/drivers/iommu/iommu.c
>> @@ -119,8 +119,11 @@ static int __iommu_group_set_domain(struct iommu_group *group,
>>  static void __iommu_group_set_domain_nofail(struct iommu_group *group,
>>  					    struct iommu_domain *new_domain)
>>  {
>> -	WARN_ON(__iommu_group_set_domain_internal(
>> -		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED));
>> +	int ret = __iommu_group_set_domain_internal(
>> +		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED);
>> +
>> +	/* Allow attach to fail when the device is gone */
>> +	WARN_ON(ret && ret != -ENODEV);
>>  }
> 
> Like this doesn't really make sense. Until the iommu subystem removes
> the device from the group it really cannot be "gone".
 
The device isn't actually gone from the perspective of the kernel so much as known to be in the process of going away / in a state where we know that DMA is no longer possible.

> The hypervisor could fail attachment because the hypervisor has
> already fenced the device. Sure, that make sense, but it is not really
> that the device is gone from a Linux perspective, it is just now in a
> forced-blocked state.

Yes.  So, once we implement the blocking domain, no more crashes and we always fall back to the static blocking domain.  But, if the device were unceremoniously switched off while bound to vfio we would still get WARNed from __iommu_group_set_domain_nofail, because attempts to hand the device back from vfio->dma-iommu (e.g. _iommu_release_dma_ownership) are unable to attach the requested default domain and instead fall back to the blocking domain because we've effectively already fenced the device.  That is specifically what this change to drivers/iommu/iommu.c was attempting to resolve.

How about we fix the crash and implement the s390 blocking domain with this patch, but drop the change to drivers/iommu/iommu.c? That leaves us still hitting the WARN in this case for now, but I would then like to sort that out as a follow-on so that we can eventually handle this DMA-owned + force-blocked scenario cleanly.

> 
> Like Lu says, if we need to add a new flow for devices that are now
> force-blocking and cannot be changed then it will need its own error
> code.
Makes sense.

> 
> But what is the backtrace that runs into this warn on? VFIO exiting
> and trying to put the device back to the DMA API?

Yes, exactly

[  198.067373] ------------[ cut here ]------------
[  198.067377] WARNING: CPU: 44 PID: 394 at drivers/iommu/iommu.c:122 __iommu_release_dma_ownership+0x72/0x80
[  198.067386] Modules linked in: macvtap macvlan tap vfio_pci vfio_pci_core irqbypass kvm sunrpc s390_trng eadm_sch tape_34xx tape tape_class vfio_ccw mdev vfio_iommu_type1 vfio zcrypt_cex4 sch_fq_codel loop configfs nfnetlink lcs ctcm fsm ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 nvme sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common zfcp scsi_transport_fc 8021q garp pkey zcrypt rng_core autofs4
[  198.067424] CPU: 44 UID: 0 PID: 394 Comm: kmcheck Not tainted 6.11.0-rc2 #111
[  198.067427] Hardware name: IBM 3906 M05 780 (LPAR)
[  198.067429] Krnl PSW : 0704c00180000000 000002bbfc744576 (__iommu_release_dma_ownership+0x76/0x80)
[  198.067433]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[  198.067437] Krnl GPRS: 0000000000000290 0000000000000000 fffffffffffffffb 000001b386842098
[  198.067439]            0000000000000000 000001b3865b5c00 000000000000255b 000002bb7c586068
[  198.067442]            000001b3ab82a018 000001b7ff0af6c0 000001b5d0e4af68 000001b5d0e4af00
[  198.067444]            000001b389772300 0000023bffe33e28 000002bbfc744560 0000023bffe339d0
[  198.067452] Krnl Code: 000002bbfc744568: f0a0000407fe	srp	4(11,%r0),2046,0
                          000002bbfc74456e: 47000700		bc	0,1792
                         #000002bbfc744572: af000000		mc	0,0
                         >000002bbfc744576: a7f4fff8		brc	15,000002bbfc744566
                          000002bbfc74457a: 0707		bcr	0,%r7
                          000002bbfc74457c: 0707		bcr	0,%r7
                          000002bbfc74457e: 0707		bcr	0,%r7
                          000002bbfc744580: c0040025f71c	brcl	0,000002bbfcc033b8
[  198.067468] Call Trace:
[  198.067482]  [<000002bbfc744576>] __iommu_release_dma_ownership+0x76/0x80 
[  198.067486] ([<000002bbfc744560>] __iommu_release_dma_ownership+0x60/0x80)
[  198.067488]  [<000002bbfc7445b8>] iommu_group_release_dma_owner+0x38/0x50 
[  198.067491]  [<000002bb7c2a1e24>] vfio_group_detach_container+0x154/0x180 [vfio] 
[  198.067500]  [<000002bb7c2a0d88>] vfio_device_remove_group+0xd8/0x140 [vfio] 
[  198.067505]  [<000002bb7c5648b4>] vfio_pci_core_unregister_device+0x34/0x80 [vfio_pci_core] 
[  198.067513]  [<000002bb7c5841cc>] vfio_pci_remove+0x2c/0x40 [vfio_pci] 
[  198.067517]  [<000002bbfc6ec7bc>] pci_device_remove+0x3c/0x90 
[  198.067520]  [<000002bbfc75dda4>] device_release_driver_internal+0x1b4/0x260 
[  198.067527]  [<000002bbfc6e2844>] pci_stop_bus_device+0x94/0xc0 
[  198.067531]  [<000002bbfc6e2b66>] pci_stop_and_remove_bus_device+0x26/0x40 
[  198.067534]  [<000002bbfc6e2bb0>] pci_stop_and_remove_bus_device_locked+0x30/0x40 
[  198.067537]  [<000002bbfbde2838>] zpci_bus_remove_device+0x68/0xb0 
[  198.067541]  [<000002bbfbde0780>] __zpci_event_availability+0x250/0x3a0 
[  198.067543]  [<000002bbfc7e4528>] chsc_process_crw+0x2a8/0x2c0 
[  198.067548]  [<000002bbfc7ec690>] crw_collect_info+0x2e0/0x360 
[  198.067550]  [<000002bbfbe15bde>] kthread+0x11e/0x130 
[  198.067556]  [<000002bbfbd930ec>] __ret_from_fork+0x3c/0x60 
[  198.067558]  [<000002bbfcb233aa>] ret_from_fork+0xa/0x38 
[  198.067564] Last Breaking-Event-Address:
[  198.067565]  [<000002bbfc744560>] __iommu_release_dma_ownership+0x60/0x80
[  198.067569] ---[ end trace 0000000000000000 ]---



> 
> Though I feel like more is needed here if you expect to allow the
> nofail version of this to actually fail.. For instance a force-blocked
> device should block driver binding through the dma_owner APIs.

Yeah, that makes sense too. 

> 
>> +static int blocking_domain_attach_device(struct iommu_domain *domain,
>> +					 struct device *dev)
>> +{
>> +	struct s390_domain *s390_domain = to_s390_domain(domain);
>> +	struct zpci_dev *zdev = to_zpci_dev(dev);
>> +	unsigned long flags;
>> +
>> +	if (!zdev)
>> +		return 0;
>> +
>> +	/* Detach sets the blocking domain */
>> +	if (zdev->s390_domain)
>> +		s390_iommu_detach_device(&zdev->s390_domain->domain, dev);
> 
> When I've done these conversions on other drivers it was the case that
> zdev->s390_domain is never NULL. Instead probe_device immediately
> starts it as a blocking_domain or fails to probe.

Good point, I'll look at setting in probe.

> 
> This way we don't ever have the ill defined notion of NULL here.
> 
>> @@ -777,6 +812,8 @@ static int __init s390_iommu_init(void)
>>  subsys_initcall(s390_iommu_init);
>>  
>>  static const struct iommu_ops s390_iommu_ops = {
>> +	.blocked_domain		= &s390_blocking_domain.domain,
>> +	.release_domain		= &s390_blocking_domain.domain,
> 
> If you set release_domain then remove s390_iommu_release_device(), it
> is the same code.
> 

Oh I see, via blocking_domain_attach_device.  Agree
Re: [PATCH] iommu/s390: Implement blocking domain
Posted by Jason Gunthorpe 1 year, 4 months ago
On Thu, Aug 08, 2024 at 01:05:36PM -0400, Matthew Rosato wrote:

> Since c76c067e488c ("s390/pci: Use dma-iommu layer"), we stopped
> checking for a NULL zdev->s390_domain during
> s390_iommu_release_device.  This causes a crash specifically if the
> last attach_device call failed (we exit on -EIO with a NULL
> zdev->s390_domain).  I suppose we could also fix the specific crash
> by restoring the NULL check in s390_iommu_release_device, but that
> seems like a band-aid and I'm concerned there's more lurking once
> we get into this state...

Implementing the static blocking domain and using release_domain
should fix all these crashes, I think. The core code will sequence
everything and zdev->s390_domain should never be NULL. You'll get
trapped in a blocking domain and can't exit which will trigger
WARN_ON, but at least for the first step of this problem it should be
enough.
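
From memory, the core release path then does roughly the following, so
the driver never sees a half-torn-down device (paraphrased from
iommu_deinit_device(), not the exact code):

	/* Force the device into the blocking domain before release */
	if (ops->release_domain)
		ops->release_domain->ops->attach_dev(ops->release_domain, dev);

	if (ops->release_device)
		ops->release_device(dev);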

> > But what is the backtrace that runs into this warn on? VFIO exiting
> > and trying to put the device back to the DMA API?
> 
> Yes, exactly
> 
> [  198.067373] ------------[ cut here ]------------
> [  198.067377] WARNING: CPU: 44 PID: 394 at drivers/iommu/iommu.c:122 __iommu_release_dma_ownership+0x72/0x80
> [  198.067386] Modules linked in: macvtap macvlan tap vfio_pci vfio_pci_core irqbypass kvm sunrpc s390_trng eadm_sch tape_34xx tape tape_class vfio_ccw mdev vfio_iommu_type1 vfio zcrypt_cex4 sch_fq_codel loop configfs nfnetlink lcs ctcm fsm ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 nvme sha512_s390 sha256_s390 nvme_core sha1_s390 sha_common zfcp scsi_transport_fc 8021q garp pkey zcrypt rng_core autofs4
> [  198.067424] CPU: 44 UID: 0 PID: 394 Comm: kmcheck Not tainted 6.11.0-rc2 #111
> [  198.067427] Hardware name: IBM 3906 M05 780 (LPAR)
> [  198.067429] Krnl PSW : 0704c00180000000 000002bbfc744576 (__iommu_release_dma_ownership+0x76/0x80)
> [  198.067433]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [  198.067437] Krnl GPRS: 0000000000000290 0000000000000000 fffffffffffffffb 000001b386842098
> [  198.067439]            0000000000000000 000001b3865b5c00 000000000000255b 000002bb7c586068
> [  198.067442]            000001b3ab82a018 000001b7ff0af6c0 000001b5d0e4af68 000001b5d0e4af00
> [  198.067444]            000001b389772300 0000023bffe33e28 000002bbfc744560 0000023bffe339d0
> [  198.067452] Krnl Code: 000002bbfc744568: f0a0000407fe	srp	4(11,%r0),2046,0
>                           000002bbfc74456e: 47000700		bc	0,1792
>                          #000002bbfc744572: af000000		mc	0,0
>                          >000002bbfc744576: a7f4fff8		brc	15,000002bbfc744566
>                           000002bbfc74457a: 0707		bcr	0,%r7
>                           000002bbfc74457c: 0707		bcr	0,%r7
>                           000002bbfc74457e: 0707		bcr	0,%r7
>                           000002bbfc744580: c0040025f71c	brcl	0,000002bbfcc033b8
> [  198.067468] Call Trace:
> [  198.067482]  [<000002bbfc744576>] __iommu_release_dma_ownership+0x76/0x80 
> [  198.067486] ([<000002bbfc744560>] __iommu_release_dma_ownership+0x60/0x80)
> [  198.067488]  [<000002bbfc7445b8>] iommu_group_release_dma_owner+0x38/0x50 
> [  198.067491]  [<000002bb7c2a1e24>] vfio_group_detach_container+0x154/0x180 [vfio] 
> [  198.067500]  [<000002bb7c2a0d88>] vfio_device_remove_group+0xd8/0x140 [vfio] 
> [  198.067505]  [<000002bb7c5648b4>] vfio_pci_core_unregister_device+0x34/0x80 [vfio_pci_core] 
> [  198.067513]  [<000002bb7c5841cc>] vfio_pci_remove+0x2c/0x40 [vfio_pci] 
> [  198.067517]  [<000002bbfc6ec7bc>] pci_device_remove+0x3c/0x90 
> [  198.067520]  [<000002bbfc75dda4>] device_release_driver_internal+0x1b4/0x260 
> [  198.067527]  [<000002bbfc6e2844>] pci_stop_bus_device+0x94/0xc0 
> [  198.067531]  [<000002bbfc6e2b66>] pci_stop_and_remove_bus_device+0x26/0x40 
> [  198.067534]  [<000002bbfc6e2bb0>] pci_stop_and_remove_bus_device_locked+0x30/0x40 
> [  198.067537]  [<000002bbfbde2838>] zpci_bus_remove_device+0x68/0xb0 
> [  198.067541]  [<000002bbfbde0780>] __zpci_event_availability+0x250/0x3a0 
> [  198.067543]  [<000002bbfc7e4528>] chsc_process_crw+0x2a8/0x2c0 
> [  198.067548]  [<000002bbfc7ec690>] crw_collect_info+0x2e0/0x360 
> [  198.067550]  [<000002bbfbe15bde>] kthread+0x11e/0x130 
> [  198.067556]  [<000002bbfbd930ec>] __ret_from_fork+0x3c/0x60 
> [  198.067558]  [<000002bbfcb233aa>] ret_from_fork+0xa/0x38 
> [  198.067564] Last Breaking-Event-Address:
> [  198.067565]  [<000002bbfc744560>] __iommu_release_dma_ownership+0x60/0x80
> [  198.067569] ---[ end trace 0000000000000000 ]---
> 
> > Though I feel like more is needed here if you expect to allow the
> > nofail version of this to actually fail.. For instance a force-blocked
> > device should block driver binding through the dma_owner APIs.
> 
> Yeah, that makes sense too. 

And I think fixing this should be another patch, though I'm not
entirely sure what the best approach will be.

One thought is that, presumably, when the hypervisor fences the device
it asynchronously makes it blocking, which is another way of saying it
has halted all DMA from that device.

So, even if we could attach a domain, it would never be used since
there is no possible DMA.

Thus, why fail the attach? There is no way for anyone to tell the
difference between an attached translation and not attached because
the DMA has been halted at the device.

If you can tell in the s390 driver that the hypervisor has fenced the
device then maybe just reporting success to the upper layer is the
right answer here.
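
i.e. something like this in attach_device (untested sketch):

	cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
				virt_to_phys(s390_domain->dma_table), &status);
	/*
	 * cc 3 / FUNC_NOT_AVAIL means the platform already fenced the
	 * device: all DMA is halted, so the translation we failed to
	 * register can never be exercised anyway. Report success and do
	 * the normal bookkeeping so upper layers see a plain attach.
	 */
	if (cc && cc != 3 && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
		return -EIO;

	zdev->dma_table = s390_domain->dma_table;
	zdev->s390_domain = s390_domain;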

Jason
Re: [PATCH] iommu/s390: Implement blocking domain
Posted by Baolu Lu 1 year, 4 months ago
On 2024/8/6 21:45, Niklas Schnelle wrote:
> This fixes a crash when surprise hot-unplugging a PCI device. The crash
> happens because during hot-unplug __iommu_group_set_domain_nofail()
> fails to attach the default domain: the platform no longer recognizes
> the device, as it has already been removed, and we end up with a NULL
> domain pointer and a UAF. This is exactly the case referred to in the
> second comment in __iommu_device_set_domain(): just as stated there, if
> we can instead attach the blocking domain the UAF is prevented, as that
> path can handle the already removed device. Implement the blocking
> domain to make use of this handling. This would still leave us with a
> warning for the failed attach. As failing to attach when the device is
> no longer present is expected behavior, turn this into an explicit
> -ENODEV error and don't warn for it. Also change the error return for a
> NULL zdev to -EIO, as we don't want to ignore that case, which would be
> a serious bug.
> 
> Fixes: c76c067e488c ("s390/pci: Use dma-iommu layer")
> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> ---
> Note: I somewhat suspect this to be related to the following discussion,
> or at least we have seen the same backtraces in reports that we suspect
> are caused by the issue fixed by this patch. In the case I was able to
> reproduce with vfio-pci pass-through to a KVM guest I got a different
> trace, though.
> 
> Organizational note: I'll be on vacation starting Thursday. Matt will
> then take over and send new revisions as necessary.
> ---
>   drivers/iommu/iommu.c      |  7 ++++--
>   drivers/iommu/s390-iommu.c | 55 ++++++++++++++++++++++++++++++++++++++--------
>   2 files changed, 51 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index ed6c5cb60c5a..91b3b23bf55c 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -119,8 +119,11 @@ static int __iommu_group_set_domain(struct iommu_group *group,
>   static void __iommu_group_set_domain_nofail(struct iommu_group *group,
>   					    struct iommu_domain *new_domain)
>   {
> -	WARN_ON(__iommu_group_set_domain_internal(
> -		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED));
> +	int ret = __iommu_group_set_domain_internal(
> +		group, new_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED);
> +
> +	/* Allow attach to fail when the device is gone */
> +	WARN_ON(ret && ret != -ENODEV);

The return values of attach_dev have been documented in
include/linux/iommu.h:

/**
 * struct iommu_domain_ops - domain specific operations
 * @attach_dev: attach an iommu domain to a device
 *  Return:
 * * 0          - success
 * * EINVAL     - can indicate that device and domain are incompatible due to
 *                some previous configuration of the domain, in which case the
 *                driver shouldn't log an error, since it is legitimate for a
 *                caller to test reuse of existing domains. Otherwise, it may
 *                still represent some other fundamental problem
 * * ENOMEM     - out of memory
 * * ENOSPC     - non-ENOMEM type of resource allocation failures
 * * EBUSY      - device is attached to a domain and cannot be changed
 * * ENODEV     - device specific errors, not able to be attached
 * * <others>   - treated as ENODEV by the caller. Use is discouraged

ENODEV is obviously not suitable for the case of 'device is gone'.

Thanks,
baolu
Re: [PATCH] iommu/s390: Implement blocking domain
Posted by Niklas Schnelle 1 year, 4 months ago
On Tue, 2024-08-06 at 15:45 +0200, Niklas Schnelle wrote:
> This fixes a crash when surprise hot-unplugging a PCI device. The crash
> happens because during hot-unplug __iommu_group_set_domain_nofail()
> fails to attach the default domain: the platform no longer recognizes
> the device, as it has already been removed, and we end up with a NULL
> domain pointer and a UAF. This is exactly the case referred to in the
> second comment in __iommu_device_set_domain(): just as stated there, if
> we can instead attach the blocking domain the UAF is prevented, as that
> path can handle the already removed device. Implement the blocking
> domain to make use of this handling. This would still leave us with a
> warning for the failed attach. As failing to attach when the device is
> no longer present is expected behavior, turn this into an explicit
> -ENODEV error and don't warn for it. Also change the error return for a
> NULL zdev to -EIO, as we don't want to ignore that case, which would be
> a serious bug.
> 
> Fixes: c76c067e488c ("s390/pci: Use dma-iommu layer")
> Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
> ---
> Note: I somewhat suspect this to be related to the following discussion,
> or at least we have seen the same backtraces in reports that we suspect
> are caused by the issue fixed by this patch. In the case I was able to
> reproduce with vfio-pci pass-through to a KVM guest I got a different
> trace, though.

Forgot the link:
https://lore.kernel.org/all/8743264a-9700-4227-a556-5f931c720211@huawei.com/

> 
> Organizational note: I'll be on vacation starting Thursday. Matt will
> then take over and send new revisions as necessary.
> ---
>  drivers/iommu/iommu.c      |  7 ++++---
---8<---