Don’t allow access to the command queue from the host:
- ARM_SMMU_CMDQ_BASE: Only writable while the CMDQ is disabled; we use it
  to keep track of the host command queue base.
  Reads return the saved value.
- ARM_SMMU_CMDQ_PROD: Writes trigger command queue emulation, which sanitises
  and filters the whole range. Reads return the host copy.
- ARM_SMMU_CMDQ_CONS: Writes move the SW copy of the cons, but the host can’t
  skip commands once submitted. Reads return the emulated value plus the error
  bits from the actual cons.
Signed-off-by: Mostafa Saleh <smostafa@google.com>
---
.../iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c | 108 +++++++++++++++++-
1 file changed, 105 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c
index 554229e466f3..10c6461bbf12 100644
--- a/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/pkvm/arm-smmu-v3.c
@@ -325,6 +325,88 @@ static bool is_cmdq_enabled(struct hyp_arm_smmu_v3_device *smmu)
 	return FIELD_GET(CR0_CMDQEN, smmu->cr0);
 }
 
+static bool smmu_filter_command(struct hyp_arm_smmu_v3_device *smmu, u64 *command)
+{
+	u64 type = FIELD_GET(CMDQ_0_OP, command[0]);
+
+	switch (type) {
+	case CMDQ_OP_CFGI_STE:
+		/* TBD: SHADOW_STE */
+		break;
+	case CMDQ_OP_CFGI_ALL:
+	{
+		/*
+		 * Linux doesn't use range STE invalidation, and only uses this
+		 * for CFGI_ALL, which is done on reset and not when a new STE
+		 * is used.
+		 * Although this is not architectural, we rely on the current
+		 * Linux implementation.
+		 */
+		WARN_ON(FIELD_GET(CMDQ_CFGI_1_RANGE, command[1]) != 31);
+		break;
+	}
+	case CMDQ_OP_TLBI_NH_ASID:
+	case CMDQ_OP_TLBI_NH_VA:
+	case 0x13: /* CMD_TLBI_NH_VAA: Not used by Linux */
+	{
+		/* Only allow VMID = 0 */
+		if (FIELD_GET(CMDQ_TLBI_0_VMID, command[0]) == 0)
+			break;
+		return WARN_ON(true);
+	}
+	case 0x10: /* CMD_TLBI_NH_ALL: Not used by Linux */
+	case CMDQ_OP_TLBI_EL2_ALL:
+	case CMDQ_OP_TLBI_EL2_VA:
+	case CMDQ_OP_TLBI_EL2_ASID:
+	case CMDQ_OP_TLBI_S12_VMALL:
+	case 0x23: /* CMD_TLBI_EL2_VAA: Not used by Linux */
+		/* Malicious host */
+		return WARN_ON(true);
+	case CMDQ_OP_CMD_SYNC:
+		if (FIELD_GET(CMDQ_SYNC_0_CS, command[0]) == CMDQ_SYNC_0_CS_IRQ) {
+			/* Allow it, but let the host time out, as this should never happen. */
+			command[0] &= ~CMDQ_SYNC_0_CS;
+			command[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_SEV);
+			command[1] &= ~CMDQ_SYNC_1_MSIADDR_MASK;
+		}
+		break;
+	}
+
+	return false;
+}
+
+static void smmu_emulate_cmdq_insert(struct hyp_arm_smmu_v3_device *smmu)
+{
+	u64 *host_cmdq = hyp_phys_to_virt(smmu->cmdq_host.q_base & Q_BASE_ADDR_MASK);
+	int idx;
+	u64 cmd[CMDQ_ENT_DWORDS];
+	bool skip;
+
+	if (!is_cmdq_enabled(smmu))
+		return;
+
+	while (!queue_empty(&smmu->cmdq_host.llq)) {
+		/* Wait for the command queue to have some space. */
+		WARN_ON(smmu_wait_event(smmu, !smmu_cmdq_full(&smmu->cmdq)));
+
+		idx = Q_IDX(&smmu->cmdq_host.llq, smmu->cmdq_host.llq.cons);
+		/* Avoid TOCTOU */
+		memcpy(cmd, &host_cmdq[idx * CMDQ_ENT_DWORDS], CMDQ_ENT_DWORDS << 3);
+		skip = smmu_filter_command(smmu, cmd);
+		if (!skip)
+			smmu_add_cmd_raw(smmu, cmd);
+		queue_inc_cons(&smmu->cmdq_host.llq);
+	}
+
+	/*
+	 * Wait until the commands are consumed. This could be improved by
+	 * returning to the host while flagging the current offset in the
+	 * command queue shared with the host; that would be updated when the
+	 * hyp inserts more commands or the host issues another read of cons.
+	 */
+	WARN_ON(smmu_wait_event(smmu, smmu_cmdq_empty(&smmu->cmdq)));
+}
+
 static void smmu_emulate_cmdq_enable(struct hyp_arm_smmu_v3_device *smmu)
 {
 	size_t cmdq_size;
@@ -360,17 +442,37 @@ static bool smmu_dabt_device(struct hyp_arm_smmu_v3_device *smmu,
 		mask = read_only & ~(IDR0_S2P | IDR0_VMID16 | IDR0_MSI | IDR0_HYP);
 		WARN_ON(len != sizeof(u32));
 		break;
-	/* Passthrough the register access for bisectiblity, handled later */
 	case ARM_SMMU_CMDQ_BASE:
 		/* Not allowed by the architecture */
 		WARN_ON(is_cmdq_enabled(smmu));
 		if (is_write)
 			smmu->cmdq_host.q_base = val;
-		mask = read_write;
-		break;
+		else
+			regs->regs[rd] = smmu->cmdq_host.q_base;
+		goto out_ret;
 	case ARM_SMMU_CMDQ_PROD:
+		if (is_write) {
+			smmu->cmdq_host.llq.prod = val;
+			smmu_emulate_cmdq_insert(smmu);
+		} else {
+			regs->regs[rd] = smmu->cmdq_host.llq.prod;
+		}
+		goto out_ret;
 	case ARM_SMMU_CMDQ_CONS:
+		if (is_write) {
+			/* Not allowed by the architecture */
+			WARN_ON(is_cmdq_enabled(smmu));
+			smmu->cmdq_host.llq.cons = val;
+		} else {
+			/* Propagate errors back to the host. */
+			u32 cons = readl_relaxed(smmu->base + ARM_SMMU_CMDQ_CONS);
+			u32 err = CMDQ_CONS_ERR & cons;
+
+			regs->regs[rd] = smmu->cmdq_host.llq.cons | err;
+		}
+		goto out_ret;
+	/* Passthrough the register access for bisectability, handled later */
 	case ARM_SMMU_STRTAB_BASE:
 	case ARM_SMMU_STRTAB_BASE_CFG:
 	case ARM_SMMU_GBPA:
--
2.51.0.rc1.167.g924127e9c0-goog
On Tue, Aug 19, 2025 at 09:51:50PM +0000, Mostafa Saleh wrote:
> Don’t allow access to the command queue from the host:

[...]

> +static void smmu_emulate_cmdq_insert(struct hyp_arm_smmu_v3_device *smmu)
> +{
> +	u64 *host_cmdq = hyp_phys_to_virt(smmu->cmdq_host.q_base & Q_BASE_ADDR_MASK);
> +	int idx;
> +	u64 cmd[CMDQ_ENT_DWORDS];
> +	bool skip;
> +
> +	if (!is_cmdq_enabled(smmu))
> +		return;
> +
> +	while (!queue_empty(&smmu->cmdq_host.llq)) {
> +		/* Wait for the command queue to have some space. */
> +		WARN_ON(smmu_wait_event(smmu, !smmu_cmdq_full(&smmu->cmdq)));
> +
> +		idx = Q_IDX(&smmu->cmdq_host.llq, smmu->cmdq_host.llq.cons);
> +		/* Avoid TOCTOU */
> +		memcpy(cmd, &host_cmdq[idx * CMDQ_ENT_DWORDS], CMDQ_ENT_DWORDS << 3);
> +		skip = smmu_filter_command(smmu, cmd);
> +		if (!skip)
> +			smmu_add_cmd_raw(smmu, cmd);
> +		queue_inc_cons(&smmu->cmdq_host.llq);
> +	}

Hmmm. There's something I'd not considered before here.

Ideally, the data structures that are shadowed by the hypervisor would be mapped as normal-WB cacheable in both the host and the hypervisor so we don't have to worry about coherency and we get the performance benefits from the caches. Indeed, I think that's how you've mapped 'host_cmdq' above, _however_ I sadly don't think we can do that if the actual SMMU hardware isn't coherent.

We don't have a way to say things like "The STEs and CMDQ are coherent but the CDs and Stage-1 page-tables aren't", so that means we have to treat the shadowed structures populated by the host in the same way as the host-owned structures that are consumed directly by the hardware. Consequently, we should either be using non-cacheable mappings at EL2 for these structures or doing CMOs around the accesses.

Will
On Fri, Sep 12, 2025 at 03:18:08PM +0100, Will Deacon wrote:
> On Tue, Aug 19, 2025 at 09:51:50PM +0000, Mostafa Saleh wrote:

[...]

> We don't have a way to say things like "The STEs and CMDQ are coherent but the CDs and Stage-1 page-tables aren't", so that means we have to treat the shadowed structures populated by the host in the same way as the host-owned structures that are consumed directly by the hardware. Consequently, we should either be using non-cacheable mappings at EL2 for these structures or doing CMOs around the accesses.

Thanks for catching that, I missed it. I think we can keep the host structures shared as cacheable and use CMOs when accessing them; I will have a closer look.

Thanks,
Mostafa
On Fri, Sep 12, 2025 at 03:18:08PM +0100, Will Deacon wrote:
> Ideally, the data structures that are shadowed by the hypervisor would be mapped as normal-WB cacheable in both the host and the hypervisor so we don't have to worry about coherency and we get the performance benefits from the caches. Indeed, I think that's how you've mapped 'host_cmdq' above, _however_ I sadly don't think we can do that if the actual SMMU hardware isn't coherent.

That seems like the right conclusion to me: pkvm should not be mapping as cachable unless it knows the IORT/IDR is marked as coherent.

This is actually something I want to fix in the SMMU driver; it should always be allocating cachable memory and using dma_sync_single_for_device() instead of non-cachable DMA coherent allocations. (Or perhaps better is to use iommu_pages_flush_incoherent().)

I'm hearing about an interesting use case where we'd want to tell the SMMU to walk STEs non-cachable even if the HW is capable of doing cachable. Apparently in some SOCs it gives better isochronous properties for realtime DMA.

IMHO, for this series at this point pkvm should just require a coherent SMMU until the above revisions happen.

Jason
On Mon, Sep 15, 2025 at 01:38:58PM -0300, Jason Gunthorpe wrote:
> I'm hearing about an interesting use case where we'd want to tell the SMMU to walk STEs non-cachable even if the HW is capable of doing cachable. Apparently in some SOCs it gives better isochronous properties for realtime DMA.

Interesting, I guess that would be more noticeable for the page table walks rather than the STEs, as Linux doesn't invalidate STEs that much.

> IMHO, for this series at this point pkvm should just require a coherent SMMU until the above revisions happen.

I think the fix for the problem Will mentioned is to just use CMOs before accessing the host structures, so that should be simple. If it turns out to be more complicated, I am happy to drop support for non-coherent devices from this series and we can add it later.

Thanks,
Mostafa
On Tue, Sep 16, 2025 at 03:19:02PM +0000, Mostafa Saleh wrote:
> I think the fix for the problem Will mentioned is to just use CMOs before accessing the host structures, so that should be simple. If it turns out to be more complicated, I am happy to drop support for non-coherent devices from this series and we can add it later.

I feel like it is easier/better to fix the driver to use cachable memory than to add CMOs to the pkvm side. This way it will help qemu/etc as well.

Jason
On Wed, Sep 17, 2025 at 09:36:01AM -0300, Jason Gunthorpe wrote:
> I feel like it is easier/better to fix the driver to use cachable memory than to add CMOs to the pkvm side.

Hmm, but for non-coherent SMMU hardware (which sadly exists in production), I don't think there's a way for firmware to tell the driver that it needs to issue CMOs for the page-tables and the CDs but not the other in-memory data structures (e.g. STEs). I suppose we could do it in some pKVM-specific way, but then that's not really helping anybody else.

Will
On Wed, Sep 17, 2025 at 04:01:34PM +0100, Will Deacon wrote: > On Wed, Sep 17, 2025 at 09:36:01AM -0300, Jason Gunthorpe wrote: > > On Tue, Sep 16, 2025 at 03:19:02PM +0000, Mostafa Saleh wrote: > > > > > I think the fix for the problem Will mentioned is to just use CMOs > > > before accessing the host structures, so that should be simple. > > > If it turns to be more complicated, I am happy to drop the support > > > for non-coherent devices from this series and we can add it later. > > > > I feel like it is easier/better to fix the driver to use cachable > > memory than to add CMOs to the pkvm side.. > > Hmm, but for non-coherent SMMU hardware (which sadly exists in > production), I don't think there's a way for firmware to tell the driver > that it needs to issue CMOs for the page-tables and the CDs but not the > other in-memory data structures (e.g. STEs). I suppose we could do it in > some pKVM-specific way, but then that's not really helping anybody else. Not sure I understand? I mean to issue CMOs in the smmu driver consistently for everthing, page table, CD entry, STE, etc. Today it only does it for page table. Make the driver consistently use cachable memory for everything instead of having two different ways to deal with incoherent HW. Jason
On Wed, Sep 17, 2025 at 12:16:12PM -0300, Jason Gunthorpe wrote:
> Make the driver consistently use cachable memory for everything instead of having two different ways to deal with incoherent HW.

Ah right, so the driver would unnecessarily issue CMOs for the structures that are just shared with the hypervisor. At least it's _functional_ that way, but I'm sure people will complain!

Will
On Wed, Sep 17, 2025 at 04:25:35PM +0100, Will Deacon wrote:
> Ah right, so the driver would unnecessarily issue CMOs for the structures that are just shared with the hypervisor. At least it's _functional_ that way, but I'm sure people will complain!

Yes, functional. Why would anyone complain? STE and CD manipulation is not a fast path for anything.

Jason
On Wed, Sep 17, 2025 at 12:59:31PM -0300, Jason Gunthorpe wrote:
> Yes, functional. Why would anyone complain? STE and CD manipulation is not a fast path for anything.

Won't it also apply to cmdq insertion?

Will
On Thu, Sep 18, 2025 at 11:26:50AM +0100, Will Deacon wrote:
> Won't it also apply to cmdq insertion?

Oh, changing the CMDQ wasn't on my mind. Yeah, OK, I don't know what the performance delta would be like there.

However, to get peak performance out of pkvm we really do want the SMMU driver to write the CMDQ as cachable, pkvm to read it as cachable, and then copy it to a non-cachable HW queue. Otherwise pkvm will be issuing CMOs on fast paths :\

If we convert the slow-speed stuff (STE, CD, fault) to do CMOs, then we could make a fairly small change for pkvm mode to force the guest CMDQ to be cachable without CMOs: some special feature triggered by pkvm detection during probe.

Jason
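[A rough sketch of the probe-time switch Jason is describing. The helper itself is hypothetical and not from any posted patch; is_protected_kvm_enabled() and ARM_SMMU_FEAT_COHERENCY are existing kernel symbols.]

/*
 * Hypothetical probe-time check sketching the idea above: under pKVM
 * the host cmdq is consumed by the hypervisor (CPU-to-CPU, always
 * coherent), so it can stay cacheable with no CMOs even when the SMMU
 * itself is not IO-coherent.
 */
static bool smmu_cmdq_needs_cmo(struct arm_smmu_device *smmu)
{
	/* The hypervisor, not the HW, reads the shadow queue. */
	if (is_protected_kvm_enabled())
		return false;

	/* The HW reads the queue directly: CMOs needed if non-coherent. */
	return !(smmu->features & ARM_SMMU_FEAT_COHERENCY);
}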