From: "Guo Ren (Alibaba DAMO Academy)" <guoren@kernel.org>
The current implementation is wrong when iohgatp != Bare. The RISC-V
IOMMU specification defines that the PDT is based on GPA, not SPA, so
this patch fixes the problem, making the PDT walk work correctly when
the G-stage table walk is enabled.
Fixes: 0c54acb8243d ("hw/riscv: add RISC-V IOMMU base emulation")
Cc: qemu-stable@nongnu.org
Cc: Sebastien Boeuf <seb@rivosinc.com>
Cc: Tomasz Jeznach <tjeznach@rivosinc.com>
Signed-off-by: Guo Ren (Alibaba DAMO Academy) <guoren@kernel.org>
---
Changes in V2:
- Remove nested param to make patch clearer.
hw/riscv/riscv-iommu.c | 141 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 139 insertions(+), 2 deletions(-)
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
index 96a7fbdefcf3..ded3f7b2fdce 100644
--- a/hw/riscv/riscv-iommu.c
+++ b/hw/riscv/riscv-iommu.c
@@ -866,6 +866,143 @@ static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
     return true;
 }
 
+/**
+ * pdt_memory_read: PDT wrapper of dma_memory_read.
+ *
+ * @s: IOMMU Device State
+ * @ctx: Device Translation Context with devid and pasid set
+ * @addr: address within that address space
+ * @buf: buffer with the data transferred
+ * @len: length of the data transferred
+ * @attrs: memory transaction attributes
+ */
+static MemTxResult pdt_memory_read(RISCVIOMMUState *s,
+                                   RISCVIOMMUContext *ctx,
+                                   dma_addr_t addr,
+                                   void *buf, dma_addr_t len,
+                                   MemTxAttrs attrs)
+{
+    uint64_t gatp_mode, pte;
+    struct {
+        unsigned char step;
+        unsigned char levels;
+        unsigned char ptidxbits;
+        unsigned char ptesize;
+    } sc;
+    MemTxResult ret;
+    dma_addr_t base = addr;
+
+    /* G stages translation mode */
+    gatp_mode = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+    if (gatp_mode == RISCV_IOMMU_DC_IOHGATP_MODE_BARE)
+        goto out;
+
+    /* G stages translation tables root pointer */
+    base = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
+
+    /* Start at step 0 */
+    sc.step = 0;
+
+    if (s->fctl & RISCV_IOMMU_FCTL_GXL) {
+        /* 32bit mode for GXL == 1 */
+        switch (gatp_mode) {
+        case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV32X4)) {
+                return MEMTX_ACCESS_ERROR;
+            }
+            sc.levels = 2;
+            sc.ptidxbits = 10;
+            sc.ptesize = 4;
+            break;
+        default:
+            return MEMTX_ACCESS_ERROR;
+        }
+    } else {
+        /* 64bit mode for GXL == 0 */
+        switch (gatp_mode) {
+        case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV39X4)) {
+                return MEMTX_ACCESS_ERROR;
+            }
+            sc.levels = 3;
+            sc.ptidxbits = 9;
+            sc.ptesize = 8;
+            break;
+        case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV48X4)) {
+                return MEMTX_ACCESS_ERROR;
+            }
+            sc.levels = 4;
+            sc.ptidxbits = 9;
+            sc.ptesize = 8;
+            break;
+        case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV57X4)) {
+                return MEMTX_ACCESS_ERROR;
+            }
+            sc.levels = 5;
+            sc.ptidxbits = 9;
+            sc.ptesize = 8;
+            break;
+        default:
+            return MEMTX_ACCESS_ERROR;
+        }
+    }
+
+    do {
+        const unsigned va_bits = (sc.step ? 0 : 2) + sc.ptidxbits;
+        const unsigned va_skip = TARGET_PAGE_BITS + sc.ptidxbits *
+                                 (sc.levels - 1 - sc.step);
+        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
+        const dma_addr_t pte_addr = base + idx * sc.ptesize;
+
+        /* Address range check before first level lookup */
+        if (!sc.step) {
+            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
+            if ((addr & va_mask) != addr) {
+                return MEMTX_ACCESS_ERROR;
+            }
+        }
+
+        /* Read page table entry */
+        if (sc.ptesize == 4) {
+            uint32_t pte32 = 0;
+            ret = ldl_le_dma(s->target_as, pte_addr, &pte32, attrs);
+            pte = pte32;
+        } else {
+            ret = ldq_le_dma(s->target_as, pte_addr, &pte, attrs);
+        }
+        if (ret != MEMTX_OK)
+            return ret;
+
+        sc.step++;
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        if (!(pte & PTE_V)) {
+            return MEMTX_ACCESS_ERROR; /* Invalid PTE */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W + PTE_X */
+        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
+            return MEMTX_ACCESS_ERROR; /* Misaligned PPN */
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
+        } else {
+            /* Leaf PTE, translation completed. */
+            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
+            break;
+        }
+
+        if (sc.step == sc.levels) {
+            return MEMTX_ACCESS_ERROR; /* Can't find leaf PTE */
+        }
+    } while (1);
+
+out:
+    return dma_memory_read(s->target_as, base, buf, len, attrs);
+}
+
 /*
  * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
  *
@@ -1038,7 +1175,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
          */
         const int split = depth * 9 + 8;
         addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
-        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+        if (pdt_memory_read(s, ctx, addr, &de, sizeof(de),
                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
             return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
         }
@@ -1053,7 +1190,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     /* Leaf entry in PDT */
     addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
-    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
+    if (pdt_memory_read(s, ctx, addr, &dc.ta, sizeof(uint64_t) * 2,
                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
         return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
     }
--
2.40.1
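
To make the walk above concrete, here is a small standalone sketch of
the index arithmetic from the do/while loop, for the Sv39x4 case. It
assumes TARGET_PAGE_BITS is 12 (4 KiB base pages) and is an
illustration only, not QEMU code:

#include <stdio.h>
#include <stdint.h>

#define TARGET_PAGE_BITS 12   /* assumption: 4 KiB base pages */

int main(void)
{
    const unsigned levels = 3, ptidxbits = 9;  /* Sv39x4 geometry */
    const uint64_t addr = 0x1234567000ULL;     /* sample PDT GPA */

    for (unsigned step = 0; step < levels; step++) {
        /* The root step indexes 2 extra bits: the "x4" widening. */
        const unsigned va_bits = (step ? 0 : 2) + ptidxbits;
        const unsigned va_skip = TARGET_PAGE_BITS +
                                 ptidxbits * (levels - 1 - step);
        const unsigned idx = (addr >> va_skip) & ((1u << va_bits) - 1);

        printf("step %u: shift %u, index 0x%x\n", step, va_skip, idx);
    }
    return 0;
}

At step 0 the index is 11 bits wide rather than 9 because the root
page table of an SvNNx4 mode is four times larger (16 KiB); this is
also why the first-level range check masks addr with
(1ULL << (va_skip + va_bits)) - 1.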
<guoren@kernel.org> wrote on Tue, Aug 5, 2025 at 01:12:
> From: "Guo Ren (Alibaba DAMO Academy)" <guoren@kernel.org>
>
> Current implementation is wrong when iohgatp != bare. The RISC-V
> IOMMU specification has defined that the PDT is based on GPA, not
> SPA. So this patch fixes the problem, making PDT walk correctly
> when the G-stage table walk is enabled.
>
> Fixes: 0c54acb8243d ("hw/riscv: add RISC-V IOMMU base emulation")
> Cc: qemu-stable@nongnu.org
> Cc: Sebastien Boeuf <seb@rivosinc.com>
> Cc: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Guo Ren (Alibaba DAMO Academy) <guoren@kernel.org>
> ---
> Changes in V2:
> - Remove nested param to make patch clearer.
>
> hw/riscv/riscv-iommu.c | 141 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 139 insertions(+), 2 deletions(-)
>
LGTM.

"3.3.2. Process to locate the Process-context

The device-context provides the PDT root page PPN (pdtp.ppn). When
DC.iohgatp.mode is not Bare, pdtp.PPN as well as pdte.PPN are Guest
Physical Addresses (GPA) which must be translated into Supervisor
Physical Addresses (SPA) using the second-stage page table pointed to
by DC.iohgatp. The memory accesses to the PDT are treated as implicit
read memory accesses by the second-stage."
Reviewed-by: Weiwei Li <liwei1518@gmail.com>
Regards,
Weiwei Li
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> index 96a7fbdefcf3..ded3f7b2fdce 100644
> --- a/hw/riscv/riscv-iommu.c
> +++ b/hw/riscv/riscv-iommu.c
> @@ -866,6 +866,143 @@ static bool
> riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> return true;
> }
>
> +/**
> + * pdt_memory_read: PDT wrapper of dma_memory_read.
> + *
> + * @s: IOMMU Device State
> + * @ctx: Device Translation Context with devid and pasid set
> + * @addr: address within that address space
> + * @buf: buffer with the data transferred
> + * @len: length of the data transferred
> + * @attrs: memory transaction attributes
> + */
> +static MemTxResult pdt_memory_read(RISCVIOMMUState *s,
> + RISCVIOMMUContext *ctx,
> + dma_addr_t addr,
> + void *buf, dma_addr_t len,
> + MemTxAttrs attrs)
> +{
> + uint64_t gatp_mode, pte;
> + struct {
> + unsigned char step;
> + unsigned char levels;
> + unsigned char ptidxbits;
> + unsigned char ptesize;
> + } sc;
> + MemTxResult ret;
> + dma_addr_t base = addr;
> +
> + /* G stages translation mode */
> + gatp_mode = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> + if (gatp_mode == RISCV_IOMMU_DC_IOHGATP_MODE_BARE)
> + goto out;
> +
> + /* G stages translation tables root pointer */
> + base = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +
> + /* Start at step 0 */
> + sc.step = 0;
> +
> + if (s->fctl & RISCV_IOMMU_FCTL_GXL) {
> + /* 32bit mode for GXL == 1 */
> + switch (gatp_mode) {
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV32X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 2;
> + sc.ptidxbits = 10;
> + sc.ptesize = 4;
> + break;
> + default:
> + return MEMTX_ACCESS_ERROR;
> + }
> + } else {
> + /* 64bit mode for GXL == 0 */
> + switch (gatp_mode) {
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV39X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 3;
> + sc.ptidxbits = 9;
> + sc.ptesize = 8;
> + break;
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV48X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 4;
> + sc.ptidxbits = 9;
> + sc.ptesize = 8;
> + break;
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV57X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 5;
> + sc.ptidxbits = 9;
> + sc.ptesize = 8;
> + break;
> + default:
> + return MEMTX_ACCESS_ERROR;
> + }
> + }
> +
> + do {
> + const unsigned va_bits = (sc.step ? 0 : 2) + sc.ptidxbits;
> + const unsigned va_skip = TARGET_PAGE_BITS + sc.ptidxbits *
> + (sc.levels - 1 - sc.step);
> + const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> + const dma_addr_t pte_addr = base + idx * sc.ptesize;
> +
> + /* Address range check before first level lookup */
> + if (!sc.step) {
> + const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> + if ((addr & va_mask) != addr) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + }
> +
> + /* Read page table entry */
> + if (sc.ptesize == 4) {
> + uint32_t pte32 = 0;
> + ret = ldl_le_dma(s->target_as, pte_addr, &pte32, attrs);
> + pte = pte32;
> + } else {
> + ret = ldq_le_dma(s->target_as, pte_addr, &pte, attrs);
> + }
> + if (ret != MEMTX_OK)
> + return ret;
> +
> + sc.step++;
> + hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> + if (!(pte & PTE_V)) {
> + return MEMTX_ACCESS_ERROR; /* Invalid PTE */
> + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W
> */
> + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W
> + PTE_X */
> + } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> + return MEMTX_ACCESS_ERROR; /* Misaligned PPN */
> + } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> + base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> + } else {
> + /* Leaf PTE, translation completed. */
> + base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> + break;
> + }
> +
> + if (sc.step == sc.levels) {
> + return MEMTX_ACCESS_ERROR; /* Can't find leaf PTE */
> + }
> + } while (1);
> +
> +out:
> + return dma_memory_read(s->target_as, base, buf, len, attrs);
> +}
> +
> /*
> * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> *
> @@ -1038,7 +1175,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s,
> RISCVIOMMUContext *ctx)
> */
> const int split = depth * 9 + 8;
> addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> - if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> + if (pdt_memory_read(s, ctx, addr, &de, sizeof(de),
> MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> }
> @@ -1053,7 +1190,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s,
> RISCVIOMMUContext *ctx)
>
> /* Leaf entry in PDT */
> addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> - if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> + if (pdt_memory_read(s, ctx, addr, &dc.ta, sizeof(uint64_t) * 2,
> MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> }
> --
> 2.40.1
>
>
On Wed, Aug 6, 2025 at 10:19 PM wei li <liwei1518@gmail.com> wrote:
>
>
>
> <guoren@kernel.org> wrote on Tue, Aug 5, 2025 at 01:12:
>>
>> From: "Guo Ren (Alibaba DAMO Academy)" <guoren@kernel.org>
>>
>> Current implementation is wrong when iohgatp != bare. The RISC-V
>> IOMMU specification has defined that the PDT is based on GPA, not
>> SPA. So this patch fixes the problem, making PDT walk correctly
>> when the G-stage table walk is enabled.
>>
>> Fixes: 0c54acb8243d ("hw/riscv: add RISC-V IOMMU base emulation")
>> Cc: qemu-stable@nongnu.org
>> Cc: Sebastien Boeuf <seb@rivosinc.com>
>> Cc: Tomasz Jeznach <tjeznach@rivosinc.com>
>> Signed-off-by: Guo Ren (Alibaba DAMO Academy) <guoren@kernel.org>
>> ---
>> Changes in V2:
>> - Remove nested param to make patch clearer.
>>
>> hw/riscv/riscv-iommu.c | 141 ++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 139 insertions(+), 2 deletions(-)
>>
> LGTM.
>
> "3.3.2. Process to locate the Process-context
>
> The device-context provides the PDT root page PPN (pdtp.ppn). When DC.iohgatp.mode is not
>
> Bare, pdtp.PPN as well as pdte.PPN are Guest Physical Addresses (GPA) which must be translated
>
> into Supervisor Physical Addresses (SPA) using the second-stage page table pointed to by
>
> DC.iohgatp. The memory accesses to the PDT are treated as implicit read memory accesses by the
>
> second-stage."
>
> Reviewed-by: Weiwei Li <liwei1518@gmail.com>
Thanks for pointing out the spec wording. I will add this quotation to
the commit log in the next version.
>
> Regards,
>
> Weiwei Li
>
>
>> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
>> index 96a7fbdefcf3..ded3f7b2fdce 100644
>> --- a/hw/riscv/riscv-iommu.c
>> +++ b/hw/riscv/riscv-iommu.c
>> @@ -866,6 +866,143 @@ static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
>> return true;
>> }
>>
>> +/**
>> + * pdt_memory_read: PDT wrapper of dma_memory_read.
>> + *
>> + * @s: IOMMU Device State
>> + * @ctx: Device Translation Context with devid and pasid set
>> + * @addr: address within that address space
>> + * @buf: buffer with the data transferred
>> + * @len: length of the data transferred
>> + * @attrs: memory transaction attributes
>> + */
>> +static MemTxResult pdt_memory_read(RISCVIOMMUState *s,
>> + RISCVIOMMUContext *ctx,
>> + dma_addr_t addr,
>> + void *buf, dma_addr_t len,
>> + MemTxAttrs attrs)
>> +{
>> + uint64_t gatp_mode, pte;
>> + struct {
>> + unsigned char step;
>> + unsigned char levels;
>> + unsigned char ptidxbits;
>> + unsigned char ptesize;
>> + } sc;
>> + MemTxResult ret;
>> + dma_addr_t base = addr;
>> +
>> + /* G stages translation mode */
>> + gatp_mode = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
>> + if (gatp_mode == RISCV_IOMMU_DC_IOHGATP_MODE_BARE)
>> + goto out;
>> +
>> + /* G stages translation tables root pointer */
>> + base = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
>> +
>> + /* Start at step 0 */
>> + sc.step = 0;
>> +
>> + if (s->fctl & RISCV_IOMMU_FCTL_GXL) {
>> + /* 32bit mode for GXL == 1 */
>> + switch (gatp_mode) {
>> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
>> + if (!(s->cap & RISCV_IOMMU_CAP_SV32X4)) {
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + sc.levels = 2;
>> + sc.ptidxbits = 10;
>> + sc.ptesize = 4;
>> + break;
>> + default:
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + } else {
>> + /* 64bit mode for GXL == 0 */
>> + switch (gatp_mode) {
>> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
>> + if (!(s->cap & RISCV_IOMMU_CAP_SV39X4)) {
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + sc.levels = 3;
>> + sc.ptidxbits = 9;
>> + sc.ptesize = 8;
>> + break;
>> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
>> + if (!(s->cap & RISCV_IOMMU_CAP_SV48X4)) {
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + sc.levels = 4;
>> + sc.ptidxbits = 9;
>> + sc.ptesize = 8;
>> + break;
>> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
>> + if (!(s->cap & RISCV_IOMMU_CAP_SV57X4)) {
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + sc.levels = 5;
>> + sc.ptidxbits = 9;
>> + sc.ptesize = 8;
>> + break;
>> + default:
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + }
>> +
>> + do {
>> + const unsigned va_bits = (sc.step ? 0 : 2) + sc.ptidxbits;
>> + const unsigned va_skip = TARGET_PAGE_BITS + sc.ptidxbits *
>> + (sc.levels - 1 - sc.step);
>> + const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
>> + const dma_addr_t pte_addr = base + idx * sc.ptesize;
>> +
>> + /* Address range check before first level lookup */
>> + if (!sc.step) {
>> + const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
>> + if ((addr & va_mask) != addr) {
>> + return MEMTX_ACCESS_ERROR;
>> + }
>> + }
>> +
>> + /* Read page table entry */
>> + if (sc.ptesize == 4) {
>> + uint32_t pte32 = 0;
>> + ret = ldl_le_dma(s->target_as, pte_addr, &pte32, attrs);
>> + pte = pte32;
>> + } else {
>> + ret = ldq_le_dma(s->target_as, pte_addr, &pte, attrs);
>> + }
>> + if (ret != MEMTX_OK)
>> + return ret;
>> +
>> + sc.step++;
>> + hwaddr ppn = pte >> PTE_PPN_SHIFT;
>> +
>> + if (!(pte & PTE_V)) {
>> + return MEMTX_ACCESS_ERROR; /* Invalid PTE */
>> + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
>> + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W */
>> + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
>> + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W + PTE_X */
>> + } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
>> + return MEMTX_ACCESS_ERROR; /* Misaligned PPN */
>> + } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
>> + base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
>> + } else {
>> + /* Leaf PTE, translation completed. */
>> + base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
>> + break;
>> + }
>> +
>> + if (sc.step == sc.levels) {
>> + return MEMTX_ACCESS_ERROR; /* Can't find leaf PTE */
>> + }
>> + } while (1);
>> +
>> +out:
>> + return dma_memory_read(s->target_as, base, buf, len, attrs);
>> +}
>> +
>> /*
>> * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
>> *
>> @@ -1038,7 +1175,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>> */
>> const int split = depth * 9 + 8;
>> addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
>> - if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
>> + if (pdt_memory_read(s, ctx, addr, &de, sizeof(de),
>> MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
>> }
>> @@ -1053,7 +1190,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>>
>> /* Leaf entry in PDT */
>> addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
>> - if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
>> + if (pdt_memory_read(s, ctx, addr, &dc.ta, sizeof(uint64_t) * 2,
>> MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
>> }
>> --
>> 2.40.1
>>
--
Best Regards
Guo Ren
On 8/5/2025 1:12 AM, guoren@kernel.org wrote:
> From: "Guo Ren (Alibaba DAMO Academy)" <guoren@kernel.org>
>
> Current implementation is wrong when iohgatp != bare. The RISC-V
> IOMMU specification has defined that the PDT is based on GPA, not
> SPA. So this patch fixes the problem, making PDT walk correctly
> when the G-stage table walk is enabled.
>
> Fixes: 0c54acb8243d ("hw/riscv: add RISC-V IOMMU base emulation")
> Cc: qemu-stable@nongnu.org
> Cc: Sebastien Boeuf <seb@rivosinc.com>
> Cc: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Guo Ren (Alibaba DAMO Academy) <guoren@kernel.org>
> ---
> Changes in V2:
> - Remove nested param to make patch clearer.
>
> hw/riscv/riscv-iommu.c | 141 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 139 insertions(+), 2 deletions(-)
>
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> index 96a7fbdefcf3..ded3f7b2fdce 100644
> --- a/hw/riscv/riscv-iommu.c
> +++ b/hw/riscv/riscv-iommu.c
> @@ -866,6 +866,143 @@ static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> return true;
> }
>
> +/**
> + * pdt_memory_read: PDT wrapper of dma_memory_read.
> + *
> + * @s: IOMMU Device State
> + * @ctx: Device Translation Context with devid and pasid set
> + * @addr: address within that address space
> + * @buf: buffer with the data transferred
> + * @len: length of the data transferred
> + * @attrs: memory transaction attributes
> + */
> +static MemTxResult pdt_memory_read(RISCVIOMMUState *s,
> + RISCVIOMMUContext *ctx,
> + dma_addr_t addr,
> + void *buf, dma_addr_t len,
> + MemTxAttrs attrs)
> +{
> + uint64_t gatp_mode, pte;
> + struct {
> + unsigned char step;
> + unsigned char levels;
> + unsigned char ptidxbits;
> + unsigned char ptesize;
> + } sc;
> + MemTxResult ret;
> + dma_addr_t base = addr;
> +
> + /* G stages translation mode */
> + gatp_mode = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> + if (gatp_mode == RISCV_IOMMU_DC_IOHGATP_MODE_BARE)
> + goto out;
> +
> + /* G stages translation tables root pointer */
> + base = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +
> + /* Start at step 0 */
> + sc.step = 0;
> +
> + if (s->fctl & RISCV_IOMMU_FCTL_GXL) {
> + /* 32bit mode for GXL == 1 */
> + switch (gatp_mode) {
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV32X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 2;
> + sc.ptidxbits = 10;
> + sc.ptesize = 4;
> + break;
> + default:
> + return MEMTX_ACCESS_ERROR;
> + }
> + } else {
> + /* 64bit mode for GXL == 0 */
> + switch (gatp_mode) {
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV39X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 3;
> + sc.ptidxbits = 9;
> + sc.ptesize = 8;
> + break;
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV48X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 4;
> + sc.ptidxbits = 9;
> + sc.ptesize = 8;
> + break;
> + case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> + if (!(s->cap & RISCV_IOMMU_CAP_SV57X4)) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + sc.levels = 5;
> + sc.ptidxbits = 9;
> + sc.ptesize = 8;
> + break;
> + default:
> + return MEMTX_ACCESS_ERROR;
> + }
> + }
How about moving the variables 'gatp_mode', 'base' and 'sc' out of
this wrapper function? All of them except 'sc.step' stay the same
across the traversal of the PDT; a sketch of that hoisting follows
below.
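
For illustration, a minimal sketch of that hoisting (the struct and
helper names here are hypothetical, and the s->cap capability checks
from the patch are omitted for brevity):

/* Hypothetical helper: derive the G-stage walk geometry once. */
typedef struct {
    unsigned char levels;
    unsigned char ptidxbits;
    unsigned char ptesize;
} GStageWalkCfg;

static bool gstage_walk_cfg(uint64_t gatp_mode, bool gxl,
                            GStageWalkCfg *cfg)
{
    if (gxl) {
        /* 32bit mode for GXL == 1 */
        if (gatp_mode != RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4) {
            return false;
        }
        cfg->levels = 2;
        cfg->ptidxbits = 10;
        cfg->ptesize = 4;
        return true;
    }
    /* 64bit mode for GXL == 0: all modes share 9/8 geometry. */
    cfg->ptidxbits = 9;
    cfg->ptesize = 8;
    switch (gatp_mode) {
    case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
        cfg->levels = 3;
        return true;
    case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
        cfg->levels = 4;
        return true;
    case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
        cfg->levels = 5;
        return true;
    default:
        return false;
    }
}

pdt_memory_read() would then carry only the per-walk 'step' as local
state.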
Otherwise,
Reviewed-by: Nutty Liu <liujingqi@lanxincomputing.com>
Thanks,
Nutty
> +
> + do {
> + const unsigned va_bits = (sc.step ? 0 : 2) + sc.ptidxbits;
> + const unsigned va_skip = TARGET_PAGE_BITS + sc.ptidxbits *
> + (sc.levels - 1 - sc.step);
> + const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> + const dma_addr_t pte_addr = base + idx * sc.ptesize;
> +
> + /* Address range check before first level lookup */
> + if (!sc.step) {
> + const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> + if ((addr & va_mask) != addr) {
> + return MEMTX_ACCESS_ERROR;
> + }
> + }
> +
> + /* Read page table entry */
> + if (sc.ptesize == 4) {
> + uint32_t pte32 = 0;
> + ret = ldl_le_dma(s->target_as, pte_addr, &pte32, attrs);
> + pte = pte32;
> + } else {
> + ret = ldq_le_dma(s->target_as, pte_addr, &pte, attrs);
> + }
> + if (ret != MEMTX_OK)
> + return ret;
> +
> + sc.step++;
> + hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> + if (!(pte & PTE_V)) {
> + return MEMTX_ACCESS_ERROR; /* Invalid PTE */
> + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W */
> + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W + PTE_X */
> + } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> + return MEMTX_ACCESS_ERROR; /* Misaligned PPN */
> + } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> + base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> + } else {
> + /* Leaf PTE, translation completed. */
> + base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> + break;
> + }
> +
> + if (sc.step == sc.levels) {
> + return MEMTX_ACCESS_ERROR; /* Can't find leaf PTE */
> + }
> + } while (1);
> +
> +out:
> + return dma_memory_read(s->target_as, base, buf, len, attrs);
> +}
> +
> /*
> * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> *
> @@ -1038,7 +1175,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> */
> const int split = depth * 9 + 8;
> addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> - if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> + if (pdt_memory_read(s, ctx, addr, &de, sizeof(de),
> MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> }
> @@ -1053,7 +1190,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>
> /* Leaf entry in PDT */
> addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> - if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> + if (pdt_memory_read(s, ctx, addr, &dc.ta, sizeof(uint64_t) * 2,
> MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> }
On Wed, Aug 6, 2025 at 4:58 PM Nutty Liu <liujingqi@lanxincomputing.com> wrote:
>
> On 8/5/2025 1:12 AM, guoren@kernel.org wrote:
> > From: "Guo Ren (Alibaba DAMO Academy)" <guoren@kernel.org>
> >
> > Current implementation is wrong when iohgatp != bare. The RISC-V
> > IOMMU specification has defined that the PDT is based on GPA, not
> > SPA. So this patch fixes the problem, making PDT walk correctly
> > when the G-stage table walk is enabled.
> >
> > Fixes: 0c54acb8243d ("hw/riscv: add RISC-V IOMMU base emulation")
> > Cc: qemu-stable@nongnu.org
> > Cc: Sebastien Boeuf <seb@rivosinc.com>
> > Cc: Tomasz Jeznach <tjeznach@rivosinc.com>
> > Signed-off-by: Guo Ren (Alibaba DAMO Academy) <guoren@kernel.org>
> > ---
> > Changes in V2:
> > - Remove nested param to make patch clearer.
> >
> > hw/riscv/riscv-iommu.c | 141 ++++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 139 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> > index 96a7fbdefcf3..ded3f7b2fdce 100644
> > --- a/hw/riscv/riscv-iommu.c
> > +++ b/hw/riscv/riscv-iommu.c
> > @@ -866,6 +866,143 @@ static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> > return true;
> > }
> >
> > +/**
> > + * pdt_memory_read: PDT wrapper of dma_memory_read.
> > + *
> > + * @s: IOMMU Device State
> > + * @ctx: Device Translation Context with devid and pasid set
> > + * @addr: address within that address space
> > + * @buf: buffer with the data transferred
> > + * @len: length of the data transferred
> > + * @attrs: memory transaction attributes
> > + */
> > +static MemTxResult pdt_memory_read(RISCVIOMMUState *s,
> > + RISCVIOMMUContext *ctx,
> > + dma_addr_t addr,
> > + void *buf, dma_addr_t len,
> > + MemTxAttrs attrs)
> > +{
> > + uint64_t gatp_mode, pte;
> > + struct {
> > + unsigned char step;
> > + unsigned char levels;
> > + unsigned char ptidxbits;
> > + unsigned char ptesize;
> > + } sc;
> > + MemTxResult ret;
> > + dma_addr_t base = addr;
> > +
> > + /* G stages translation mode */
> > + gatp_mode = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> > + if (gatp_mode == RISCV_IOMMU_DC_IOHGATP_MODE_BARE)
> > + goto out;
> > +
> > + /* G stages translation tables root pointer */
> > + base = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> > +
> > + /* Start at step 0 */
> > + sc.step = 0;
> > +
> > + if (s->fctl & RISCV_IOMMU_FCTL_GXL) {
> > + /* 32bit mode for GXL == 1 */
> > + switch (gatp_mode) {
> > + case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> > + if (!(s->cap & RISCV_IOMMU_CAP_SV32X4)) {
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + sc.levels = 2;
> > + sc.ptidxbits = 10;
> > + sc.ptesize = 4;
> > + break;
> > + default:
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + } else {
> > + /* 64bit mode for GXL == 0 */
> > + switch (gatp_mode) {
> > + case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> > + if (!(s->cap & RISCV_IOMMU_CAP_SV39X4)) {
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + sc.levels = 3;
> > + sc.ptidxbits = 9;
> > + sc.ptesize = 8;
> > + break;
> > + case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> > + if (!(s->cap & RISCV_IOMMU_CAP_SV48X4)) {
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + sc.levels = 4;
> > + sc.ptidxbits = 9;
> > + sc.ptesize = 8;
> > + break;
> > + case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> > + if (!(s->cap & RISCV_IOMMU_CAP_SV57X4)) {
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + sc.levels = 5;
> > + sc.ptidxbits = 9;
> > + sc.ptesize = 8;
> > + break;
> > + default:
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + }
> How about moving the variables of 'gatp_mode', 'base' and 'sc' out of
> this wrapper function ?
> Since all of them are the same except for 'sc.step' during the traversal
> of PDT.
This is a fixup patch, so I want the wrapper function to handle all
the necessary setup itself and minimize the impact on other code.
Recomputing those values per read is cheap, so there is no need to
complicate the code for it.
>
> Otherwise,
> Reviewed-by: Nutty Liu <liujingqi@lanxincomputing.com>
Thx.
>
> Thanks,
> Nutty
> > +
> > + do {
> > + const unsigned va_bits = (sc.step ? 0 : 2) + sc.ptidxbits;
> > + const unsigned va_skip = TARGET_PAGE_BITS + sc.ptidxbits *
> > + (sc.levels - 1 - sc.step);
> > + const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> > + const dma_addr_t pte_addr = base + idx * sc.ptesize;
> > +
> > + /* Address range check before first level lookup */
> > + if (!sc.step) {
> > + const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> > + if ((addr & va_mask) != addr) {
> > + return MEMTX_ACCESS_ERROR;
> > + }
> > + }
> > +
> > + /* Read page table entry */
> > + if (sc.ptesize == 4) {
> > + uint32_t pte32 = 0;
> > + ret = ldl_le_dma(s->target_as, pte_addr, &pte32, attrs);
> > + pte = pte32;
> > + } else {
> > + ret = ldq_le_dma(s->target_as, pte_addr, &pte, attrs);
> > + }
> > + if (ret != MEMTX_OK)
> > + return ret;
> > +
> > + sc.step++;
> > + hwaddr ppn = pte >> PTE_PPN_SHIFT;
> > +
> > + if (!(pte & PTE_V)) {
> > + return MEMTX_ACCESS_ERROR; /* Invalid PTE */
> > + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> > + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W */
> > + } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> > + return MEMTX_ACCESS_ERROR; /* Reserved leaf PTE flags: PTE_W + PTE_X */
> > + } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> > + return MEMTX_ACCESS_ERROR; /* Misaligned PPN */
> > + } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> > + base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> > + } else {
> > + /* Leaf PTE, translation completed. */
> > + base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> > + break;
> > + }
> > +
> > + if (sc.step == sc.levels) {
> > + return MEMTX_ACCESS_ERROR; /* Can't find leaf PTE */
> > + }
> > + } while (1);
> > +
> > +out:
> > + return dma_memory_read(s->target_as, base, buf, len, attrs);
> > +}
> > +
> > /*
> > * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> > *
> > @@ -1038,7 +1175,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> > */
> > const int split = depth * 9 + 8;
> > addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> > - if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> > + if (pdt_memory_read(s, ctx, addr, &de, sizeof(de),
> > MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> > }
> > @@ -1053,7 +1190,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> >
> > /* Leaf entry in PDT */
> > addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> > - if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> > + if (pdt_memory_read(s, ctx, addr, &dc.ta, sizeof(uint64_t) * 2,
> > MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> > return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> > }
--
Best Regards
Guo Ren