[RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report

Aneesh Kumar K.V (Arm) posted 38 patches 2 months, 1 week ago
[RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Aneesh Kumar K.V (Arm) 2 months, 1 week ago
This starts the sequence to transition the realm device to the TDISP RUN
state by writing 1 to 'tsm/accept'.

Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 arch/arm64/include/asm/rsi_cmds.h         |  18 +++
 arch/arm64/include/asm/rsi_smc.h          |   4 +
 drivers/virt/coco/arm-cca-guest/arm-cca.c |   3 +
 drivers/virt/coco/arm-cca-guest/rsi-da.c  | 132 ++++++++++++++++++++++
 drivers/virt/coco/arm-cca-guest/rsi-da.h  |  25 ++++
 5 files changed, 182 insertions(+)

diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h
index 18fc4e1ce577..1cc00d404e53 100644
--- a/arch/arm64/include/asm/rsi_cmds.h
+++ b/arch/arm64/include/asm/rsi_cmds.h
@@ -228,4 +228,22 @@ static inline unsigned long rsi_host_call(phys_addr_t addr)
 	return res.a0;
 }
 
+static inline unsigned long
+rsi_rdev_validate_mapping(unsigned long vdev_id, unsigned long inst_id,
+			  phys_addr_t start_ipa, phys_addr_t end_ipa,
+			  phys_addr_t io_pa, phys_addr_t *next_ipa, unsigned long flags)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RSI_RDEV_VALIDATE_MAPPING, vdev_id,
+			     inst_id, start_ipa, end_ipa, io_pa, flags, &res);
+	*next_ipa = res.a1;
+
+	if (res.a2 != RSI_ACCEPT)
+		return -EPERM;
+
+	if (res.a0 != RSI_SUCCESS)
+		return -EINVAL;
+	return 0;
+}
 #endif /* __ASM_RSI_CMDS_H */
diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h
index 1d762fe3777b..a28b41cf01ca 100644
--- a/arch/arm64/include/asm/rsi_smc.h
+++ b/arch/arm64/include/asm/rsi_smc.h
@@ -204,4 +204,8 @@ struct rsi_host_call {
 
 #define SMC_RSI_RDEV_LOCK			SMC_RSI_FID(0x1a9)
 
+#define RSI_DEV_MEM_COHERENT		BIT(0)
+#define RSI_DEV_MEM_LIMITED_ORDER	BIT(1)
+#define SMC_RSI_RDEV_VALIDATE_MAPPING		SMC_RSI_FID(0x1ac)
+
 #endif /* __ASM_RSI_SMC_H_ */
diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca.c b/drivers/virt/coco/arm-cca-guest/arm-cca.c
index de70fba09e92..c1cefb983ac7 100644
--- a/drivers/virt/coco/arm-cca-guest/arm-cca.c
+++ b/drivers/virt/coco/arm-cca-guest/arm-cca.c
@@ -247,6 +247,9 @@ static void cca_tsm_unlock(struct pci_dev *pdev)
 	int vdev_id = (pci_domain_nr(pdev->bus) << 16) |
 		PCI_DEVID(pdev->bus->number, pdev->devfn);
 
+	/* invalidate dev mapping based on interface report */
+	rsi_update_interface_report(pdev, false);
+
 	ret = rhi_da_vdev_set_tdi_state(vdev_id, RHI_DA_TDI_CONFIG_UNLOCKED);
 	if (ret) {
 		pci_err(pdev, "failed to TSM unbind the device (%ld)\n", ret);
diff --git a/drivers/virt/coco/arm-cca-guest/rsi-da.c b/drivers/virt/coco/arm-cca-guest/rsi-da.c
index 47b379318e7c..936f844880de 100644
--- a/drivers/virt/coco/arm-cca-guest/rsi-da.c
+++ b/drivers/virt/coco/arm-cca-guest/rsi-da.c
@@ -215,3 +215,135 @@ int rsi_device_lock(struct pci_dev *pdev)
 
 	return ret;
 }
+
+static inline unsigned long
+rsi_validate_dev_mapping(unsigned long vdev_id, unsigned long inst_id,
+			 phys_addr_t start_ipa, phys_addr_t end_ipa,
+			 phys_addr_t io_pa, unsigned long flags)
+{
+	unsigned long ret;
+	phys_addr_t next_ipa;
+
+	while (start_ipa < end_ipa) {
+		ret = rsi_rdev_validate_mapping(vdev_id, inst_id,
+						start_ipa, end_ipa,
+						io_pa, &next_ipa, flags);
+		if (ret || next_ipa < start_ipa || next_ipa > end_ipa)
+			return -EINVAL;
+		io_pa += next_ipa - start_ipa;
+		start_ipa = next_ipa;
+	}
+	return 0;
+}
+
+static int rsi_invalidate_dev_mapping(phys_addr_t start_ipa, phys_addr_t end_ipa)
+{
+	return rsi_set_memory_range(start_ipa, end_ipa, RSI_RIPAS_EMPTY,
+				    RSI_CHANGE_DESTROYED);
+}
+
+static int get_msix_bar(struct pci_dev *pdev, int cap)
+{
+	int bar;
+	u32 table_offset;
+	unsigned long flags;
+
+	pci_read_config_dword(pdev, pdev->msix_cap + cap, &table_offset);
+	bar = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
+	flags = pci_resource_flags(pdev, bar);
+	if (!flags || (flags & IORESOURCE_UNSET))
+		return -1;
+
+	return bar;
+}
+
+int rsi_update_interface_report(struct pci_dev *pdev, bool validate)
+{
+	int ret;
+	struct resource *r;
+	int msix_tbl_bar, msix_pba_bar;
+	unsigned int range_id;
+	unsigned long mmio_start_phys;
+	unsigned long mmio_flags = 0; /* non coherent, not limited order */
+	struct pci_tdisp_mmio_range *mmio_range;
+	struct pci_tdisp_device_interface_report *interface_report;
+	struct cca_guest_dsc *dsm = to_cca_guest_dsc(pdev);
+	int vdev_id = (pci_domain_nr(pdev->bus) << 16) |
+		PCI_DEVID(pdev->bus->number, pdev->devfn);
+
+
+	interface_report = (struct pci_tdisp_device_interface_report *)dsm->interface_report;
+	mmio_range = (struct pci_tdisp_mmio_range *)(interface_report + 1);
+
+
+	msix_tbl_bar = get_msix_bar(pdev, PCI_MSIX_TABLE);
+	msix_pba_bar = get_msix_bar(pdev, PCI_MSIX_PBA);
+
+	for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
+
+		/*FIXME!! units in 4K size*/
+		range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
+
+		/* no secure interrupts */
+		if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
+			pr_info("Skipping misx table\n");
+			continue;
+		}
+
+		if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
+			pr_info("Skipping misx pba\n");
+			continue;
+		}
+
+		r = pci_resource_n(pdev, range_id);
+
+		if (r->end == r->start ||
+		    ((r->end - r->start + 1) & ~PAGE_MASK) || !mmio_range->num_pages) {
+			pci_warn(pdev, "Skipping broken range [%d] #%d %d pages, %llx..%llx\n",
+				i, range_id, mmio_range->num_pages, r->start, r->end);
+			continue;
+		}
+
+		if (FIELD_GET(TSM_INTF_REPORT_MMIO_IS_NON_TEE, mmio_range->range_attributes)) {
+			pci_info(pdev, "Skipping non-TEE range [%d] #%d %d pages, %llx..%llx\n",
+				 i, range_id, mmio_range->num_pages, r->start, r->end);
+			continue;
+		}
+
+		/* No secure interrupts, we should not find this set, ignore for now. */
+		if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) ||
+		    FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) {
+			pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n",
+				 FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes),
+				 FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes),
+				 i, range_id, mmio_range->num_pages, r->start, r->end);
+			continue;
+		}
+
+		mmio_start_phys = mmio_range->first_page;
+		if (validate)
+			ret = rsi_validate_dev_mapping(vdev_id, dsm->instance_id,
+						       r->start, r->end + 1,
+						       mmio_start_phys << 12, mmio_flags);
+		else
+			ret = rsi_invalidate_dev_mapping(r->start, r->end + 1);
+		if (ret) {
+			pci_err(pdev, "failed to set protection attributes for the address range\n");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
+int rsi_device_start(struct pci_dev *pdev)
+{
+	int ret;
+
+	ret = rsi_update_interface_report(pdev, true);
+	if (ret) {
+		pci_err(pdev, "failed validate the interface report\n");
+		return -EIO;
+	}
+
+	return 0;
+}
diff --git a/drivers/virt/coco/arm-cca-guest/rsi-da.h b/drivers/virt/coco/arm-cca-guest/rsi-da.h
index bd565785ff4b..0d6e1c0ada4a 100644
--- a/drivers/virt/coco/arm-cca-guest/rsi-da.h
+++ b/drivers/virt/coco/arm-cca-guest/rsi-da.h
@@ -11,6 +11,28 @@
 #include <asm/rsi_smc.h>
 #include <asm/rhi.h>
 
+struct pci_tdisp_device_interface_report {
+	u16 interface_info;
+	u16 reserved;
+	u16 msi_x_message_control;
+	u16 lnr_control;
+	u32 tph_control;
+	u32 mmio_range_count;
+} __packed;
+
+struct pci_tdisp_mmio_range {
+	u64 first_page;
+	u32 num_pages;
+	u32 range_attributes;
+} __packed;
+
+#define TSM_INTF_REPORT_MMIO_MSIX_TABLE		BIT(0)
+#define TSM_INTF_REPORT_MMIO_PBA		BIT(1)
+#define TSM_INTF_REPORT_MMIO_IS_NON_TEE		BIT(2)
+#define TSM_INTF_REPORT_MMIO_IS_UPDATABLE	BIT(3)
+#define TSM_INTF_REPORT_MMIO_RESERVED		GENMASK(15, 4)
+#define TSM_INTF_REPORT_MMIO_RANGE_ID		GENMASK(31, 16)
+
 struct cca_guest_dsc {
 	struct pci_tsm_pf0 pci;
 	unsigned long instance_id;
@@ -29,5 +51,8 @@ static inline struct cca_guest_dsc *to_cca_guest_dsc(struct pci_dev *pdev)
 	return container_of(tsm, struct cca_guest_dsc, pci.tsm);
 }
 
+int rsi_update_interface_report(struct pci_dev *pdev, bool validate);
 int rsi_device_lock(struct pci_dev *pdev);
+int rsi_device_start(struct pci_dev *pdev);
+
 #endif
-- 
2.43.0
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Bjorn Helgaas 2 months ago
On Mon, Jul 28, 2025 at 07:22:11PM +0530, Aneesh Kumar K.V (Arm) wrote:
> This starts the sequence to transition the realm device to the TDISP RUN
> state by writing 1 to 'tsm/accept'.

> +++ b/drivers/virt/coco/arm-cca-guest/rsi-da.c

> +int rsi_update_interface_report(struct pci_dev *pdev, bool validate)
> +{
> +	int ret;
> +	struct resource *r;
> +	int msix_tbl_bar, msix_pba_bar;
> +	unsigned int range_id;
> +	unsigned long mmio_start_phys;
> +	unsigned long mmio_flags = 0; /* non coherent, not limited order */
> +	struct pci_tdisp_mmio_range *mmio_range;
> +	struct pci_tdisp_device_interface_report *interface_report;
> +	struct cca_guest_dsc *dsm = to_cca_guest_dsc(pdev);
> +	int vdev_id = (pci_domain_nr(pdev->bus) << 16) |
> +		PCI_DEVID(pdev->bus->number, pdev->devfn);
> +
> +
> +	interface_report = (struct pci_tdisp_device_interface_report *)dsm->interface_report;
> +	mmio_range = (struct pci_tdisp_mmio_range *)(interface_report + 1);
> +
> +
> +	msix_tbl_bar = get_msix_bar(pdev, PCI_MSIX_TABLE);
> +	msix_pba_bar = get_msix_bar(pdev, PCI_MSIX_PBA);
> +
> +	for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
> +
> +		/*FIXME!! units in 4K size*/

I guess you intend to fix this?

> +		/* no secure interrupts */
> +		if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
> +			pr_info("Skipping misx table\n");
> +			continue;
> +		}
> +
> +		if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
> +			pr_info("Skipping misx pba\n");

s/misx/MSI-X/ (twice)
s/pba/PBA/
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Arto Merilainen 2 months ago
On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:

> +	for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
> +
> +		/*FIXME!! units in 4K size*/
> +		range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
> +
> +		/* no secure interrupts */
> +		if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
> +			pr_info("Skipping misx table\n");
> +			continue;
> +		}
> +
> +		if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
> +			pr_info("Skipping misx pba\n");
> +			continue;
> +		}
> +


MSI-X and PBA can be placed to a BAR that has other registers as well. 
While the PCIe specification recommends BAR-level isolation for MSI-X 
structures, it is not mandated. It is enough to have sufficient 
isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs 
altogether may leave registers unintentionally mapped via unprotected 
IPA when they should have been mapped via protected IPA.

Instead of skipping the whole BAR, would it make sense to determine
where the MSI-X related regions reside, and skip validation only from 
these regions?

- R2
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Arto Merilainen 3 weeks, 4 days ago
On 31.7.2025 14.39, Arto Merilainen wrote:
> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
> 
>> +    for (int i = 0; i < interface_report->mmio_range_count; i++, 
>> mmio_range++) {
>> +
>> +        /*FIXME!! units in 4K size*/
>> +        range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, 
>> mmio_range->range_attributes);
>> +
>> +        /* no secure interrupts */
>> +        if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
>> +            pr_info("Skipping misx table\n");
>> +            continue;
>> +        }
>> +
>> +        if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
>> +            pr_info("Skipping misx pba\n");
>> +            continue;
>> +        }
>> +
> 
> 
> MSI-X and PBA can be placed to a BAR that has other registers as well. 
> While the PCIe specification recommends BAR-level isolation for MSI-X 
> structures, it is not mandated. It is enough to have sufficient 
> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs 
> altogether may leave registers unintentionally mapped via unprotected 
> IPA when they should have been mapped via protected IPA.
> 
> Instead of skipping the whole BAR, would it make sense to determine
> where the MSI-X related regions reside, and skip validation only from 
> these regions?

I re-reviewed my suggestion, and what I proposed here seems wrong. 
However, I think there is a different more generic problem related to 
the MSI-X table, PBA and non-TEE ranges.

If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or 
non-TEE areas), the TDISP interface report may contain multiple ranges 
with the same range_id (/BAR id). In case a BAR contains some registers 
in low addresses, the MSI-X table and other registers after the MSI-X 
table, the interface report is expected to have two ranges for the same 
BAR with different "first 4k page" and "size" fields.

This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING 
requires both the ipa_base and pa_base which should correspond to the 
same location. In above scenario, the PA of the first range would 
correspond to the BAR base whereas the second range would correspond to 
a location residing after the MSI-X table.

Assuming that the report contains obfuscated (but linear) physical 
addresses, it would be possible to create heuristics for this case. 
However, the fundamental problem is that none of the "first 4k page" 
fields in the ranges is guaranteed to correspond to the base of any BAR: 
Consider a case where the MSI-X table is in the beginning of a BAR and 
it is followed by a single TEE range. If the MSI-X is not locked, the 
"first 4k page" field will not correspond to the beginning of the BAR. 
If the realm naiviely reads the ipa_base using pci_resouce_n() and 
corresponding pa_base from the interface report, the addresses won't 
match and the validation will fail.

It seems that interpreting the interface report cannot be done without 
knowledge of the device's register layout. Therefore, I don't think the 
ranges can be validated/remapped automatically without involving the 
device driver, but there should be APIs for reading the interface 
report, and for requesting making specific ranges protected.

- R2
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Aneesh Kumar K.V 3 weeks, 3 days ago
Arto Merilainen <amerilainen@nvidia.com> writes:

> On 31.7.2025 14.39, Arto Merilainen wrote:
>> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
>> 
>>> +    for (int i = 0; i < interface_report->mmio_range_count; i++, 
>>> mmio_range++) {
>>> +
>>> +        /*FIXME!! units in 4K size*/
>>> +        range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, 
>>> mmio_range->range_attributes);
>>> +
>>> +        /* no secure interrupts */
>>> +        if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
>>> +            pr_info("Skipping misx table\n");
>>> +            continue;
>>> +        }
>>> +
>>> +        if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
>>> +            pr_info("Skipping misx pba\n");
>>> +            continue;
>>> +        }
>>> +
>> 
>> 
>> MSI-X and PBA can be placed to a BAR that has other registers as well. 
>> While the PCIe specification recommends BAR-level isolation for MSI-X 
>> structures, it is not mandated. It is enough to have sufficient 
>> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs 
>> altogether may leave registers unintentionally mapped via unprotected 
>> IPA when they should have been mapped via protected IPA.
>> 
>> Instead of skipping the whole BAR, would it make sense to determine
>> where the MSI-X related regions reside, and skip validation only from 
>> these regions?
>
> I re-reviewed my suggestion, and what I proposed here seems wrong. 
> However, I think there is a different more generic problem related to 
> the MSI-X table, PBA and non-TEE ranges.
>
> If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or 
> non-TEE areas), the TDISP interface report may contain multiple ranges 
> with the same range_id (/BAR id). In case a BAR contains some registers 
> in low addresses, the MSI-X table and other registers after the MSI-X 
> table, the interface report is expected to have two ranges for the same 
> BAR with different "first 4k page" and "size" fields.
>
> This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING 
> requires both the ipa_base and pa_base which should correspond to the 
> same location. In above scenario, the PA of the first range would 
> correspond to the BAR base whereas the second range would correspond to 
> a location residing after the MSI-X table.
>
> Assuming that the report contains obfuscated (but linear) physical 
> addresses, it would be possible to create heuristics for this case. 
> However, the fundamental problem is that none of the "first 4k page" 
> fields in the ranges is guaranteed to correspond to the base of any BAR: 
> Consider a case where the MSI-X table is in the beginning of a BAR and 
> it is followed by a single TEE range. If the MSI-X is not locked, the 
> "first 4k page" field will not correspond to the beginning of the BAR. 
> If the realm naiviely reads the ipa_base using pci_resouce_n() and 
> corresponding pa_base from the interface report, the addresses won't 
> match and the validation will fail.
>
> It seems that interpreting the interface report cannot be done without 
> knowledge of the device's register layout. Therefore, I don't think the 
> ranges can be validated/remapped automatically without involving the 
> device driver, but there should be APIs for reading the interface 
> report, and for requesting making specific ranges protected.
>

But we need to validate the interface report before accepting the device,
and the device driver is only loaded after the device has been accepted.

Can we assume that only the MSI-X table and PBA ranges may be missing
from the interface report, while all other non-secure regions are
reported as NON-TEE ranges?

If so, we could retrieve the MSI-X guest real address details from
config space and map the beginning of the BAR correctly.

Dan / Yilun — how is this handled in Intel TDX?

From what I can see, the AMD patches appear to encounter the same issue.

-aneesh
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by dan.j.williams@intel.com 3 weeks, 3 days ago
Aneesh Kumar K.V wrote:
> Arto Merilainen <amerilainen@nvidia.com> writes:
> 
> > On 31.7.2025 14.39, Arto Merilainen wrote:
> >> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
> >> 
> >>> +    for (int i = 0; i < interface_report->mmio_range_count; i++, 
> >>> mmio_range++) {
> >>> +
> >>> +        /*FIXME!! units in 4K size*/
> >>> +        range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, 
> >>> mmio_range->range_attributes);
> >>> +
> >>> +        /* no secure interrupts */
> >>> +        if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
> >>> +            pr_info("Skipping misx table\n");
> >>> +            continue;
> >>> +        }
> >>> +
> >>> +        if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
> >>> +            pr_info("Skipping misx pba\n");
> >>> +            continue;
> >>> +        }
> >>> +
> >> 
> >> 
> >> MSI-X and PBA can be placed to a BAR that has other registers as well. 
> >> While the PCIe specification recommends BAR-level isolation for MSI-X 
> >> structures, it is not mandated. It is enough to have sufficient 
> >> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs 
> >> altogether may leave registers unintentionally mapped via unprotected 
> >> IPA when they should have been mapped via protected IPA.
> >> 
> >> Instead of skipping the whole BAR, would it make sense to determine
> >> where the MSI-X related regions reside, and skip validation only from 
> >> these regions?
> >
> > I re-reviewed my suggestion, and what I proposed here seems wrong. 
> > However, I think there is a different more generic problem related to 
> > the MSI-X table, PBA and non-TEE ranges.
> >
> > If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or 
> > non-TEE areas), the TDISP interface report may contain multiple ranges 
> > with the same range_id (/BAR id). In case a BAR contains some registers 
> > in low addresses, the MSI-X table and other registers after the MSI-X 
> > table, the interface report is expected to have two ranges for the same 
> > BAR with different "first 4k page" and "size" fields.
> >
> > This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING 
> > requires both the ipa_base and pa_base which should correspond to the 
> > same location. In above scenario, the PA of the first range would 
> > correspond to the BAR base whereas the second range would correspond to 
> > a location residing after the MSI-X table.
> >
> > Assuming that the report contains obfuscated (but linear) physical 
> > addresses, it would be possible to create heuristics for this case. 
> > However, the fundamental problem is that none of the "first 4k page" 
> > fields in the ranges is guaranteed to correspond to the base of any BAR: 
> > Consider a case where the MSI-X table is in the beginning of a BAR and 
> > it is followed by a single TEE range. If the MSI-X is not locked, the 
> > "first 4k page" field will not correspond to the beginning of the BAR. 
> > If the realm naiviely reads the ipa_base using pci_resouce_n() and 
> > corresponding pa_base from the interface report, the addresses won't 
> > match and the validation will fail.
> >
> > It seems that interpreting the interface report cannot be done without 
> > knowledge of the device's register layout. Therefore, I don't think the 
> > ranges can be validated/remapped automatically without involving the 
> > device driver, but there should be APIs for reading the interface 
> > report, and for requesting making specific ranges protected.
> >
> 
> But we need to validate the interface report before accepting the device,
> and the device driver is only loaded after the device has been accepted.
> 
> Can we assume that only the MSI-X table and PBA ranges may be missing
> from the interface report, while all other non-secure regions are
> reported as NON-TEE ranges?
> 
> If so, we could retrieve the MSI-X guest real address details from
> config space and map the beginning of the BAR correctly.
> 
> Dan / Yilun — how is this handled in Intel TDX?
> 
> From what I can see, the AMD patches appear to encounter the same issue.

Same issue exists for TDX. In the near term this solidifies that the
PCI/TSM core should not be assumining anything with respect to marking
MMIO ranges as private, and leave that all the to low-level TSM driver.

...but then yes I expect we need to build some common infrastructure for
special casing MSIX.
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Jason Gunthorpe 3 weeks, 3 days ago
On Thu, Sep 11, 2025 at 11:03:50AM +0530, Aneesh Kumar K.V wrote:

> But we need to validate the interface report before accepting the device,
> and the device driver is only loaded after the device has been accepted.

+1

This must work from the generic OS code.

So I'd say add a new TSM op:
 int validate_pci_bar_range(struct pci_dev *pdev,
                            unsigned int bar_index, u64 tdisp_pa,
			    u64 size,phys_addr_t *bar_offset_out);

TSM has broadly two options to compute bar_offset_out:

1) Require the TDISP MMIO Offset is aligned to the BAR size and use
   something like:

    *bar_offset_out = (tdisp_pa) % pci_resource_len(pdev, bar_index);
    ipa = pci_resource_start(pdev, bar_index) + *bar_offset_out;
    if (size + *bar_offset_out > pci_resource_len(pdev, bar_index))
        return -EINVAL;
    tsm_call_to_validate(pdev, ipa, pa, size)

2) Require the TSM to convert the offest'd PA to the IPA:

    tsm_call_to_convert(pdev, pa, size, &ipa);

    if (ipa < pci_resource_start(pdev, bar_index) ||
        ipa >= pci_resource_end(pdev, bar_index) ||
        (ipa + size) > pci_resource_end(pdev, bar_index))
	return -EINVAL;

    *bar_offset_out = ipa -  pci_resource_start(pdev, bar_index);

Then the generic code builds a map of what parts of the BAR are secure
and what are not.

If it can't do either the TSM is unusable by Linux.

Jason
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Alexey Kardashevskiy 3 weeks, 3 days ago

On 11/9/25 15:33, Aneesh Kumar K.V wrote:
> Arto Merilainen <amerilainen@nvidia.com> writes:
> 
>> On 31.7.2025 14.39, Arto Merilainen wrote:
>>> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
>>>
>>>> +    for (int i = 0; i < interface_report->mmio_range_count; i++,
>>>> mmio_range++) {
>>>> +
>>>> +        /*FIXME!! units in 4K size*/
>>>> +        range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID,
>>>> mmio_range->range_attributes);
>>>> +
>>>> +        /* no secure interrupts */
>>>> +        if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
>>>> +            pr_info("Skipping misx table\n");
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
>>>> +            pr_info("Skipping misx pba\n");
>>>> +            continue;
>>>> +        }
>>>> +
>>>
>>>
>>> MSI-X and PBA can be placed to a BAR that has other registers as well.
>>> While the PCIe specification recommends BAR-level isolation for MSI-X
>>> structures, it is not mandated. It is enough to have sufficient
>>> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs
>>> altogether may leave registers unintentionally mapped via unprotected
>>> IPA when they should have been mapped via protected IPA.
>>>
>>> Instead of skipping the whole BAR, would it make sense to determine
>>> where the MSI-X related regions reside, and skip validation only from
>>> these regions?
>>
>> I re-reviewed my suggestion, and what I proposed here seems wrong.
>> However, I think there is a different more generic problem related to
>> the MSI-X table, PBA and non-TEE ranges.
>>
>> If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or
>> non-TEE areas), the TDISP interface report may contain multiple ranges
>> with the same range_id (/BAR id). In case a BAR contains some registers
>> in low addresses, the MSI-X table and other registers after the MSI-X
>> table, the interface report is expected to have two ranges for the same
>> BAR with different "first 4k page" and "size" fields.
>>
>> This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING
>> requires both the ipa_base and pa_base which should correspond to the
>> same location. In above scenario, the PA of the first range would
>> correspond to the BAR base whereas the second range would correspond to
>> a location residing after the MSI-X table.
>>
>> Assuming that the report contains obfuscated (but linear) physical
>> addresses, it would be possible to create heuristics for this case.
>> However, the fundamental problem is that none of the "first 4k page"
>> fields in the ranges is guaranteed to correspond to the base of any BAR:
>> Consider a case where the MSI-X table is in the beginning of a BAR and
>> it is followed by a single TEE range. If the MSI-X is not locked, the
>> "first 4k page" field will not correspond to the beginning of the BAR.
>> If the realm naiviely reads the ipa_base using pci_resouce_n() and
>> corresponding pa_base from the interface report, the addresses won't
>> match and the validation will fail.
>>
>> It seems that interpreting the interface report cannot be done without
>> knowledge of the device's register layout. Therefore, I don't think the
>> ranges can be validated/remapped automatically without involving the
>> device driver, but there should be APIs for reading the interface
>> report, and for requesting making specific ranges protected.
>>
> 
> But we need to validate the interface report before accepting the device,
> and the device driver is only loaded after the device has been accepted.
> 
> Can we assume that only the MSI-X table and PBA ranges may be missing
> from the interface report, while all other non-secure regions are
> reported as NON-TEE ranges?

In case of the "some registers"+MSIX+"some more registers" BAR#X, I'd rather expect 3 ranges in the report and not just 2, for each subrange, sorted, with/without NonTEE bit set, as the PCIe spec suggests (sort of, I struggle parsing it):

===
When reporting the MMIO range for a TDI, the MMIO ranges must be reported in the logical order in which the TDI MMIO
range is configured such that the first range reported corresponds to first range of pages in the TDI and so on
===

And if the number of pages in the report differs from the BAR size, then we should fail validation. Otherwise it is impossible to tell from the report's MMIO addresses what part of a BAR needs to be validated (==TEE) and if the guest device driver has to know it - then reports are just useless (which is hardly true).


> If so, we could retrieve the MSI-X guest real address details from
> config space and map the beginning of the BAR correctly.
> 
> Dan / Yilun — how is this handled in Intel TDX?
> 
>  From what I can see, the AMD patches appear to encounter the same issue.

I am skipping MSIX BAR because T=1 is tied to C=1 so after such validation, the BAR belongs to the guest and the hw will reject VFIO-PCI (==HV) attempts to write to MSIX BAR. Probably too straight forward though and I can try assigning it to the host (and add Cbit in those PTEs), and see how the PSP handles it (not). Thanks,


> 
> -aneesh

-- 
Alexey

Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Jason Gunthorpe 3 weeks, 4 days ago
On Wed, Sep 10, 2025 at 08:47:43AM +0300, Arto Merilainen wrote:
> This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING requires
> both the ipa_base and pa_base which should correspond to the same location.
> In above scenario, the PA of the first range would correspond to the BAR
> base whereas the second range would correspond to a location residing after
> the MSI-X table.

This seems like a defect in the RSI_VDEV_VALIDATE_MAPPING - it should
be able to consume the same format of data that the tdisp report emits
to validate it.

From a kernel side we also should be careful that the driver isn't
tricked into mapping MMIO that is not secure when it should
be. Presumably all the default io access functions should demand
secure memory in T=1 mode, and special ones like the MSI-X code would
have some special version to accept either?

Jason
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Aneesh Kumar K.V 2 months ago
Arto Merilainen <amerilainen@nvidia.com> writes:

> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
>
>> +	for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
>> +
>> +		/*FIXME!! units in 4K size*/
>> +		range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
>> +
>> +		/* no secure interrupts */
>> +		if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
>> +			pr_info("Skipping misx table\n");
>> +			continue;
>> +		}
>> +
>> +		if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
>> +			pr_info("Skipping misx pba\n");
>> +			continue;
>> +		}
>> +
>
>
> MSI-X and PBA can be placed to a BAR that has other registers as well. 
> While the PCIe specification recommends BAR-level isolation for MSI-X 
> structures, it is not mandated. It is enough to have sufficient 
> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs 
> altogether may leave registers unintentionally mapped via unprotected 
> IPA when they should have been mapped via protected IPA.
>
> Instead of skipping the whole BAR, would it make sense to determine
> where the MSI-X related regions reside, and skip validation only from 
> these regions?
>

Yes, that was added because at one point the FVP model was including the
MSI-X table and PBA regions in the interface report.

If I understand correctly, we shouldn't expect to see those regions in
the report unless secure interrupts are supported. The BAR-based
skipping was added as a workaround to handle the FVP issue.

I believe we can drop that workaround now—if those regions are
incorrectly present, the below validation logic should catch and
reject them appropriately. Does that sound reasonable?

		/* No secure interrupts, we should not find this set, ignore for now. */
		if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) ||
		    FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) {
			pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n",
				 FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes),
				 FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes),
				 i, range_id, mmio_range->num_pages, r->start, r->end);
			continue;
		}


-aneesh
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Arto Merilainen 2 months ago
On 4.8.2025 9.37, Aneesh Kumar K.V wrote:
> Arto Merilainen <amerilainen@nvidia.com> writes:
> 
>> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
>>
>>> +    for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
>>> +
>>> +            /*FIXME!! units in 4K size*/
>>> +            range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
>>> +
>>> +            /* no secure interrupts */
>>> +            if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
>>> +                    pr_info("Skipping misx table\n");
>>> +                    continue;
>>> +            }
>>> +
>>> +            if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
>>> +                    pr_info("Skipping misx pba\n");
>>> +                    continue;
>>> +            }
>>> +
>>
>>
>> MSI-X and PBA can be placed to a BAR that has other registers as well.
>> While the PCIe specification recommends BAR-level isolation for MSI-X
>> structures, it is not mandated. It is enough to have sufficient
>> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs
>> altogether may leave registers unintentionally mapped via unprotected
>> IPA when they should have been mapped via protected IPA.
>>
>> Instead of skipping the whole BAR, would it make sense to determine
>> where the MSI-X related regions reside, and skip validation only from
>> these regions?
>>
> 
> Yes, that was added because at one point the FVP model was including the
> MSI-X table and PBA regions in the interface report.

The issue I am raising is a different one. The MSI-X table and PBA may 
reside in the middle of a BAR range, and the BAR range may contain 
registers which should be accessible only via protected IPA. In this 
case I would expect the pages before and after the MSI-X table (and PBA) 
to be validated while the pages related to MSI-X structures would be 
left unprotected.

Given that the MSI-X table or PBA regions shouldn't be present in the 
interface report, the code needs to first find out where the MSI-X 
structures reside, and use this information to determine which 
"sub-ranges" should be skipped over during validation.
> If I understand correctly, we shouldn't expect to see those regions in
> the report unless secure interrupts are supported. The BAR-based
> skipping was added as a workaround to handle the FVP issue.
> 

Correct. Assuming that RMM doesn't lock the MSI-X table, I'd expect 
these regions to be omitted in the interface report.

> I believe we can drop that workaround now—if those regions are
> incorrectly present, the below validation logic should catch and
> reject them appropriately. Does that sound reasonable?
> 
>                  /* No secure interrupts, we should not find this set, ignore for now. */
>                  if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) ||
>                      FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) {
>                          pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n",
>                                   FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes),
>                                   FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes),
>                                   i, range_id, mmio_range->num_pages, r->start, r->end);
>                          continue;
>                  }
> 
> 

First, I think this is skipping over the whole range => if there are 
registers outside of the MSI-X structures (but within the same BAR), 
they won't be validated.

Second, if the MSI-X regions are present in the interface report, 
wouldn't it - in the common case - mean that the device expects the 
ranges to be accessed with T=1? If this happens unexpectedly, it sounds 
that MSI-X wouldn't be usable. I wonder if the code should simply return 
an error instead if informing the user via pci_info()...

- R2
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Jason Gunthorpe 2 months ago
On Thu, Jul 31, 2025 at 02:39:09PM +0300, Arto Merilainen wrote:
> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote:
> 
> > +	for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
> > +
> > +		/*FIXME!! units in 4K size*/
> > +		range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
> > +
> > +		/* no secure interrupts */
> > +		if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
> > +			pr_info("Skipping misx table\n");
> > +			continue;
> > +		}
> > +
> > +		if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
> > +			pr_info("Skipping misx pba\n");
> > +			continue;
> > +		}
> > +
> 
> 
> MSI-X and PBA can be placed to a BAR that has other registers as well. While
> the PCIe specification recommends BAR-level isolation for MSI-X structures,
> it is not mandated.

Right, there are not enough BARs in most devices to give MSI its own
BAR.

>  It is enough to have sufficient isolation within the
> BAR. Therefore, skipping the MSI-X and PBA BARs altogether may leave
> registers unintentionally mapped via unprotected IPA when they
> should have been mapped via protected IPA.

Right, this sounds bad.

> Instead of skipping the whole BAR, would it make sense to determine
> where the MSI-X related regions reside, and skip validation only from these
> regions?

IMHO this is a mess. The virtualization must end up putting a shared
page(s) covering the MSI space in the middle of the MMIO region.

I think this should be done by fragmenting the layout in the IPA where
the private MMIO is within the protected IPA space with an unmapped
hole covering the MSIX registers. The acceptance process should
validate this.

The MSIX registers would then be located in the shared IPA space.

A normal driver mmaping it's BAR will then crash if it tries to access
the MSIX registers. This is good, we want to catch these non-secure
configurations and block them.

The MSI code will have to know to compute the shared IPA alias and use
that.

Jason
Re: [RFC PATCH v1 34/38] coco: guest: arm64: Validate mmio range found in the interface report
Posted by Jonathan Cameron 2 months, 1 week ago
On Mon, 28 Jul 2025 19:22:11 +0530
"Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote:

> This starts the sequence to transition the realm device to the TDISP RUN
> state by writing 1 to 'tsm/accept'.
> 
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Just some trivial stuff.


> diff --git a/drivers/virt/coco/arm-cca-guest/rsi-da.c b/drivers/virt/coco/arm-cca-guest/rsi-da.c
> index 47b379318e7c..936f844880de 100644
> --- a/drivers/virt/coco/arm-cca-guest/rsi-da.c
> +++ b/drivers/virt/coco/arm-cca-guest/rsi-da.c
> @@ -215,3 +215,135 @@ int rsi_device_lock(struct pci_dev *pdev)
>  
>  	return ret;
>  }

> +
> +int rsi_update_interface_report(struct pci_dev *pdev, bool validate)
> +{
> +	int ret;
> +	struct resource *r;
> +	int msix_tbl_bar, msix_pba_bar;
> +	unsigned int range_id;
> +	unsigned long mmio_start_phys;
> +	unsigned long mmio_flags = 0; /* non coherent, not limited order */
> +	struct pci_tdisp_mmio_range *mmio_range;
> +	struct pci_tdisp_device_interface_report *interface_report;
> +	struct cca_guest_dsc *dsm = to_cca_guest_dsc(pdev);
> +	int vdev_id = (pci_domain_nr(pdev->bus) << 16) |
> +		PCI_DEVID(pdev->bus->number, pdev->devfn);
> +
> +
Single line.
> +	interface_report = (struct pci_tdisp_device_interface_report *)dsm->interface_report;
> +	mmio_range = (struct pci_tdisp_mmio_range *)(interface_report + 1);
> +
> +
Single line

> +	msix_tbl_bar = get_msix_bar(pdev, PCI_MSIX_TABLE);
> +	msix_pba_bar = get_msix_bar(pdev, PCI_MSIX_PBA);
> +
> +	for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
> +
> +		/*FIXME!! units in 4K size*/
> +		range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
> +
> +		/* no secure interrupts */
> +		if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {

That first condition can get hiked out of the loop.

> +			pr_info("Skipping misx table\n");
> +			continue;
> +		}
> +
> +		if (msix_pba_bar != -1 && range_id == msix_pba_bar) {

Likewise.

> +			pr_info("Skipping misx pba\n");
> +			continue;
> +		}
> +
> +		r = pci_resource_n(pdev, range_id);
> +
> +		if (r->end == r->start ||
> +		    ((r->end - r->start + 1) & ~PAGE_MASK) || !mmio_range->num_pages) {
resource_size() 
> +			pci_warn(pdev, "Skipping broken range [%d] #%d %d pages, %llx..%llx\n",
> +				i, range_id, mmio_range->num_pages, r->start, r->end);
> +			continue;
> +		}
> +
> +		if (FIELD_GET(TSM_INTF_REPORT_MMIO_IS_NON_TEE, mmio_range->range_attributes)) {
> +			pci_info(pdev, "Skipping non-TEE range [%d] #%d %d pages, %llx..%llx\n",
> +				 i, range_id, mmio_range->num_pages, r->start, r->end);
> +			continue;
> +		}
> +
> +		/* No secure interrupts, we should not find this set, ignore for now. */
> +		if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) ||
> +		    FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) {
> +			pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n",
> +				 FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes),
> +				 FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes),
> +				 i, range_id, mmio_range->num_pages, r->start, r->end);
> +			continue;
> +		}
> +
> +		mmio_start_phys = mmio_range->first_page;
> +		if (validate)
> +			ret = rsi_validate_dev_mapping(vdev_id, dsm->instance_id,
> +						       r->start, r->end + 1,
> +						       mmio_start_phys << 12, mmio_flags);
> +		else
> +			ret = rsi_invalidate_dev_mapping(r->start, r->end + 1);
> +		if (ret) {
> +			pci_err(pdev, "failed to set protection attributes for the address range\n");
> +			return -EIO;
> +		}
> +	}
> +	return 0;
> +}