This starts the sequence to transition the realm device to the TDISP RUN
state by writing 1 to 'tsm/accept'.
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
arch/arm64/include/asm/rsi_cmds.h | 18 +++
arch/arm64/include/asm/rsi_smc.h | 4 +
drivers/virt/coco/arm-cca-guest/arm-cca.c | 3 +
drivers/virt/coco/arm-cca-guest/rsi-da.c | 132 ++++++++++++++++++++++
drivers/virt/coco/arm-cca-guest/rsi-da.h | 25 ++++
5 files changed, 182 insertions(+)
diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h
index 18fc4e1ce577..1cc00d404e53 100644
--- a/arch/arm64/include/asm/rsi_cmds.h
+++ b/arch/arm64/include/asm/rsi_cmds.h
@@ -228,4 +228,22 @@ static inline unsigned long rsi_host_call(phys_addr_t addr)
return res.a0;
}
+static inline unsigned long
+rsi_rdev_validate_mapping(unsigned long vdev_id, unsigned long inst_id,
+ phys_addr_t start_ipa, phys_addr_t end_ipa,
+ phys_addr_t io_pa, phys_addr_t *next_ipa, unsigned long flags)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_invoke(SMC_RSI_RDEV_VALIDATE_MAPPING, vdev_id,
+ inst_id, start_ipa, end_ipa, io_pa, flags, &res);
+ *next_ipa = res.a1;
+
+ if (res.a2 != RSI_ACCEPT)
+ return -EPERM;
+
+ if (res.a0 != RSI_SUCCESS)
+ return -EINVAL;
+ return 0;
+}
#endif /* __ASM_RSI_CMDS_H */
diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h
index 1d762fe3777b..a28b41cf01ca 100644
--- a/arch/arm64/include/asm/rsi_smc.h
+++ b/arch/arm64/include/asm/rsi_smc.h
@@ -204,4 +204,8 @@ struct rsi_host_call {
#define SMC_RSI_RDEV_LOCK SMC_RSI_FID(0x1a9)
+#define RSI_DEV_MEM_COHERENT BIT(0)
+#define RSI_DEV_MEM_LIMITED_ORDER BIT(1)
+#define SMC_RSI_RDEV_VALIDATE_MAPPING SMC_RSI_FID(0x1ac)
+
#endif /* __ASM_RSI_SMC_H_ */
diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca.c b/drivers/virt/coco/arm-cca-guest/arm-cca.c
index de70fba09e92..c1cefb983ac7 100644
--- a/drivers/virt/coco/arm-cca-guest/arm-cca.c
+++ b/drivers/virt/coco/arm-cca-guest/arm-cca.c
@@ -247,6 +247,9 @@ static void cca_tsm_unlock(struct pci_dev *pdev)
int vdev_id = (pci_domain_nr(pdev->bus) << 16) |
PCI_DEVID(pdev->bus->number, pdev->devfn);
+ /* invalidate dev mapping based on interface report */
+ rsi_update_interface_report(pdev, false);
+
ret = rhi_da_vdev_set_tdi_state(vdev_id, RHI_DA_TDI_CONFIG_UNLOCKED);
if (ret) {
pci_err(pdev, "failed to TSM unbind the device (%ld)\n", ret);
diff --git a/drivers/virt/coco/arm-cca-guest/rsi-da.c b/drivers/virt/coco/arm-cca-guest/rsi-da.c
index 47b379318e7c..936f844880de 100644
--- a/drivers/virt/coco/arm-cca-guest/rsi-da.c
+++ b/drivers/virt/coco/arm-cca-guest/rsi-da.c
@@ -215,3 +215,135 @@ int rsi_device_lock(struct pci_dev *pdev)
return ret;
}
+
+static inline unsigned long
+rsi_validate_dev_mapping(unsigned long vdev_id, unsigned long inst_id,
+ phys_addr_t start_ipa, phys_addr_t end_ipa,
+ phys_addr_t io_pa, unsigned long flags)
+{
+ unsigned long ret;
+ phys_addr_t next_ipa;
+
+ while (start_ipa < end_ipa) {
+ ret = rsi_rdev_validate_mapping(vdev_id, inst_id,
+ start_ipa, end_ipa,
+ io_pa, &next_ipa, flags);
+ if (ret || next_ipa < start_ipa || next_ipa > end_ipa)
+ return -EINVAL;
+ io_pa += next_ipa - start_ipa;
+ start_ipa = next_ipa;
+ }
+ return 0;
+}
+
+static int rsi_invalidate_dev_mapping(phys_addr_t start_ipa, phys_addr_t end_ipa)
+{
+ return rsi_set_memory_range(start_ipa, end_ipa, RSI_RIPAS_EMPTY,
+ RSI_CHANGE_DESTROYED);
+}
+
+static int get_msix_bar(struct pci_dev *pdev, int cap)
+{
+ int bar;
+ u32 table_offset;
+ unsigned long flags;
+
+ pci_read_config_dword(pdev, pdev->msix_cap + cap, &table_offset);
+ bar = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
+ flags = pci_resource_flags(pdev, bar);
+ if (!flags || (flags & IORESOURCE_UNSET))
+ return -1;
+
+ return bar;
+}
+
+int rsi_update_interface_report(struct pci_dev *pdev, bool validate)
+{
+ int ret;
+ struct resource *r;
+ int msix_tbl_bar, msix_pba_bar;
+ unsigned int range_id;
+ unsigned long mmio_start_phys;
+ unsigned long mmio_flags = 0; /* non coherent, not limited order */
+ struct pci_tdisp_mmio_range *mmio_range;
+ struct pci_tdisp_device_interface_report *interface_report;
+ struct cca_guest_dsc *dsm = to_cca_guest_dsc(pdev);
+ int vdev_id = (pci_domain_nr(pdev->bus) << 16) |
+ PCI_DEVID(pdev->bus->number, pdev->devfn);
+
+
+ interface_report = (struct pci_tdisp_device_interface_report *)dsm->interface_report;
+ mmio_range = (struct pci_tdisp_mmio_range *)(interface_report + 1);
+
+
+ msix_tbl_bar = get_msix_bar(pdev, PCI_MSIX_TABLE);
+ msix_pba_bar = get_msix_bar(pdev, PCI_MSIX_PBA);
+
+ for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) {
+
+ /*FIXME!! units in 4K size*/
+ range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes);
+
+ /* no secure interrupts */
+ if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) {
+ pr_info("Skipping misx table\n");
+ continue;
+ }
+
+ if (msix_pba_bar != -1 && range_id == msix_pba_bar) {
+ pr_info("Skipping misx pba\n");
+ continue;
+ }
+
+ r = pci_resource_n(pdev, range_id);
+
+ if (r->end == r->start ||
+ ((r->end - r->start + 1) & ~PAGE_MASK) || !mmio_range->num_pages) {
+ pci_warn(pdev, "Skipping broken range [%d] #%d %d pages, %llx..%llx\n",
+ i, range_id, mmio_range->num_pages, r->start, r->end);
+ continue;
+ }
+
+ if (FIELD_GET(TSM_INTF_REPORT_MMIO_IS_NON_TEE, mmio_range->range_attributes)) {
+ pci_info(pdev, "Skipping non-TEE range [%d] #%d %d pages, %llx..%llx\n",
+ i, range_id, mmio_range->num_pages, r->start, r->end);
+ continue;
+ }
+
+ /* No secure interrupts, we should not find this set, ignore for now. */
+ if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) ||
+ FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) {
+ pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n",
+ FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes),
+ FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes),
+ i, range_id, mmio_range->num_pages, r->start, r->end);
+ continue;
+ }
+
+ mmio_start_phys = mmio_range->first_page;
+ if (validate)
+ ret = rsi_validate_dev_mapping(vdev_id, dsm->instance_id,
+ r->start, r->end + 1,
+ mmio_start_phys << 12, mmio_flags);
+ else
+ ret = rsi_invalidate_dev_mapping(r->start, r->end + 1);
+ if (ret) {
+ pci_err(pdev, "failed to set protection attributes for the address range\n");
+ return -EIO;
+ }
+ }
+ return 0;
+}
+
+int rsi_device_start(struct pci_dev *pdev)
+{
+ int ret;
+
+ ret = rsi_update_interface_report(pdev, true);
+ if (ret) {
+ pci_err(pdev, "failed validate the interface report\n");
+ return -EIO;
+ }
+
+ return 0;
+}
diff --git a/drivers/virt/coco/arm-cca-guest/rsi-da.h b/drivers/virt/coco/arm-cca-guest/rsi-da.h
index bd565785ff4b..0d6e1c0ada4a 100644
--- a/drivers/virt/coco/arm-cca-guest/rsi-da.h
+++ b/drivers/virt/coco/arm-cca-guest/rsi-da.h
@@ -11,6 +11,28 @@
#include <asm/rsi_smc.h>
#include <asm/rhi.h>
+struct pci_tdisp_device_interface_report {
+ u16 interface_info;
+ u16 reserved;
+ u16 msi_x_message_control;
+ u16 lnr_control;
+ u32 tph_control;
+ u32 mmio_range_count;
+} __packed;
+
+struct pci_tdisp_mmio_range {
+ u64 first_page;
+ u32 num_pages;
+ u32 range_attributes;
+} __packed;
+
+#define TSM_INTF_REPORT_MMIO_MSIX_TABLE BIT(0)
+#define TSM_INTF_REPORT_MMIO_PBA BIT(1)
+#define TSM_INTF_REPORT_MMIO_IS_NON_TEE BIT(2)
+#define TSM_INTF_REPORT_MMIO_IS_UPDATABLE BIT(3)
+#define TSM_INTF_REPORT_MMIO_RESERVED GENMASK(15, 4)
+#define TSM_INTF_REPORT_MMIO_RANGE_ID GENMASK(31, 16)
+
struct cca_guest_dsc {
struct pci_tsm_pf0 pci;
unsigned long instance_id;
@@ -29,5 +51,8 @@ static inline struct cca_guest_dsc *to_cca_guest_dsc(struct pci_dev *pdev)
return container_of(tsm, struct cca_guest_dsc, pci.tsm);
}
+int rsi_update_interface_report(struct pci_dev *pdev, bool validate);
int rsi_device_lock(struct pci_dev *pdev);
+int rsi_device_start(struct pci_dev *pdev);
+
#endif
--
2.43.0
On Mon, Jul 28, 2025 at 07:22:11PM +0530, Aneesh Kumar K.V (Arm) wrote: > This starts the sequence to transition the realm device to the TDISP RUN > state by writing 1 to 'tsm/accept'. > +++ b/drivers/virt/coco/arm-cca-guest/rsi-da.c > +int rsi_update_interface_report(struct pci_dev *pdev, bool validate) > +{ > + int ret; > + struct resource *r; > + int msix_tbl_bar, msix_pba_bar; > + unsigned int range_id; > + unsigned long mmio_start_phys; > + unsigned long mmio_flags = 0; /* non coherent, not limited order */ > + struct pci_tdisp_mmio_range *mmio_range; > + struct pci_tdisp_device_interface_report *interface_report; > + struct cca_guest_dsc *dsm = to_cca_guest_dsc(pdev); > + int vdev_id = (pci_domain_nr(pdev->bus) << 16) | > + PCI_DEVID(pdev->bus->number, pdev->devfn); > + > + > + interface_report = (struct pci_tdisp_device_interface_report *)dsm->interface_report; > + mmio_range = (struct pci_tdisp_mmio_range *)(interface_report + 1); > + > + > + msix_tbl_bar = get_msix_bar(pdev, PCI_MSIX_TABLE); > + msix_pba_bar = get_msix_bar(pdev, PCI_MSIX_PBA); > + > + for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) { > + > + /*FIXME!! units in 4K size*/ I guess you intend to fix this? > + /* no secure interrupts */ > + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { > + pr_info("Skipping misx table\n"); > + continue; > + } > + > + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { > + pr_info("Skipping misx pba\n"); s/misx/MSI-X/ (twice) s/pba/PBA/
On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: > + for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) { > + > + /*FIXME!! units in 4K size*/ > + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes); > + > + /* no secure interrupts */ > + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { > + pr_info("Skipping misx table\n"); > + continue; > + } > + > + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { > + pr_info("Skipping misx pba\n"); > + continue; > + } > + MSI-X and PBA can be placed to a BAR that has other registers as well. While the PCIe specification recommends BAR-level isolation for MSI-X structures, it is not mandated. It is enough to have sufficient isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs altogether may leave registers unintentionally mapped via unprotected IPA when they should have been mapped via protected IPA. Instead of skipping the whole BAR, would it make sense to determine where the MSI-X related regions reside, and skip validation only from these regions? - R2
On 31.7.2025 14.39, Arto Merilainen wrote: > On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: > >> + for (int i = 0; i < interface_report->mmio_range_count; i++, >> mmio_range++) { >> + >> + /*FIXME!! units in 4K size*/ >> + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, >> mmio_range->range_attributes); >> + >> + /* no secure interrupts */ >> + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { >> + pr_info("Skipping misx table\n"); >> + continue; >> + } >> + >> + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { >> + pr_info("Skipping misx pba\n"); >> + continue; >> + } >> + > > > MSI-X and PBA can be placed to a BAR that has other registers as well. > While the PCIe specification recommends BAR-level isolation for MSI-X > structures, it is not mandated. It is enough to have sufficient > isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs > altogether may leave registers unintentionally mapped via unprotected > IPA when they should have been mapped via protected IPA. > > Instead of skipping the whole BAR, would it make sense to determine > where the MSI-X related regions reside, and skip validation only from > these regions? I re-reviewed my suggestion, and what I proposed here seems wrong. However, I think there is a different more generic problem related to the MSI-X table, PBA and non-TEE ranges. If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or non-TEE areas), the TDISP interface report may contain multiple ranges with the same range_id (/BAR id). In case a BAR contains some registers in low addresses, the MSI-X table and other registers after the MSI-X table, the interface report is expected to have two ranges for the same BAR with different "first 4k page" and "size" fields. This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING requires both the ipa_base and pa_base which should correspond to the same location. In above scenario, the PA of the first range would correspond to the BAR base whereas the second range would correspond to a location residing after the MSI-X table. Assuming that the report contains obfuscated (but linear) physical addresses, it would be possible to create heuristics for this case. However, the fundamental problem is that none of the "first 4k page" fields in the ranges is guaranteed to correspond to the base of any BAR: Consider a case where the MSI-X table is in the beginning of a BAR and it is followed by a single TEE range. If the MSI-X is not locked, the "first 4k page" field will not correspond to the beginning of the BAR. If the realm naiviely reads the ipa_base using pci_resouce_n() and corresponding pa_base from the interface report, the addresses won't match and the validation will fail. It seems that interpreting the interface report cannot be done without knowledge of the device's register layout. Therefore, I don't think the ranges can be validated/remapped automatically without involving the device driver, but there should be APIs for reading the interface report, and for requesting making specific ranges protected. - R2
Arto Merilainen <amerilainen@nvidia.com> writes: > On 31.7.2025 14.39, Arto Merilainen wrote: >> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: >> >>> + for (int i = 0; i < interface_report->mmio_range_count; i++, >>> mmio_range++) { >>> + >>> + /*FIXME!! units in 4K size*/ >>> + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, >>> mmio_range->range_attributes); >>> + >>> + /* no secure interrupts */ >>> + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { >>> + pr_info("Skipping misx table\n"); >>> + continue; >>> + } >>> + >>> + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { >>> + pr_info("Skipping misx pba\n"); >>> + continue; >>> + } >>> + >> >> >> MSI-X and PBA can be placed to a BAR that has other registers as well. >> While the PCIe specification recommends BAR-level isolation for MSI-X >> structures, it is not mandated. It is enough to have sufficient >> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs >> altogether may leave registers unintentionally mapped via unprotected >> IPA when they should have been mapped via protected IPA. >> >> Instead of skipping the whole BAR, would it make sense to determine >> where the MSI-X related regions reside, and skip validation only from >> these regions? > > I re-reviewed my suggestion, and what I proposed here seems wrong. > However, I think there is a different more generic problem related to > the MSI-X table, PBA and non-TEE ranges. > > If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or > non-TEE areas), the TDISP interface report may contain multiple ranges > with the same range_id (/BAR id). In case a BAR contains some registers > in low addresses, the MSI-X table and other registers after the MSI-X > table, the interface report is expected to have two ranges for the same > BAR with different "first 4k page" and "size" fields. > > This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING > requires both the ipa_base and pa_base which should correspond to the > same location. In above scenario, the PA of the first range would > correspond to the BAR base whereas the second range would correspond to > a location residing after the MSI-X table. > > Assuming that the report contains obfuscated (but linear) physical > addresses, it would be possible to create heuristics for this case. > However, the fundamental problem is that none of the "first 4k page" > fields in the ranges is guaranteed to correspond to the base of any BAR: > Consider a case where the MSI-X table is in the beginning of a BAR and > it is followed by a single TEE range. If the MSI-X is not locked, the > "first 4k page" field will not correspond to the beginning of the BAR. > If the realm naiviely reads the ipa_base using pci_resouce_n() and > corresponding pa_base from the interface report, the addresses won't > match and the validation will fail. > > It seems that interpreting the interface report cannot be done without > knowledge of the device's register layout. Therefore, I don't think the > ranges can be validated/remapped automatically without involving the > device driver, but there should be APIs for reading the interface > report, and for requesting making specific ranges protected. > But we need to validate the interface report before accepting the device, and the device driver is only loaded after the device has been accepted. Can we assume that only the MSI-X table and PBA ranges may be missing from the interface report, while all other non-secure regions are reported as NON-TEE ranges? If so, we could retrieve the MSI-X guest real address details from config space and map the beginning of the BAR correctly. Dan / Yilun — how is this handled in Intel TDX? From what I can see, the AMD patches appear to encounter the same issue. -aneesh
Aneesh Kumar K.V wrote: > Arto Merilainen <amerilainen@nvidia.com> writes: > > > On 31.7.2025 14.39, Arto Merilainen wrote: > >> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: > >> > >>> + for (int i = 0; i < interface_report->mmio_range_count; i++, > >>> mmio_range++) { > >>> + > >>> + /*FIXME!! units in 4K size*/ > >>> + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, > >>> mmio_range->range_attributes); > >>> + > >>> + /* no secure interrupts */ > >>> + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { > >>> + pr_info("Skipping misx table\n"); > >>> + continue; > >>> + } > >>> + > >>> + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { > >>> + pr_info("Skipping misx pba\n"); > >>> + continue; > >>> + } > >>> + > >> > >> > >> MSI-X and PBA can be placed to a BAR that has other registers as well. > >> While the PCIe specification recommends BAR-level isolation for MSI-X > >> structures, it is not mandated. It is enough to have sufficient > >> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs > >> altogether may leave registers unintentionally mapped via unprotected > >> IPA when they should have been mapped via protected IPA. > >> > >> Instead of skipping the whole BAR, would it make sense to determine > >> where the MSI-X related regions reside, and skip validation only from > >> these regions? > > > > I re-reviewed my suggestion, and what I proposed here seems wrong. > > However, I think there is a different more generic problem related to > > the MSI-X table, PBA and non-TEE ranges. > > > > If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or > > non-TEE areas), the TDISP interface report may contain multiple ranges > > with the same range_id (/BAR id). In case a BAR contains some registers > > in low addresses, the MSI-X table and other registers after the MSI-X > > table, the interface report is expected to have two ranges for the same > > BAR with different "first 4k page" and "size" fields. > > > > This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING > > requires both the ipa_base and pa_base which should correspond to the > > same location. In above scenario, the PA of the first range would > > correspond to the BAR base whereas the second range would correspond to > > a location residing after the MSI-X table. > > > > Assuming that the report contains obfuscated (but linear) physical > > addresses, it would be possible to create heuristics for this case. > > However, the fundamental problem is that none of the "first 4k page" > > fields in the ranges is guaranteed to correspond to the base of any BAR: > > Consider a case where the MSI-X table is in the beginning of a BAR and > > it is followed by a single TEE range. If the MSI-X is not locked, the > > "first 4k page" field will not correspond to the beginning of the BAR. > > If the realm naiviely reads the ipa_base using pci_resouce_n() and > > corresponding pa_base from the interface report, the addresses won't > > match and the validation will fail. > > > > It seems that interpreting the interface report cannot be done without > > knowledge of the device's register layout. Therefore, I don't think the > > ranges can be validated/remapped automatically without involving the > > device driver, but there should be APIs for reading the interface > > report, and for requesting making specific ranges protected. > > > > But we need to validate the interface report before accepting the device, > and the device driver is only loaded after the device has been accepted. > > Can we assume that only the MSI-X table and PBA ranges may be missing > from the interface report, while all other non-secure regions are > reported as NON-TEE ranges? > > If so, we could retrieve the MSI-X guest real address details from > config space and map the beginning of the BAR correctly. > > Dan / Yilun — how is this handled in Intel TDX? > > From what I can see, the AMD patches appear to encounter the same issue. Same issue exists for TDX. In the near term this solidifies that the PCI/TSM core should not be assumining anything with respect to marking MMIO ranges as private, and leave that all the to low-level TSM driver. ...but then yes I expect we need to build some common infrastructure for special casing MSIX.
On Thu, Sep 11, 2025 at 11:03:50AM +0530, Aneesh Kumar K.V wrote: > But we need to validate the interface report before accepting the device, > and the device driver is only loaded after the device has been accepted. +1 This must work from the generic OS code. So I'd say add a new TSM op: int validate_pci_bar_range(struct pci_dev *pdev, unsigned int bar_index, u64 tdisp_pa, u64 size,phys_addr_t *bar_offset_out); TSM has broadly two options to compute bar_offset_out: 1) Require the TDISP MMIO Offset is aligned to the BAR size and use something like: *bar_offset_out = (tdisp_pa) % pci_resource_len(pdev, bar_index); ipa = pci_resource_start(pdev, bar_index) + *bar_offset_out; if (size + *bar_offset_out > pci_resource_len(pdev, bar_index)) return -EINVAL; tsm_call_to_validate(pdev, ipa, pa, size) 2) Require the TSM to convert the offest'd PA to the IPA: tsm_call_to_convert(pdev, pa, size, &ipa); if (ipa < pci_resource_start(pdev, bar_index) || ipa >= pci_resource_end(pdev, bar_index) || (ipa + size) > pci_resource_end(pdev, bar_index)) return -EINVAL; *bar_offset_out = ipa - pci_resource_start(pdev, bar_index); Then the generic code builds a map of what parts of the BAR are secure and what are not. If it can't do either the TSM is unusable by Linux. Jason
On 11/9/25 15:33, Aneesh Kumar K.V wrote: > Arto Merilainen <amerilainen@nvidia.com> writes: > >> On 31.7.2025 14.39, Arto Merilainen wrote: >>> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: >>> >>>> + for (int i = 0; i < interface_report->mmio_range_count; i++, >>>> mmio_range++) { >>>> + >>>> + /*FIXME!! units in 4K size*/ >>>> + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, >>>> mmio_range->range_attributes); >>>> + >>>> + /* no secure interrupts */ >>>> + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { >>>> + pr_info("Skipping misx table\n"); >>>> + continue; >>>> + } >>>> + >>>> + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { >>>> + pr_info("Skipping misx pba\n"); >>>> + continue; >>>> + } >>>> + >>> >>> >>> MSI-X and PBA can be placed to a BAR that has other registers as well. >>> While the PCIe specification recommends BAR-level isolation for MSI-X >>> structures, it is not mandated. It is enough to have sufficient >>> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs >>> altogether may leave registers unintentionally mapped via unprotected >>> IPA when they should have been mapped via protected IPA. >>> >>> Instead of skipping the whole BAR, would it make sense to determine >>> where the MSI-X related regions reside, and skip validation only from >>> these regions? >> >> I re-reviewed my suggestion, and what I proposed here seems wrong. >> However, I think there is a different more generic problem related to >> the MSI-X table, PBA and non-TEE ranges. >> >> If a BAR is sparse (e.g., it has TEE pages and the MSI-X table, PBA or >> non-TEE areas), the TDISP interface report may contain multiple ranges >> with the same range_id (/BAR id). In case a BAR contains some registers >> in low addresses, the MSI-X table and other registers after the MSI-X >> table, the interface report is expected to have two ranges for the same >> BAR with different "first 4k page" and "size" fields. >> >> This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING >> requires both the ipa_base and pa_base which should correspond to the >> same location. In above scenario, the PA of the first range would >> correspond to the BAR base whereas the second range would correspond to >> a location residing after the MSI-X table. >> >> Assuming that the report contains obfuscated (but linear) physical >> addresses, it would be possible to create heuristics for this case. >> However, the fundamental problem is that none of the "first 4k page" >> fields in the ranges is guaranteed to correspond to the base of any BAR: >> Consider a case where the MSI-X table is in the beginning of a BAR and >> it is followed by a single TEE range. If the MSI-X is not locked, the >> "first 4k page" field will not correspond to the beginning of the BAR. >> If the realm naiviely reads the ipa_base using pci_resouce_n() and >> corresponding pa_base from the interface report, the addresses won't >> match and the validation will fail. >> >> It seems that interpreting the interface report cannot be done without >> knowledge of the device's register layout. Therefore, I don't think the >> ranges can be validated/remapped automatically without involving the >> device driver, but there should be APIs for reading the interface >> report, and for requesting making specific ranges protected. >> > > But we need to validate the interface report before accepting the device, > and the device driver is only loaded after the device has been accepted. > > Can we assume that only the MSI-X table and PBA ranges may be missing > from the interface report, while all other non-secure regions are > reported as NON-TEE ranges? In case of the "some registers"+MSIX+"some more registers" BAR#X, I'd rather expect 3 ranges in the report and not just 2, for each subrange, sorted, with/without NonTEE bit set, as the PCIe spec suggests (sort of, I struggle parsing it): === When reporting the MMIO range for a TDI, the MMIO ranges must be reported in the logical order in which the TDI MMIO range is configured such that the first range reported corresponds to first range of pages in the TDI and so on === And if the number of pages in the report differs from the BAR size, then we should fail validation. Otherwise it is impossible to tell from the report's MMIO addresses what part of a BAR needs to be validated (==TEE) and if the guest device driver has to know it - then reports are just useless (which is hardly true). > If so, we could retrieve the MSI-X guest real address details from > config space and map the beginning of the BAR correctly. > > Dan / Yilun — how is this handled in Intel TDX? > > From what I can see, the AMD patches appear to encounter the same issue. I am skipping MSIX BAR because T=1 is tied to C=1 so after such validation, the BAR belongs to the guest and the hw will reject VFIO-PCI (==HV) attempts to write to MSIX BAR. Probably too straight forward though and I can try assigning it to the host (and add Cbit in those PTEs), and see how the PSP handles it (not). Thanks, > > -aneesh -- Alexey
On Wed, Sep 10, 2025 at 08:47:43AM +0300, Arto Merilainen wrote: > This creates a tricky problem given that RSI_VDEV_VALIDATE_MAPPING requires > both the ipa_base and pa_base which should correspond to the same location. > In above scenario, the PA of the first range would correspond to the BAR > base whereas the second range would correspond to a location residing after > the MSI-X table. This seems like a defect in the RSI_VDEV_VALIDATE_MAPPING - it should be able to consume the same format of data that the tdisp report emits to validate it. From a kernel side we also should be careful that the driver isn't tricked into mapping MMIO that is not secure when it should be. Presumably all the default io access functions should demand secure memory in T=1 mode, and special ones like the MSI-X code would have some special version to accept either? Jason
Arto Merilainen <amerilainen@nvidia.com> writes: > On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: > >> + for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) { >> + >> + /*FIXME!! units in 4K size*/ >> + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes); >> + >> + /* no secure interrupts */ >> + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { >> + pr_info("Skipping misx table\n"); >> + continue; >> + } >> + >> + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { >> + pr_info("Skipping misx pba\n"); >> + continue; >> + } >> + > > > MSI-X and PBA can be placed to a BAR that has other registers as well. > While the PCIe specification recommends BAR-level isolation for MSI-X > structures, it is not mandated. It is enough to have sufficient > isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs > altogether may leave registers unintentionally mapped via unprotected > IPA when they should have been mapped via protected IPA. > > Instead of skipping the whole BAR, would it make sense to determine > where the MSI-X related regions reside, and skip validation only from > these regions? > Yes, that was added because at one point the FVP model was including the MSI-X table and PBA regions in the interface report. If I understand correctly, we shouldn't expect to see those regions in the report unless secure interrupts are supported. The BAR-based skipping was added as a workaround to handle the FVP issue. I believe we can drop that workaround now—if those regions are incorrectly present, the below validation logic should catch and reject them appropriately. Does that sound reasonable? /* No secure interrupts, we should not find this set, ignore for now. */ if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) || FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) { pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n", FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes), FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes), i, range_id, mmio_range->num_pages, r->start, r->end); continue; } -aneesh
On 4.8.2025 9.37, Aneesh Kumar K.V wrote: > Arto Merilainen <amerilainen@nvidia.com> writes: > >> On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: >> >>> + for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) { >>> + >>> + /*FIXME!! units in 4K size*/ >>> + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes); >>> + >>> + /* no secure interrupts */ >>> + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { >>> + pr_info("Skipping misx table\n"); >>> + continue; >>> + } >>> + >>> + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { >>> + pr_info("Skipping misx pba\n"); >>> + continue; >>> + } >>> + >> >> >> MSI-X and PBA can be placed to a BAR that has other registers as well. >> While the PCIe specification recommends BAR-level isolation for MSI-X >> structures, it is not mandated. It is enough to have sufficient >> isolation within the BAR. Therefore, skipping the MSI-X and PBA BARs >> altogether may leave registers unintentionally mapped via unprotected >> IPA when they should have been mapped via protected IPA. >> >> Instead of skipping the whole BAR, would it make sense to determine >> where the MSI-X related regions reside, and skip validation only from >> these regions? >> > > Yes, that was added because at one point the FVP model was including the > MSI-X table and PBA regions in the interface report. The issue I am raising is a different one. The MSI-X table and PBA may reside in the middle of a BAR range, and the BAR range may contain registers which should be accessible only via protected IPA. In this case I would expect the pages before and after the MSI-X table (and PBA) to be validated while the pages related to MSI-X structures would be left unprotected. Given that the MSI-X table or PBA regions shouldn't be present in the interface report, the code needs to first find out where the MSI-X structures reside, and use this information to determine which "sub-ranges" should be skipped over during validation. > If I understand correctly, we shouldn't expect to see those regions in > the report unless secure interrupts are supported. The BAR-based > skipping was added as a workaround to handle the FVP issue. > Correct. Assuming that RMM doesn't lock the MSI-X table, I'd expect these regions to be omitted in the interface report. > I believe we can drop that workaround now—if those regions are > incorrectly present, the below validation logic should catch and > reject them appropriately. Does that sound reasonable? > > /* No secure interrupts, we should not find this set, ignore for now. */ > if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) || > FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) { > pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n", > FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes), > FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes), > i, range_id, mmio_range->num_pages, r->start, r->end); > continue; > } > > First, I think this is skipping over the whole range => if there are registers outside of the MSI-X structures (but within the same BAR), they won't be validated. Second, if the MSI-X regions are present in the interface report, wouldn't it - in the common case - mean that the device expects the ranges to be accessed with T=1? If this happens unexpectedly, it sounds that MSI-X wouldn't be usable. I wonder if the code should simply return an error instead if informing the user via pci_info()... - R2
On Thu, Jul 31, 2025 at 02:39:09PM +0300, Arto Merilainen wrote: > On 28.7.2025 16.52, Aneesh Kumar K.V (Arm) wrote: > > > + for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) { > > + > > + /*FIXME!! units in 4K size*/ > > + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes); > > + > > + /* no secure interrupts */ > > + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { > > + pr_info("Skipping misx table\n"); > > + continue; > > + } > > + > > + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { > > + pr_info("Skipping misx pba\n"); > > + continue; > > + } > > + > > > MSI-X and PBA can be placed to a BAR that has other registers as well. While > the PCIe specification recommends BAR-level isolation for MSI-X structures, > it is not mandated. Right, there are not enough BARs in most devices to give MSI its own BAR. > It is enough to have sufficient isolation within the > BAR. Therefore, skipping the MSI-X and PBA BARs altogether may leave > registers unintentionally mapped via unprotected IPA when they > should have been mapped via protected IPA. Right, this sounds bad. > Instead of skipping the whole BAR, would it make sense to determine > where the MSI-X related regions reside, and skip validation only from these > regions? IMHO this is a mess. The virtualization must end up putting a shared page(s) covering the MSI space in the middle of the MMIO region. I think this should be done by fragmenting the layout in the IPA where the private MMIO is within the protected IPA space with an unmapped hole covering the MSIX registers. The acceptance process should validate this. The MSIX registers would then be located in the shared IPA space. A normal driver mmaping it's BAR will then crash if it tries to access the MSIX registers. This is good, we want to catch these non-secure configurations and block them. The MSI code will have to know to compute the shared IPA alias and use that. Jason
On Mon, 28 Jul 2025 19:22:11 +0530 "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org> wrote: > This starts the sequence to transition the realm device to the TDISP RUN > state by writing 1 to 'tsm/accept'. > > Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org> Just some trivial stuff. > diff --git a/drivers/virt/coco/arm-cca-guest/rsi-da.c b/drivers/virt/coco/arm-cca-guest/rsi-da.c > index 47b379318e7c..936f844880de 100644 > --- a/drivers/virt/coco/arm-cca-guest/rsi-da.c > +++ b/drivers/virt/coco/arm-cca-guest/rsi-da.c > @@ -215,3 +215,135 @@ int rsi_device_lock(struct pci_dev *pdev) > > return ret; > } > + > +int rsi_update_interface_report(struct pci_dev *pdev, bool validate) > +{ > + int ret; > + struct resource *r; > + int msix_tbl_bar, msix_pba_bar; > + unsigned int range_id; > + unsigned long mmio_start_phys; > + unsigned long mmio_flags = 0; /* non coherent, not limited order */ > + struct pci_tdisp_mmio_range *mmio_range; > + struct pci_tdisp_device_interface_report *interface_report; > + struct cca_guest_dsc *dsm = to_cca_guest_dsc(pdev); > + int vdev_id = (pci_domain_nr(pdev->bus) << 16) | > + PCI_DEVID(pdev->bus->number, pdev->devfn); > + > + Single line. > + interface_report = (struct pci_tdisp_device_interface_report *)dsm->interface_report; > + mmio_range = (struct pci_tdisp_mmio_range *)(interface_report + 1); > + > + Single line > + msix_tbl_bar = get_msix_bar(pdev, PCI_MSIX_TABLE); > + msix_pba_bar = get_msix_bar(pdev, PCI_MSIX_PBA); > + > + for (int i = 0; i < interface_report->mmio_range_count; i++, mmio_range++) { > + > + /*FIXME!! units in 4K size*/ > + range_id = FIELD_GET(TSM_INTF_REPORT_MMIO_RANGE_ID, mmio_range->range_attributes); > + > + /* no secure interrupts */ > + if (msix_tbl_bar != -1 && range_id == msix_tbl_bar) { That first condition can get hiked out of the loop. > + pr_info("Skipping misx table\n"); > + continue; > + } > + > + if (msix_pba_bar != -1 && range_id == msix_pba_bar) { Likewise. > + pr_info("Skipping misx pba\n"); > + continue; > + } > + > + r = pci_resource_n(pdev, range_id); > + > + if (r->end == r->start || > + ((r->end - r->start + 1) & ~PAGE_MASK) || !mmio_range->num_pages) { resource_size() > + pci_warn(pdev, "Skipping broken range [%d] #%d %d pages, %llx..%llx\n", > + i, range_id, mmio_range->num_pages, r->start, r->end); > + continue; > + } > + > + if (FIELD_GET(TSM_INTF_REPORT_MMIO_IS_NON_TEE, mmio_range->range_attributes)) { > + pci_info(pdev, "Skipping non-TEE range [%d] #%d %d pages, %llx..%llx\n", > + i, range_id, mmio_range->num_pages, r->start, r->end); > + continue; > + } > + > + /* No secure interrupts, we should not find this set, ignore for now. */ > + if (FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes) || > + FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes)) { > + pci_info(pdev, "Skipping MSIX (%ld/%ld) range [%d] #%d %d pages, %llx..%llx\n", > + FIELD_GET(TSM_INTF_REPORT_MMIO_MSIX_TABLE, mmio_range->range_attributes), > + FIELD_GET(TSM_INTF_REPORT_MMIO_PBA, mmio_range->range_attributes), > + i, range_id, mmio_range->num_pages, r->start, r->end); > + continue; > + } > + > + mmio_start_phys = mmio_range->first_page; > + if (validate) > + ret = rsi_validate_dev_mapping(vdev_id, dsm->instance_id, > + r->start, r->end + 1, > + mmio_start_phys << 12, mmio_flags); > + else > + ret = rsi_invalidate_dev_mapping(r->start, r->end + 1); > + if (ret) { > + pci_err(pdev, "failed to set protection attributes for the address range\n"); > + return -EIO; > + } > + } > + return 0; > +}
© 2016 - 2025 Red Hat, Inc.