[PATCH hyperv-next v4 15/16] Drivers: hv: Support establishing the confidential VMBus connection

Posted by Roman Kisel 2 months, 3 weeks ago
To establish the confidential VMBus connection, the CoCo VM guest
first attempts to connect to the VMBus server run by the paravisor.
If that fails, the guest falls back to the non-confidential VMBus.

Implement that in the VMBus driver initialization.

Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
---
 drivers/hv/vmbus_drv.c | 189 ++++++++++++++++++++++++++++-------------
 1 file changed, 130 insertions(+), 59 deletions(-)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 13aca5abc7d8..53be3157e22c 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1057,12 +1057,9 @@ static void vmbus_onmessage_work(struct work_struct *work)
 	kfree(ctx);
 }
 
-void vmbus_on_msg_dpc(unsigned long data)
+static void __vmbus_on_msg_dpc(void *message_page_addr)
 {
-	struct hv_per_cpu_context *hv_cpu = (void *)data;
-	void *page_addr = hv_cpu->hyp_synic_message_page;
-	struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
-				  VMBUS_MESSAGE_SINT;
+	struct hv_message msg_copy, *msg;
 	struct vmbus_channel_message_header *hdr;
 	enum vmbus_channel_message_type msgtype;
 	const struct vmbus_channel_message_table_entry *entry;
@@ -1070,6 +1067,10 @@ void vmbus_on_msg_dpc(unsigned long data)
 	__u8 payload_size;
 	u32 message_type;
 
+	if (!message_page_addr)
+		return;
+	msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
+
 	/*
 	 * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
 	 * it is being used in 'struct vmbus_channel_message_header' definition
@@ -1195,6 +1196,14 @@ void vmbus_on_msg_dpc(unsigned long data)
 	vmbus_signal_eom(msg, message_type);
 }
 
+void vmbus_on_msg_dpc(unsigned long data)
+{
+	struct hv_per_cpu_context *hv_cpu = (void *)data;
+
+	__vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
+	__vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
+}
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
@@ -1233,21 +1242,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
 #endif /* CONFIG_PM_SLEEP */
 
 /*
- * Schedule all channels with events pending
+ * Schedule all channels with events pending.
+ * The event page can be directly checked to get the id of
+ * the channel that has the interrupt pending.
  */
-static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
+static void vmbus_chan_sched(void *event_page_addr)
 {
 	unsigned long *recv_int_page;
 	u32 maxbits, relid;
+	union hv_synic_event_flags *event;
 
-	/*
-	 * The event page can be directly checked to get the id of
-	 * the channel that has the interrupt pending.
-	 */
-	void *page_addr = hv_cpu->hyp_synic_event_page;
-	union hv_synic_event_flags *event
-		= (union hv_synic_event_flags *)page_addr +
-					 VMBUS_MESSAGE_SINT;
+	if (!event_page_addr)
+		return;
+	event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;
 
 	maxbits = HV_EVENT_FLAGS_COUNT;
 	recv_int_page = event->flags;
@@ -1255,6 +1262,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
 	if (unlikely(!recv_int_page))
 		return;
 
+	/*
+	 * Suggested-by: Michael Kelley <mhklinux@outlook.com>
+	 * One possible optimization would be to keep track of the largest relID that's in use,
+	 * and only scan up to that relID.
+	 */
 	for_each_set_bit(relid, recv_int_page, maxbits) {
 		void (*callback_fn)(void *context);
 		struct vmbus_channel *channel;
@@ -1318,26 +1330,35 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
 	}
 }
 
-static void vmbus_isr(void)
+static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
 {
-	struct hv_per_cpu_context *hv_cpu
-		= this_cpu_ptr(hv_context.cpu_context);
-	void *page_addr;
 	struct hv_message *msg;
 
-	vmbus_chan_sched(hv_cpu);
-
-	page_addr = hv_cpu->hyp_synic_message_page;
-	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
+	if (!message_page_addr)
+		return;
+	msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
 
 	/* Check if there are actual msgs to be processed */
 	if (msg->header.message_type != HVMSG_NONE) {
 		if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
 			hv_stimer0_isr();
 			vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
-		} else
+		} else {
 			tasklet_schedule(&hv_cpu->msg_dpc);
+		}
 	}
+}
+
+static void vmbus_isr(void)
+{
+	struct hv_per_cpu_context *hv_cpu
+		= this_cpu_ptr(hv_context.cpu_context);
+
+	vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
+	vmbus_chan_sched(hv_cpu->para_synic_event_page);
+
+	vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
+	vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
 
 	add_interrupt_randomness(vmbus_interrupt);
 }
@@ -1355,6 +1376,59 @@ static void vmbus_percpu_work(struct work_struct *work)
 	hv_synic_init(cpu);
 }
 
+static int vmbus_alloc_synic_and_connect(void)
+{
+	int ret, cpu;
+	struct work_struct __percpu *works;
+	int hyperv_cpuhp_online;
+
+	ret = hv_synic_alloc();
+	if (ret < 0)
+		goto err_alloc;
+
+	works = alloc_percpu(struct work_struct);
+	if (!works) {
+		ret = -ENOMEM;
+		goto err_alloc;
+	}
+
+	/*
+	 * Initialize the per-cpu interrupt state and stimer state.
+	 * Then connect to the host.
+	 */
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		INIT_WORK(work, vmbus_percpu_work);
+		schedule_work_on(cpu, work);
+	}
+
+	for_each_online_cpu(cpu)
+		flush_work(per_cpu_ptr(works, cpu));
+
+	/* Register the callbacks for possible CPU online/offline'ing */
+	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
+						   hv_synic_init, hv_synic_cleanup);
+	cpus_read_unlock();
+	free_percpu(works);
+	if (ret < 0)
+		goto err_alloc;
+	hyperv_cpuhp_online = ret;
+
+	ret = vmbus_connect();
+	if (ret)
+		goto err_connect;
+	return 0;
+
+err_connect:
+	cpuhp_remove_state(hyperv_cpuhp_online);
+	return -ENODEV;
+err_alloc:
+	hv_synic_free();
+	return -ENOMEM;
+}
+
 /*
  * vmbus_bus_init -Main vmbus driver initialization routine.
  *
@@ -1365,8 +1439,7 @@ static void vmbus_percpu_work(struct work_struct *work)
  */
 static int vmbus_bus_init(void)
 {
-	int ret, cpu;
-	struct work_struct __percpu *works;
+	int ret;
 
 	ret = hv_init();
 	if (ret != 0) {
@@ -1401,41 +1474,42 @@ static int vmbus_bus_init(void)
 		}
 	}
 
-	ret = hv_synic_alloc();
-	if (ret)
-		goto err_alloc;
-
-	works = alloc_percpu(struct work_struct);
-	if (!works) {
-		ret = -ENOMEM;
-		goto err_alloc;
-	}
-
 	/*
-	 * Initialize the per-cpu interrupt state and stimer state.
-	 * Then connect to the host.
+	 * Attempt to establish the confidential VMBus connection first if this VM is
+	 * a hardware confidential VM, and the paravisor is present.
+	 *
+	 * All scenarios here are:
+	 *	1. No paravisor,
+	 *  2. Paravisor without VMBus relay, no hardware isolation,
+	 *  3. Paravisor without VMBus relay, with hardware isolation,
+	 *  4. Paravisor with VMBus relay, no hardware isolation,
+	 *  5. Paravisor with VMBus relay, with hardware isolation.
+	 *
+	 * In the cloud, scenarios 1, 4, and 5 are the most common; outside the cloud,
+	 * scenario 1 is the most common at the moment. Detection of Confidential
+	 * VMBus support below takes that into account by running `vmbus_alloc_synic_and_connect()`
+	 * only once (barring any faults not related to VMBus) in these cases. That is true
+	 * for scenario 2, too, albeit it might not be as feature-rich as 1, 4, and 5.
+	 *
+	 * However, the code does much more work in scenario 3, where it has to first
+	 * initialize lots of per-CPU structures only to likely tear them down later
+	 * and start again, this time without attempting to use Confidential VMBus, thus taking
+	 * a performance hit. Such systems are rather uncommon today, don't support more than
+	 * ~300 CPUs, and are rarely used with many dozens of CPUs. As time goes on, they
+	 * will become even less common. Hence, the preference is not to specialize the code
+	 * for that scenario.
 	 */
-	cpus_read_lock();
-	for_each_online_cpu(cpu) {
-		struct work_struct *work = per_cpu_ptr(works, cpu);
+	ret = -ENODEV;
+	if (ms_hyperv.paravisor_present && (hv_isolation_type_tdx() || hv_isolation_type_snp())) {
+		is_confidential = true;
+		ret = vmbus_alloc_synic_and_connect();
+		is_confidential = !ret;
 
-		INIT_WORK(work, vmbus_percpu_work);
-		schedule_work_on(cpu, work);
+		pr_info("VMBus is confidential: %d\n", is_confidential);
 	}
 
-	for_each_online_cpu(cpu)
-		flush_work(per_cpu_ptr(works, cpu));
-
-	/* Register the callbacks for possible CPU online/offline'ing */
-	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
-						   hv_synic_init, hv_synic_cleanup);
-	cpus_read_unlock();
-	free_percpu(works);
-	if (ret < 0)
-		goto err_alloc;
-	hyperv_cpuhp_online = ret;
-
-	ret = vmbus_connect();
+	if (!is_confidential)
+		ret = vmbus_alloc_synic_and_connect();
 	if (ret)
 		goto err_connect;
 
@@ -1451,9 +1525,6 @@ static int vmbus_bus_init(void)
 	return 0;
 
 err_connect:
-	cpuhp_remove_state(hyperv_cpuhp_online);
-err_alloc:
-	hv_synic_free();
 	if (vmbus_irq == -1) {
 		hv_remove_vmbus_handler();
 	} else {
-- 
2.43.0
Re: [PATCH hyperv-next v4 15/16] Drivers: hv: Support establishing the confidential VMBus connection
Posted by dan.j.williams@intel.com 1 month, 2 weeks ago
Roman Kisel wrote:
> To establish the confidential VMBus connection, the CoCo VM guest
> first attempts to connect to the VMBus server run by the paravisor.
> If that fails, the guest falls back to the non-confidential VMBus.
> 
> Implement that in the VMBus driver initialization.
> 
> Signed-off-by: Roman Kisel <romank@linux.microsoft.com>
> ---
>  drivers/hv/vmbus_drv.c | 189 ++++++++++++++++++++++++++++-------------
>  1 file changed, 130 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
[..]
> @@ -1401,41 +1474,42 @@ static int vmbus_bus_init(void)
>  		}
>  	}
>  
> -	ret = hv_synic_alloc();
> -	if (ret)
> -		goto err_alloc;
> -
> -	works = alloc_percpu(struct work_struct);
> -	if (!works) {
> -		ret = -ENOMEM;
> -		goto err_alloc;
> -	}
> -
>  	/*
> -	 * Initialize the per-cpu interrupt state and stimer state.
> -	 * Then connect to the host.
> +	 * Attempt to establish the confidential VMBus connection first if this VM is
> +	 * a hardware confidential VM, and the paravisor is present.
> +	 *
> +	 * All scenarios here are:
> +	 *	1. No paravisor,
> +	 *  2. Paravisor without VMBus relay, no hardware isolation,
> +	 *  3. Paravisor without VMBus relay, with hardware isolation,
> +	 *  4. Paravisor with VMBus relay, no hardware isolation,
> +	 *  5. Paravisor with VMBus relay, with hardware isolation.
> +	 *
> +	 * In the cloud, scenarios 1, 4, and 5 are the most common; outside the cloud,
> +	 * scenario 1 is the most common at the moment. Detection of Confidential
> +	 * VMBus support below takes that into account by running `vmbus_alloc_synic_and_connect()`
> +	 * only once (barring any faults not related to VMBus) in these cases. That is true
> +	 * for scenario 2, too, albeit it might not be as feature-rich as 1, 4, and 5.
> +	 *
> +	 * However, the code does much more work in scenario 3, where it has to first
> +	 * initialize lots of per-CPU structures only to likely tear them down later
> +	 * and start again, this time without attempting to use Confidential VMBus, thus taking
> +	 * a performance hit. Such systems are rather uncommon today, don't support more than
> +	 * ~300 CPUs, and are rarely used with many dozens of CPUs. As time goes on, they
> +	 * will become even less common. Hence, the preference is not to specialize the code
> +	 * for that scenario.

I read this blurb looking for answers to my question below, had no luck, and
was left further wondering what the comment is trying to convey to future
maintainers.

>  	 */
> -	cpus_read_lock();
> -	for_each_online_cpu(cpu) {
> -		struct work_struct *work = per_cpu_ptr(works, cpu);
> +	ret = -ENODEV;
> +	if (ms_hyperv.paravisor_present && (hv_isolation_type_tdx() || hv_isolation_type_snp())) {
> +		is_confidential = true;

In comparison to PCIe TDISP where there is an explicit validation step
of cryptographic evidence that the platform is what it claims to be, I
am missing the same for this.

I would expect something like a paravisor signed golden measurement with
a certificate that can be built-in to the kernel to validate that "yes,
in addition to the platform claims that can be emulated, this bus
enumeration is signed by an authority this kernel image trusts."

My motivation for commenting here is for alignment purposes with the
PCIe TDISP enabling and wider concerns about accepting other devices for
private operation. Specifically, I want to align on a shared
representation in the device-core (struct device) to communicate that a
device is either on a bus that has been accepted for private operation
(confidential-vmbus today, potentially signed-ACPI-devices tomorrow), or
is a device that has been individually accepted for private operation
(PCIe TDISP). In both cases there needs to be either a golden
measurement mechanism built-in, or a userspace acceptance dependency in
the flow.
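
Purely as an illustration of the kind of device-core marker I have in mind
(nothing like this exists upstream; all names below are made up):

/*
 * Hypothetical sketch only -- these names do not exist in the kernel.
 * The idea is a single representation that any path (confidential VMBus,
 * PCIe TDISP, signed ACPI, ...) sets once the bus or the individual
 * device has been accepted for private operation.
 */
struct device_private_state {
	bool bus_accepted;	/* whole bus accepted, e.g. confidential VMBus */
	bool dev_accepted;	/* device accepted individually, e.g. PCIe TDISP */
};

static inline bool dev_ok_for_private_use(const struct device_private_state *s)
{
	return s->bus_accepted || s->dev_accepted;
}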

Otherwise what mitigates a guest conveying secrets to a device that is
merely emulating a trusted bus/device?
Re: [PATCH hyperv-next v4 15/16] Drivers: hv: Support establishing the confidential VMBus connection
Posted by Roman Kisel 1 month, 1 week ago
[...]
>> +	 *
>> +	 * All scenarios here are:
>> +	 *	1. No paravisor,
>> +	 *  2. Paravisor without VMBus relay, no hardware isolation,
>> +	 *  3. Paravisor without VMBus relay, with hardware isolation,
>> +	 *  4. Paravisor with VMBus relay, no hardware isolation,
>> +	 *  5. Paravisor with VMBus relay, with hardware isolation.
>> +	 *
>>
> I read this blurb looking for answers to my question below, no luck, and
> left further wondering what is the comment trying to convey to future
> maintenance?

The intention was to enumerate the scenarios in which the driver executes
this code, to document what to expect of the conditional statement

| if (ms_hyperv.paravisor_present && (hv_isolation_type_tdx() || hv_isolation_type_snp()))
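
In other words, restated from the patch (error paths and most comments
omitted), the flow in vmbus_bus_init() ends up being roughly:

	ret = -ENODEV;
	if (ms_hyperv.paravisor_present &&
	    (hv_isolation_type_tdx() || hv_isolation_type_snp())) {
		/* Scenarios 3 and 5: hardware isolation with a paravisor. */
		is_confidential = true;
		ret = vmbus_alloc_synic_and_connect();
		/* Only scenario 5 (relay present) succeeds here. */
		is_confidential = !ret;
	}
	if (!is_confidential)
		/* Scenarios 1-4: connect to the non-confidential VMBus. */
		ret = vmbus_alloc_synic_and_connect();
	if (ret)
		goto err_connect;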

[...]

> In comparison to PCIe TDISP where there is an explicit validation step
> of cryptographic evidence that the platform is what it claims to be, I
> am missing the same for this.
>

This doesn't replace TDISP; I'll do a better job of supplementing the code changes
with documentation and comments! Any suggestions are greatly appreciated.

A fully enlightened Linux guest could just use TDISP once support for it is available
in the Linux kernel. Until then, non-fully-enlightened Linux guests (which can only
accept memory and share memory with the host) could rely on the paravisor to talk
to such devices. The TDISP device will be connected to the paravisor, and the paravisor will
provide the paravirtualized storage and network over VMBus channels to the Linux guest.

The patch set is a building block for a confidential I/O path for non-fully-enlightened
Linux guests. It would be great to have the Linux storage and network stacks not
share pages with the host (and not bounce-buffer) if the storage and network are
paravirtualized && use the Confidential VMBus. In the first version of the patch set I had
patches for that, yet that was considered too naive to be merged into the mainline kernel, so
I dropped them. But even without that, this patch series protects the control plane and the
data plane from the host, with the exception of the pages the guest might use for bounce-buffering,
although it could have avoided that in this case.

I mentioned that the paravisor will be handling the TDISP device for such guests.
As folks might know, we use the OpenHCL paravisor, which is a Linux kernel with the VTL
mode patches we've been upstreaming (links to the repos are in the cover letter) and
OpenVMM running in userland. The question would be: if TDISP isn't available
in the Linux kernel, how would one get it working in the OpenHCL paravisor, which itself
runs Linux? The SEV guest device in the paravisor kernel is being extended to handle
TIO. Once TDISP support is available in the mainline kernel, the paravisor will switch
to using the mainline implementation.

> I would expect something like a paravisor signed golden measurement with
> a certificate that can be built-in to the kernel to validate that "yes,
> in addition to the platform claims that can be emulated, this bus
> enumeration is signed by an authority this kernel image trusts."
>
> My motivation for commenting here is for alignment purposes with the
> PCIe TDISP enabling and wider concerns about accepting other devices for
> private operation. Specifically, I want to align on a shared
> representation in the device-core (struct device) to communicate that a
> device is either on a bus that has been accepted for private operation
> (confidential-vmbus today, potentially signed-ACPI-devices tomorrow), or
> is a device that has been individually accepted for private operation
> (PCIe TDISP). In both cases there needs to be either a golden
> measurement mechanism built-in, or a userspace acceptance dependency in
> the flow.
>
> Otherwise what mitigates a guest conveying secrets to a device that is
> merely emulating a trusted bus/device?