[PATCH v6 2/7] ptp: vmclock: support device notifications

Takahiro Itazuri posted 7 patches 2 weeks, 3 days ago
There is a newer version of this series
[PATCH v6 2/7] ptp: vmclock: support device notifications
Posted by Takahiro Itazuri 2 weeks, 3 days ago
From: "Babis Chalios" <bchalios@amazon.es>

Add optional support for device notifications in VMClock. When
supported, the hypervisor will send a device notification every time it
updates the seq_count to a new even value.

Moreover, add support for poll() in VMClock as a means to propagate this
notification to user space. poll() will return a POLLIN event to
listeners every time seq_count changes to a value different than the one
last seen (since open() or last read()/pread()). This means that when
poll() returns a POLLIN event, listeners need to use read() to observe
what has changed and update the reader's view of seq_count. In other
words, after a poll() returned, all subsequent calls to poll() will
immediately return with a POLLIN event until the listener calls read().

The device advertises support for the notification mechanism by setting
flag VMCLOCK_FLAG_NOTIFICATION_PRESENT in vmclock_abi flags field. If
the flag is not present the driver won't setup the ACPI notification
handler and poll() will always immediately return POLLHUP.

Signed-off-by: Babis Chalios <bchalios@amazon.es>
Reviewed-by: David Woodhouse <dwmw@amazon.co.uk>
---
 drivers/ptp/ptp_vmclock.c        | 125 +++++++++++++++++++++++++++++--
 include/uapi/linux/vmclock-abi.h |   5 ++
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
index b3a83b03d9c1..38b2bacb755e 100644
--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -5,6 +5,9 @@
  * Copyright © 2024 Amazon.com, Inc. or its affiliates.
  */
 
+#include "linux/poll.h"
+#include "linux/types.h"
+#include "linux/wait.h"
 #include <linux/acpi.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -39,6 +42,7 @@ struct vmclock_state {
 	struct resource res;
 	struct vmclock_abi *clk;
 	struct miscdevice miscdev;
+	wait_queue_head_t disrupt_wait;
 	struct ptp_clock_info ptp_clock_info;
 	struct ptp_clock *ptp_clock;
 	enum clocksource_ids cs_id, sys_cs_id;
@@ -357,10 +361,15 @@ static struct ptp_clock *vmclock_ptp_register(struct device *dev,
 	return ptp_clock_register(&st->ptp_clock_info, dev);
 }
 
+struct vmclock_file_state {
+	struct vmclock_state *st;
+	atomic_t seq;
+};
+
 static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
 {
-	struct vmclock_state *st = container_of(fp->private_data,
-						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
 
 	if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ)
 		return -EROFS;
@@ -379,11 +388,11 @@ static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma)
 static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 				    size_t count, loff_t *ppos)
 {
-	struct vmclock_state *st = container_of(fp->private_data,
-						struct vmclock_state, miscdev);
 	ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT);
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
+	uint32_t seq, old_seq;
 	size_t max_count;
-	uint32_t seq;
 
 	if (*ppos >= PAGE_SIZE)
 		return 0;
@@ -392,6 +401,7 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 	if (count > max_count)
 		count = max_count;
 
+	old_seq = atomic_read(&fst->seq);
 	while (1) {
 		seq = le32_to_cpu(st->clk->seq_count) & ~1U;
 		/* Pairs with hypervisor wmb */
@@ -402,8 +412,16 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 
 		/* Pairs with hypervisor wmb */
 		virt_rmb();
-		if (seq == le32_to_cpu(st->clk->seq_count))
-			break;
+		if (seq == le32_to_cpu(st->clk->seq_count)) {
+			/*
+			 * Either we updated fst->seq to seq (the latest version we observed)
+			 * or someone else did (old_seq == seq), so we can break.
+			 */
+			if (atomic_try_cmpxchg(&fst->seq, &old_seq, seq) ||
+			    old_seq == seq) {
+				break;
+			}
+		}
 
 		if (ktime_after(ktime_get(), deadline))
 			return -ETIMEDOUT;
@@ -413,10 +431,58 @@ static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf,
 	return count;
 }
 
+static __poll_t vmclock_miscdev_poll(struct file *fp, poll_table *wait)
+{
+	struct vmclock_file_state *fst = fp->private_data;
+	struct vmclock_state *st = fst->st;
+	uint32_t seq;
+
+	/*
+	 * Hypervisor will not send us any notifications, so fail immediately
+	 * to avoid having caller sleeping for ever.
+	 */
+	if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+		return POLLHUP;
+
+	poll_wait(fp, &st->disrupt_wait, wait);
+
+	seq = le32_to_cpu(st->clk->seq_count);
+	if (atomic_read(&fst->seq) != seq)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static int vmclock_miscdev_open(struct inode *inode, struct file *fp)
+{
+	struct vmclock_state *st = container_of(fp->private_data,
+						struct vmclock_state, miscdev);
+	struct vmclock_file_state *fst = kzalloc(sizeof(*fst), GFP_KERNEL);
+
+	if (!fst)
+		return -ENOMEM;
+
+	fst->st = st;
+	atomic_set(&fst->seq, 0);
+
+	fp->private_data = fst;
+
+	return 0;
+}
+
+static int vmclock_miscdev_release(struct inode *inode, struct file *fp)
+{
+	kfree(fp->private_data);
+	return 0;
+}
+
 static const struct file_operations vmclock_miscdev_fops = {
 	.owner = THIS_MODULE,
+	.open = vmclock_miscdev_open,
+	.release = vmclock_miscdev_release,
 	.mmap = vmclock_miscdev_mmap,
 	.read = vmclock_miscdev_read,
+	.poll = vmclock_miscdev_poll,
 };
 
 /* module operations */
@@ -459,6 +525,44 @@ static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data
 	return AE_ERROR;
 }
 
+static void
+vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
+				  u32 __always_unused event, void *dev)
+{
+	struct device *device = dev;
+	struct vmclock_state *st = device->driver_data;
+
+	wake_up_interruptible(&st->disrupt_wait);
+}
+
+static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	acpi_status status;
+
+	/*
+	 * This should never happen as this function is only called when
+	 * has_acpi_companion(dev) is true, but the logic is sufficiently
+	 * complex that Coverity can't see the tautology.
+	 */
+	if (!adev)
+		return -ENODEV;
+
+	/* The device does not support notifications. Nothing else to do */
+	if (!(le64_to_cpu(st->clk->flags) & VMCLOCK_FLAG_NOTIFICATION_PRESENT))
+		return 0;
+
+	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
+					     vmclock_acpi_notification_handler,
+					     dev);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "failed to install notification handler");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
 static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st)
 {
 	struct acpi_device *adev = ACPI_COMPANION(dev);
@@ -549,6 +653,11 @@ static int vmclock_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	init_waitqueue_head(&st->disrupt_wait);
+	ret = vmclock_setup_notification(dev, st);
+	if (ret)
+		return ret;
+
 	/*
 	 * If the structure is big enough, it can be mapped to userspace.
 	 * Theoretically a guest OS even using larger pages could still
@@ -581,6 +690,8 @@ static int vmclock_probe(struct platform_device *pdev)
 		return -ENODEV;
 	}
 
+	dev->driver_data = st;
+
 	dev_info(dev, "%s: registered %s%s%s\n", st->name,
 		 st->miscdev.minor ? "miscdev" : "",
 		 (st->miscdev.minor && st->ptp_clock) ? ", " : "",
diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h
index 937fe00e4f33..d320623b0118 100644
--- a/include/uapi/linux/vmclock-abi.h
+++ b/include/uapi/linux/vmclock-abi.h
@@ -121,6 +121,11 @@ struct vmclock_abi {
 	 * loaded from some save state (restored from a snapshot).
 	 */
 #define VMCLOCK_FLAG_VM_GEN_COUNTER_PRESENT     (1 << 8)
+	/*
+	 * If the NOTIFICATION_PRESENT flag is set, the hypervisor will send
+	 * a notification every time it updates seq_count to a new even number.
+	 */
+#define VMCLOCK_FLAG_NOTIFICATION_PRESENT       (1 << 9)
 
 	__u8 pad[2];
 	__u8 clock_status;
-- 
2.47.3

Re: [v6,2/7] ptp: vmclock: support device notifications
Posted by Jakub Kicinski 2 weeks, 1 day ago
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.

For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
ptp: vmclock: support device notifications

This patch adds optional device notification support to VMClock, allowing
the hypervisor to notify the driver when seq_count changes. It also adds
poll() support to propagate these notifications to userspace.

> diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c
> index b3a83b03d9c1..38b2bacb755e 100644
> --- a/drivers/ptp/ptp_vmclock.c
> +++ b/drivers/ptp/ptp_vmclock.c

[ ... ]

> +static void
> +vmclock_acpi_notification_handler(acpi_handle __always_unused handle,
> +				  u32 __always_unused event, void *dev)
> +{
> +	struct device *device = dev;
> +	struct vmclock_state *st = device->driver_data;
> +
> +	wake_up_interruptible(&st->disrupt_wait);
> +}
> +
> +static int vmclock_setup_notification(struct device *dev, struct vmclock_state *st)
> +{

[ ... ]

> +	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
> +					     vmclock_acpi_notification_handler,
> +					     dev);

Since acpi_install_notify_handler() is called here, should there be a
corresponding acpi_remove_notify_handler() in vmclock_remove()? Currently
the handler remains registered after device removal, which could lead to
a use-after-free if the hypervisor sends a notification after the driver
state has been freed.

[ ... ]

> @@ -549,6 +653,11 @@ static int vmclock_probe(struct platform_device *pdev)
>  	if (ret)
>  		return ret;
>
> +	init_waitqueue_head(&st->disrupt_wait);
> +	ret = vmclock_setup_notification(dev, st);
> +	if (ret)
> +		return ret;

[ ... ]

> @@ -581,6 +690,8 @@ static int vmclock_probe(struct platform_device *pdev)
>  		return -ENODEV;
>  	}
>
> +	dev->driver_data = st;

Is there a race window here? The ACPI notification handler is installed
in vmclock_setup_notification() which runs earlier in vmclock_probe(),
but the handler accesses device->driver_data. If the hypervisor sends a
notification between vmclock_setup_notification() and this assignment,
vmclock_acpi_notification_handler() would dereference NULL or garbage.

I see this is fixed later in the series by commit 47fdd294385e which
moves dev->driver_data = st before vmclock_setup_notification().
-- 
pw-bot: cr
Re: [v6,2/7] ptp: vmclock: support device notifications
Posted by David Woodhouse 2 weeks, 1 day ago
On Thu, 2026-01-22 at 19:11 -0800, Jakub Kicinski wrote:
> 
> > +	status = acpi_install_notify_handler(adev->handle, ACPI_DEVICE_NOTIFY,
> > +					     vmclock_acpi_notification_handler,
> > +					     dev);
> 
> Since acpi_install_notify_handler() is called here, should there be a
> corresponding acpi_remove_notify_handler() in vmclock_remove()? Currently
> the handler remains registered after device removal, which could lead to
> a use-after-free if the hypervisor sends a notification after the driver
> state has been freed.
> 
> [ ... ]
> 
> > @@ -549,6 +653,11 @@ static int vmclock_probe(struct platform_device *pdev)
> >   	if (ret)
> >   		return ret;
> > 
> > +	init_waitqueue_head(&st->disrupt_wait);
> > +	ret = vmclock_setup_notification(dev, st);
> > +	if (ret)
> > +		return ret;
> 
> [ ... ]
> 
> > @@ -581,6 +690,8 @@ static int vmclock_probe(struct platform_device *pdev)
> >   		return -ENODEV;
> >   	}
> > 
> > +	dev->driver_data = st;
> 
> Is there a race window here? The ACPI notification handler is installed
> in vmclock_setup_notification() which runs earlier in vmclock_probe(),
> but the handler accesses device->driver_data. If the hypervisor sends a
> notification between vmclock_setup_notification() and this assignment,
> vmclock_acpi_notification_handler() would dereference NULL or garbage.
> 
> I see this is fixed later in the series by commit 47fdd294385e which
> moves dev->driver_data = st before vmclock_setup_notification().

With the incremental diff below, I've fixed both of these in
https://git.infradead.org/?p=users/dwmw2/linux.git;a=shortlog;h=refs/heads/vmclock

Takahiro and Babis have the test environment set up to actually
exercise the notifications, so I'll let one of them actually do the
testing and repost v7.

Takahiro, please remember to add your own 'Signed-off-by:' as you do.

Thanks!


--- a/drivers/ptp/ptp_vmclock.c
+++ b/drivers/ptp/ptp_vmclock.c
@@ -490,17 +490,6 @@ static const struct file_operations vmclock_miscdev_fops = {
 
 /* module operations */
 
-static void vmclock_remove(void *data)
-{
-	struct vmclock_state *st = data;
-
-	if (st->ptp_clock)
-		ptp_clock_unregister(st->ptp_clock);
-
-	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
-		misc_deregister(&st->miscdev);
-}
-
 #if IS_ENABLED(CONFIG_ACPI)
 static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data)
 {
@@ -636,6 +625,32 @@ static int vmclock_setup_notification(struct device *dev,
 	return vmclock_setup_of_notification(dev);
 }
 
+static void vmclock_remove(void *data)
+{
+	struct device *dev = data;
+	struct vmclock_state *st = dev->driver_data;
+
+	if (!st) {
+		dev_err(dev, "vmclock_remove() called with NULL driver_data");
+		return;
+	}
+
+#if IS_ENABLED(CONFIG_ACPI)
+	if (has_acpi_companion(dev))
+		acpi_remove_notify_handler(ACPI_COMPANION(dev)->handle,
+					   ACPI_DEVICE_NOTIFY,
+					   vmclock_acpi_notification_handler);
+#endif
+
+	if (st->ptp_clock)
+		ptp_clock_unregister(st->ptp_clock);
+
+	if (st->miscdev.minor != MISC_DYNAMIC_MINOR)
+		misc_deregister(&st->miscdev);
+
+	dev->driver_data = NULL;
+}
+
 static void vmclock_put_idx(void *data)
 {
 	struct vmclock_state *st = data;
@@ -701,12 +716,13 @@ static int vmclock_probe(struct platform_device *pdev)
 
 	st->miscdev.minor = MISC_DYNAMIC_MINOR;
 
-	ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, st);
+	init_waitqueue_head(&st->disrupt_wait);
+	dev->driver_data = st;
+
+	ret = devm_add_action_or_reset(&pdev->dev, vmclock_remove, dev);
 	if (ret)
 		return ret;
 
-	init_waitqueue_head(&st->disrupt_wait);
-	dev->driver_data = st;
 	ret = vmclock_setup_notification(dev, st);
 	if (ret)
 		return ret;