[PATCH net-next v2 3/3] gve: implement PTP gettimex64

Harshitha Ramamurthy posted 3 patches 6 days, 20 hours ago
[PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Harshitha Ramamurthy 6 days, 20 hours ago
From: Jordan Rhee <jordanrhee@google.com>

Enable chrony and phc2sys to synchronize the system clock to the NIC clock.

The system cycle counters are sampled by the device to minimize the
uncertainty window. If the system times are sampled in the host, the
delta between pre and post readings is 100us or more due to AQ command
latency. The system times returned by the device have a delta of ~1us,
which enables significantly more accurate clock synchronization.

Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Kevin Yang <yyd@google.com>
Reviewed-by: Naman Gulati <namangulati@google.com>
Signed-off-by: Jordan Rhee <jordanrhee@google.com>
Signed-off-by: Harshitha Ramamurthy <hramamurthy@google.com>
---
Changes in v2:
 - fix compilation warning on ARM by casting cycles_t to u64
---
 drivers/net/ethernet/google/gve/gve_adminq.h |   4 +-
 drivers/net/ethernet/google/gve/gve_ptp.c    | 189 ++++++++++++++++++-
 2 files changed, 184 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_adminq.h b/drivers/net/ethernet/google/gve/gve_adminq.h
index 22a74b6aa17e..e6dcf6da9091 100644
--- a/drivers/net/ethernet/google/gve/gve_adminq.h
+++ b/drivers/net/ethernet/google/gve/gve_adminq.h
@@ -411,8 +411,8 @@ static_assert(sizeof(struct gve_adminq_report_nic_ts) == 16);
 
 struct gve_nic_ts_report {
 	__be64 nic_timestamp; /* NIC clock in nanoseconds */
-	__be64 reserved1;
-	__be64 reserved2;
+	__be64 pre_cycles; /* System cycle counter before NIC clock read */
+	__be64 post_cycles; /* System cycle counter after NIC clock read */
 	__be64 reserved3;
 	__be64 reserved4;
 };
diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
index 140b8fbce4f4..bea794541f30 100644
--- a/drivers/net/ethernet/google/gve/gve_ptp.c
+++ b/drivers/net/ethernet/google/gve/gve_ptp.c
@@ -10,28 +10,203 @@
 /* Interval to schedule a nic timestamp calibration, 250ms. */
 #define GVE_NIC_TS_SYNC_INTERVAL_MS 250
 
+/*
+ * Stores cycle counter samples in get_cycles() units from a
+ * sandwiched NIC clock read
+ */
+struct gve_sysclock_sample {
+	/* Cycle counter from NIC before clock read */
+	u64 nic_pre_cycles;
+	/* Cycle counter from NIC after clock read */
+	u64 nic_post_cycles;
+	/* Cycle counter from host before issuing AQ command */
+	cycles_t host_pre_cycles;
+	/* Cycle counter from host after AQ command returns */
+	cycles_t host_post_cycles;
+};
+
+/*
+ * Read NIC clock by issuing the AQ command. The command is subject to
+ * rate limiting and may need to be retried. Requires nic_ts_read_lock
+ * to be held.
+ */
+static int gve_adminq_read_timestamp(struct gve_priv *priv,
+				     cycles_t *pre_cycles,
+				     cycles_t *post_cycles)
+{
+	unsigned long delay_us = 1000;
+	int retry_count = 0;
+	int err;
+
+	lockdep_assert_held(&priv->nic_ts_read_lock);
+
+	do {
+		*pre_cycles = get_cycles();
+		err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
+
+		/* Ensure cycle counter is sampled after AdminQ cmd returns */
+		rmb();
+		*post_cycles = get_cycles();
+		if (likely(err != -EAGAIN))
+			return err;
+
+		fsleep(delay_us);
+
+		/* Exponential backoff */
+		delay_us *= 2;
+		retry_count++;
+	} while (retry_count < 5);
+
+	return -ETIMEDOUT;
+}
+
 /* Read the nic timestamp from hardware via the admin queue. */
-static int gve_clock_nic_ts_read(struct gve_priv *priv, u64 *nic_raw)
+static int gve_clock_nic_ts_read(struct gve_priv *priv, u64 *nic_raw,
+				 struct gve_sysclock_sample *sysclock)
 {
+	cycles_t host_pre_cycles, host_post_cycles;
+	struct gve_nic_ts_report *ts_report;
 	int err;
 
 	mutex_lock(&priv->nic_ts_read_lock);
-	err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
-	if (err)
+	err = gve_adminq_read_timestamp(priv, &host_pre_cycles,
+					&host_post_cycles);
+	if (err) {
+		dev_err_ratelimited(&priv->pdev->dev,
+				    "AdminQ timestamp read failed: %d\n", err);
 		goto out;
+	}
 
-	*nic_raw = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+	ts_report = priv->nic_ts_report;
+	*nic_raw = be64_to_cpu(ts_report->nic_timestamp);
+
+	if (sysclock) {
+		sysclock->nic_pre_cycles = be64_to_cpu(ts_report->pre_cycles);
+		sysclock->nic_post_cycles = be64_to_cpu(ts_report->post_cycles);
+		sysclock->host_pre_cycles = host_pre_cycles;
+		sysclock->host_post_cycles = host_post_cycles;
+	}
 
 out:
 	mutex_unlock(&priv->nic_ts_read_lock);
 	return err;
 }
 
+struct gve_cycles_to_clock_callback_ctx {
+	u64 cycles;
+};
+
+static int gve_cycles_to_clock_fn(ktime_t *device_time,
+				  struct system_counterval_t *system_counterval,
+				  void *ctx)
+{
+	struct gve_cycles_to_clock_callback_ctx *context = ctx;
+
+	*device_time = 0;
+
+	system_counterval->cycles = context->cycles;
+	system_counterval->use_nsecs = false;
+
+	if (IS_ENABLED(CONFIG_X86))
+		system_counterval->cs_id = CSID_X86_TSC;
+	else if (IS_ENABLED(CONFIG_ARM64))
+		system_counterval->cs_id = CSID_ARM_ARCH_COUNTER;
+	else
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/*
+ * Convert a raw cycle count (e.g. from get_cycles()) to the system clock
+ * type specified by clockid. The system_time_snapshot must be taken before
+ * the cycle counter is sampled.
+ */
+static int gve_cycles_to_timespec64(struct gve_priv *priv, clockid_t clockid,
+				    struct system_time_snapshot *snap,
+				    u64 cycles, struct timespec64 *ts)
+{
+	struct gve_cycles_to_clock_callback_ctx ctx = {0};
+	struct system_device_crosststamp xtstamp;
+	int err;
+
+	ctx.cycles = cycles;
+	err = get_device_system_crosststamp(gve_cycles_to_clock_fn, &ctx, snap,
+					    &xtstamp);
+	if (err) {
+		dev_err_ratelimited(&priv->pdev->dev,
+				    "get_device_system_crosststamp() failed to convert %lld cycles to system time: %d\n",
+				    cycles,
+				    err);
+		return err;
+	}
+
+	switch (clockid) {
+	case CLOCK_REALTIME:
+		*ts = ktime_to_timespec64(xtstamp.sys_realtime);
+		break;
+	case CLOCK_MONOTONIC_RAW:
+		*ts = ktime_to_timespec64(xtstamp.sys_monoraw);
+		break;
+	default:
+		dev_err_ratelimited(&priv->pdev->dev,
+				    "Cycle count conversion to clockid %d not supported\n",
+				    clockid);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
 static int gve_ptp_gettimex64(struct ptp_clock_info *info,
 			      struct timespec64 *ts,
 			      struct ptp_system_timestamp *sts)
 {
-	return -EOPNOTSUPP;
+	struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
+	struct gve_sysclock_sample sysclock = {0};
+	struct gve_priv *priv = ptp->priv;
+	struct system_time_snapshot snap;
+	u64 nic_ts;
+	int err;
+
+	/* Take system clock snapshot before sampling cycle counters */
+	if (sts)
+		ktime_get_snapshot(&snap);
+
+	err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);
+	if (err)
+		return err;
+
+	if (sts) {
+		/* Reject samples with out of order system clock values */
+		if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
+		      sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
+		      sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
+			dev_err_ratelimited(&priv->pdev->dev,
+					    "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
+					    (u64)sysclock.host_pre_cycles,
+					    sysclock.nic_pre_cycles,
+					    sysclock.nic_post_cycles,
+					    (u64)sysclock.host_post_cycles);
+			return -EBADMSG;
+		}
+
+		err = gve_cycles_to_timespec64(priv, sts->clockid, &snap,
+					       sysclock.nic_pre_cycles,
+					       &sts->pre_ts);
+		if (err)
+			return err;
+
+		err = gve_cycles_to_timespec64(priv, sts->clockid, &snap,
+					       sysclock.nic_post_cycles,
+					       &sts->post_ts);
+		if (err)
+			return err;
+	}
+
+	*ts = ns_to_timespec64(nic_ts);
+
+	return 0;
 }
 
 static int gve_ptp_settime64(struct ptp_clock_info *info,
@@ -50,7 +225,7 @@ static long gve_ptp_do_aux_work(struct ptp_clock_info *info)
 	if (gve_get_reset_in_progress(priv) || !gve_get_admin_queue_ok(priv))
 		goto out;
 
-	err = gve_clock_nic_ts_read(priv, &nic_raw);
+	err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
 	if (err) {
 		dev_err_ratelimited(&priv->pdev->dev, "%s read err %d\n",
 				    __func__, err);
@@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
 		goto release_ptp;
 	}
 	mutex_init(&priv->nic_ts_read_lock);
-	err = gve_clock_nic_ts_read(priv, &nic_raw);
+	err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
 	if (err) {
 		dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
 		goto release_nic_ts_report;
-- 
2.53.0.1018.g2bb0e51243-goog
Re: [PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Jakub Kicinski 3 days, 21 hours ago
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
Subject: gve: implement PTP gettimex64

This commit implements the PTP gettimex64 function to enable chrony and
phc2sys to synchronize the system clock to the NIC clock. It reads the
NIC timestamp along with pre and post system cycle counters to minimize
the uncertainty window.

> diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
> index 140b8fbce4f4..bea794541f30 100644
> --- a/drivers/net/ethernet/google/gve/gve_ptp.c
> +++ b/drivers/net/ethernet/google/gve/gve_ptp.c
> @@ -10,28 +10,203 @@

[ ... ]

> +static int gve_adminq_read_timestamp(struct gve_priv *priv,
> +				     cycles_t *pre_cycles,
> +				     cycles_t *post_cycles)
> +{
> +	unsigned long delay_us = 1000;
> +	int retry_count = 0;
> +	int err;
> +
> +	lockdep_assert_held(&priv->nic_ts_read_lock);
> +
> +	do {
> +		*pre_cycles = get_cycles();
> +		err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
> +
> +		/* Ensure cycle counter is sampled after AdminQ cmd returns */
> +		rmb();
> +		*post_cycles = get_cycles();

Are execution barriers needed around get_cycles() here? rmb() orders
memory accesses, but the CPU might still speculatively execute the cycle
counter reads out of order with respect to the AdminQ commands.

Could this speculative reordering cause the strict bounds validation in
gve_ptp_gettimex64() to fail spuriously?

[ ... ]

>  static int gve_ptp_gettimex64(struct ptp_clock_info *info,
>  			      struct timespec64 *ts,
>  			      struct ptp_system_timestamp *sts)
>  {
> -	return -EOPNOTSUPP;
> +	struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
> +	struct gve_sysclock_sample sysclock = {0};
> +	struct gve_priv *priv = ptp->priv;
> +	struct system_time_snapshot snap;
> +	u64 nic_ts;
> +	int err;
> +
> +	/* Take system clock snapshot before sampling cycle counters */
> +	if (sts)
> +		ktime_get_snapshot(&snap);
> +
> +	err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);

Could the system time snapshot become stale here? 

If priv->nic_ts_read_lock is contended, or if gve_adminq_read_timestamp()
retries and sleeps, the snapshot might be taken significantly before the
actual hardware read.

Would it be more accurate to take the snapshot immediately before reading the
hardware, inside the mutex?

> +	if (err)
> +		return err;
> +
> +	if (sts) {
> +		/* Reject samples with out of order system clock values */
> +		if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
> +		      sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
> +		      sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
> +			dev_err_ratelimited(&priv->pdev->dev,
> +					    "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
> +					    (u64)sysclock.host_pre_cycles,
> +					    sysclock.nic_pre_cycles,
> +					    sysclock.nic_post_cycles,
> +					    (u64)sysclock.host_post_cycles);
> +			return -EBADMSG;

How does this behave on architectures where get_cycles() is not implemented
and returns 0, or with older firmware that leaves nic_pre_cycles and
nic_post_cycles as 0?

It looks like the bounds check would unconditionally fail and return -EBADMSG.
Since userspace tools like chrony usually expect -EOPNOTSUPP for unsupported
extended ioctls to fall back to basic PTP ioctls, will returning -EBADMSG
cause a hard failure for clock synchronization?

[ ... ]

> @@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
>  		goto release_ptp;
>  	}
>  	mutex_init(&priv->nic_ts_read_lock);
> -	err = gve_clock_nic_ts_read(priv, &nic_raw);
> +	err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
>  	if (err) {
>  		dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
>  		goto release_nic_ts_report;

Is there a race condition exposing the PTP device to userspace before its
resources are fully initialized?

Looking at gve_init_clock(), gve_ptp_init() is called before allocating
nic_ts_report and initializing nic_ts_read_lock:

    err = gve_ptp_init(priv);
    if (err)
        return err;

    priv->nic_ts_report = dma_alloc_coherent(...);
    ...
    mutex_init(&priv->nic_ts_read_lock);

If a concurrent userspace process immediately invokes the
PTP_SYS_OFFSET_EXTENDED ioctl after gve_ptp_init() registers the /dev/ptpX
device, could it call gve_ptp_gettimex64() and attempt to lock the
uninitialized mutex or dereference the NULL nic_ts_report pointer?

Additionally, in the error path for gve_init_clock():

    release_nic_ts_report:
        mutex_destroy(&priv->nic_ts_read_lock);
        dma_free_coherent(...);
        priv->nic_ts_report = NULL;
    release_ptp:
        gve_ptp_release(priv);

Could destroying the mutex and freeing the memory before gve_ptp_release()
create a use-after-free window if an ioctl is currently running?
Re: [PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Jordan Rhee 3 days, 14 hours ago
On Sun, Mar 29, 2026 at 2:20 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> ---
> Subject: gve: implement PTP gettimex64
>
> This commit implements the PTP gettimex64 function to enable chrony and
> phc2sys to synchronize the system clock to the NIC clock. It reads the
> NIC timestamp along with pre and post system cycle counters to minimize
> the uncertainty window.
>
> > diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
> > index 140b8fbce4f4..bea794541f30 100644
> > --- a/drivers/net/ethernet/google/gve/gve_ptp.c
> > +++ b/drivers/net/ethernet/google/gve/gve_ptp.c
> > @@ -10,28 +10,203 @@
>
> [ ... ]
>
> > +static int gve_adminq_read_timestamp(struct gve_priv *priv,
> > +                                  cycles_t *pre_cycles,
> > +                                  cycles_t *post_cycles)
> > +{
> > +     unsigned long delay_us = 1000;
> > +     int retry_count = 0;
> > +     int err;
> > +
> > +     lockdep_assert_held(&priv->nic_ts_read_lock);
> > +
> > +     do {
> > +             *pre_cycles = get_cycles();
> > +             err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
> > +
> > +             /* Ensure cycle counter is sampled after AdminQ cmd returns */
> > +             rmb();
> > +             *post_cycles = get_cycles();
>
> Are execution barriers needed around get_cycles() here? rmb() orders
> memory accesses, but the CPU might still speculatively execute the cycle
> counter reads out of order with respect to the AdminQ commands.
>
> Could this speculative reordering cause the strict bounds validation in
> gve_ptp_gettimex64() to fail spuriously?

I will add mb() barriers to be totally safe.

> [ ... ]
>
> >  static int gve_ptp_gettimex64(struct ptp_clock_info *info,
> >                             struct timespec64 *ts,
> >                             struct ptp_system_timestamp *sts)
> >  {
> > -     return -EOPNOTSUPP;
> > +     struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
> > +     struct gve_sysclock_sample sysclock = {0};
> > +     struct gve_priv *priv = ptp->priv;
> > +     struct system_time_snapshot snap;
> > +     u64 nic_ts;
> > +     int err;
> > +
> > +     /* Take system clock snapshot before sampling cycle counters */
> > +     if (sts)
> > +             ktime_get_snapshot(&snap);
> > +
> > +     err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);
>
> Could the system time snapshot become stale here?
>
> If priv->nic_ts_read_lock is contended, or if gve_adminq_read_timestamp()
> retries and sleeps, the snapshot might be taken significantly before the
> actual hardware read.
>
> Would it be more accurate to take the snapshot immediately before reading the
> hardware, inside the mutex?

Ack, will fix.

>
> > +     if (err)
> > +             return err;
> > +
> > +     if (sts) {
> > +             /* Reject samples with out of order system clock values */
> > +             if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
> > +                   sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
> > +                   sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
> > +                     dev_err_ratelimited(&priv->pdev->dev,
> > +                                         "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
> > +                                         (u64)sysclock.host_pre_cycles,
> > +                                         sysclock.nic_pre_cycles,
> > +                                         sysclock.nic_post_cycles,
> > +                                         (u64)sysclock.host_post_cycles);
> > +                     return -EBADMSG;
>
> How does this behave on architectures where get_cycles() is not implemented
> and returns 0, or with older firmware that leaves nic_pre_cycles and
> nic_post_cycles as 0?

Firmware is not allowed to return 0. 0 should trigger the check.

> It looks like the bounds check would unconditionally fail and return -EBADMSG.
> Since userspace tools like chrony usually expect -EOPNOTSUPP for unsupported
> extended ioctls to fall back to basic PTP ioctls, will returning -EBADMSG
> cause a hard failure for clock synchronization?

Will fix. GVE is only supported on x86_64 and arm64, and will fail on
other architectures with -EOPNOTSUPP due to the check in
gve_cycles_to_clock_fn(). However, if an architecture returns
get_cycles() as 0, it will fail here first with -EBADMSG. I will add
an explicit check for x86 || arm64 earlier in this function.

>
> [ ... ]
>
> > @@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
> >               goto release_ptp;
> >       }
> >       mutex_init(&priv->nic_ts_read_lock);
> > -     err = gve_clock_nic_ts_read(priv, &nic_raw);
> > +     err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
> >       if (err) {
> >               dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
> >               goto release_nic_ts_report;
>
> Is there a race condition exposing the PTP device to userspace before its
> resources are fully initialized?
>
> Looking at gve_init_clock(), gve_ptp_init() is called before allocating
> nic_ts_report and initializing nic_ts_read_lock:
>
>     err = gve_ptp_init(priv);
>     if (err)
>         return err;
>
>     priv->nic_ts_report = dma_alloc_coherent(...);
>     ...
>     mutex_init(&priv->nic_ts_read_lock);
>
> If a concurrent userspace process immediately invokes the
> PTP_SYS_OFFSET_EXTENDED ioctl after gve_ptp_init() registers the /dev/ptpX
> device, could it call gve_ptp_gettimex64() and attempt to lock the
> uninitialized mutex or dereference the NULL nic_ts_report pointer?
>
> Additionally, in the error path for gve_init_clock():
>
>     release_nic_ts_report:
>         mutex_destroy(&priv->nic_ts_read_lock);
>         dma_free_coherent(...);
>         priv->nic_ts_report = NULL;
>     release_ptp:
>         gve_ptp_release(priv);
>
> Could destroying the mutex and freeing the memory before gve_ptp_release()
> create a use-after-free window if an ioctl is currently running?

Will be fixed in the previous patch in the series.
Re: [PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Naman Gulati 2 days, 1 hour ago
On Sun, Mar 29, 2026 at 9:31 PM Jordan Rhee <jordanrhee@google.com> wrote:
>
> On Sun, Mar 29, 2026 at 2:20 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > This is an AI-generated review of your patch. The human sending this
> > email has considered the AI review valid, or at least plausible.
> > ---
> > Subject: gve: implement PTP gettimex64
> >
> > This commit implements the PTP gettimex64 function to enable chrony and
> > phc2sys to synchronize the system clock to the NIC clock. It reads the
> > NIC timestamp along with pre and post system cycle counters to minimize
> > the uncertainty window.
> >
> > > diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > index 140b8fbce4f4..bea794541f30 100644
> > > --- a/drivers/net/ethernet/google/gve/gve_ptp.c
> > > +++ b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > @@ -10,28 +10,203 @@
> >
> > [ ... ]
> >
> > > +static int gve_adminq_read_timestamp(struct gve_priv *priv,
> > > +                                  cycles_t *pre_cycles,
> > > +                                  cycles_t *post_cycles)
> > > +{
> > > +     unsigned long delay_us = 1000;
> > > +     int retry_count = 0;
> > > +     int err;
> > > +
> > > +     lockdep_assert_held(&priv->nic_ts_read_lock);
> > > +
> > > +     do {
> > > +             *pre_cycles = get_cycles();
> > > +             err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
> > > +
> > > +             /* Ensure cycle counter is sampled after AdminQ cmd returns */
> > > +             rmb();
> > > +             *post_cycles = get_cycles();
> >
> > Are execution barriers needed around get_cycles() here? rmb() orders
> > memory accesses, but the CPU might still speculatively execute the cycle
> > counter reads out of order with respect to the AdminQ commands.
> >
> > Could this speculative reordering cause the strict bounds validation in
> > gve_ptp_gettimex64() to fail spuriously?
>
> I will add mb() barriers to be totally safe.

I'm not convinced that rmb() is insufficient here. On x86_64, rmb() is an
lfence, and lfence is used to prevent speculative execution (per the x86
barrier_nospec implementation); rdtsc_ordered(), which is called by the
ktime* helpers, also prefaces rdtsc with an lfence. On ARM, the
get_cycles() implementation adds an isb instruction before reading
cntvct_el0.


>
> > [ ... ]
> >
> > >  static int gve_ptp_gettimex64(struct ptp_clock_info *info,
> > >                             struct timespec64 *ts,
> > >                             struct ptp_system_timestamp *sts)
> > >  {
> > > -     return -EOPNOTSUPP;
> > > +     struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
> > > +     struct gve_sysclock_sample sysclock = {0};
> > > +     struct gve_priv *priv = ptp->priv;
> > > +     struct system_time_snapshot snap;
> > > +     u64 nic_ts;
> > > +     int err;
> > > +
> > > +     /* Take system clock snapshot before sampling cycle counters */
> > > +     if (sts)
> > > +             ktime_get_snapshot(&snap);
> > > +
> > > +     err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);
> >
> > Could the system time snapshot become stale here?
> >
> > If priv->nic_ts_read_lock is contended, or if gve_adminq_read_timestamp()
> > retries and sleeps, the snapshot might be taken significantly before the
> > actual hardware read.
> >
> > Would it be more accurate to take the snapshot immediately before reading the
> > hardware, inside the mutex?
>
> Ack, will fix.
>
> >
> > > +     if (err)
> > > +             return err;
> > > +
> > > +     if (sts) {
> > > +             /* Reject samples with out of order system clock values */
> > > +             if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
> > > +                   sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
> > > +                   sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
> > > +                     dev_err_ratelimited(&priv->pdev->dev,
> > > +                                         "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
> > > +                                         (u64)sysclock.host_pre_cycles,
> > > +                                         sysclock.nic_pre_cycles,
> > > +                                         sysclock.nic_post_cycles,
> > > +                                         (u64)sysclock.host_post_cycles);
> > > +                     return -EBADMSG;
> >
> > How does this behave on architectures where get_cycles() is not implemented
> > and returns 0, or with older firmware that leaves nic_pre_cycles and
> > nic_post_cycles as 0?
>
> Firmware is not allowed to return 0. 0 should trigger the check.
>
> > It looks like the bounds check would unconditionally fail and return -EBADMSG.
> > Since userspace tools like chrony usually expect -EOPNOTSUPP for unsupported
> > extended ioctls to fall back to basic PTP ioctls, will returning -EBADMSG
> > cause a hard failure for clock synchronization?
>
> Will fix. GVE is only supported on x86_64 and arm64, and will fail on
> other architectures with -EOPNOTSUPP due to the check in
> gve_cycles_to_clock_fn(). However, if an architecture returns
> get_cycles() as 0, it will fail here first with -EBADMSG. I will add
> an explicit check for x86 || arm64 earlier in this function.
>
> >
> > [ ... ]
> >
> > > @@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
> > >               goto release_ptp;
> > >       }
> > >       mutex_init(&priv->nic_ts_read_lock);
> > > -     err = gve_clock_nic_ts_read(priv, &nic_raw);
> > > +     err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
> > >       if (err) {
> > >               dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
> > >               goto release_nic_ts_report;
> >
> > Is there a race condition exposing the PTP device to userspace before its
> > resources are fully initialized?
> >
> > Looking at gve_init_clock(), gve_ptp_init() is called before allocating
> > nic_ts_report and initializing nic_ts_read_lock:
> >
> >     err = gve_ptp_init(priv);
> >     if (err)
> >         return err;
> >
> >     priv->nic_ts_report = dma_alloc_coherent(...);
> >     ...
> >     mutex_init(&priv->nic_ts_read_lock);
> >
> > If a concurrent userspace process immediately invokes the
> > PTP_SYS_OFFSET_EXTENDED ioctl after gve_ptp_init() registers the /dev/ptpX
> > device, could it call gve_ptp_gettimex64() and attempt to lock the
> > uninitialized mutex or dereference the NULL nic_ts_report pointer?
> >
> > Additionally, in the error path for gve_init_clock():
> >
> >     release_nic_ts_report:
> >         mutex_destroy(&priv->nic_ts_read_lock);
> >         dma_free_coherent(...);
> >         priv->nic_ts_report = NULL;
> >     release_ptp:
> >         gve_ptp_release(priv);
> >
> > Could destroying the mutex and freeing the memory before gve_ptp_release()
> > create a use-after-free window if an ioctl is currently running?
>
> Will be fixed in the previous patch in the series.
Re: [PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Jordan Rhee 3 hours ago
We know that gve_adminq_report_nic_ts() will cause a VM exit, which
will act as a full barrier. Additional barriers are not strictly
necessary from a functional perspective. Can we rely on this fact or
do we need explicit barriers?


On Tue, Mar 31, 2026 at 11:05 AM Naman Gulati <namangulati@google.com> wrote:
>
> On Sun, Mar 29, 2026 at 9:31 PM Jordan Rhee <jordanrhee@google.com> wrote:
> >
> > On Sun, Mar 29, 2026 at 2:20 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > >
> > > This is an AI-generated review of your patch. The human sending this
> > > email has considered the AI review valid, or at least plausible.
> > > ---
> > > Subject: gve: implement PTP gettimex64
> > >
> > > This commit implements the PTP gettimex64 function to enable chrony and
> > > phc2sys to synchronize the system clock to the NIC clock. It reads the
> > > NIC timestamp along with pre and post system cycle counters to minimize
> > > the uncertainty window.
> > >
> > > > diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > index 140b8fbce4f4..bea794541f30 100644
> > > > --- a/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > +++ b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > @@ -10,28 +10,203 @@
> > >
> > > [ ... ]
> > >
> > > > +static int gve_adminq_read_timestamp(struct gve_priv *priv,
> > > > +                                  cycles_t *pre_cycles,
> > > > +                                  cycles_t *post_cycles)
> > > > +{
> > > > +     unsigned long delay_us = 1000;
> > > > +     int retry_count = 0;
> > > > +     int err;
> > > > +
> > > > +     lockdep_assert_held(&priv->nic_ts_read_lock);
> > > > +
> > > > +     do {
> > > > +             *pre_cycles = get_cycles();
> > > > +             err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
> > > > +
> > > > +             /* Ensure cycle counter is sampled after AdminQ cmd returns */
> > > > +             rmb();
> > > > +             *post_cycles = get_cycles();
> > >
> > > Are execution barriers needed around get_cycles() here? rmb() orders
> > > memory accesses, but the CPU might still speculatively execute the cycle
> > > counter reads out of order with respect to the AdminQ commands.
> > >
> > > Could this speculative reordering cause the strict bounds validation in
> > > gve_ptp_gettimex64() to fail spuriously?
> >
> > I will add mb() barriers to be totally safe.
>
> I'm not so sure that rmb() is insufficient here. In x86_64 rmb() is an
> lfence. lfence is used to prevent speculative execution (per the x86
> barrier_nospec implementation) and rdtsc_ordered, which is called by
> the ktime* helpers, also prefaces rdtsc with an lfence. On ARM the
> get_cycles implementation adds an isb instruction before reading
> cntvct_el0.
>
>
> >
> > > [ ... ]
> > >
> > > >  static int gve_ptp_gettimex64(struct ptp_clock_info *info,
> > > >                             struct timespec64 *ts,
> > > >                             struct ptp_system_timestamp *sts)
> > > >  {
> > > > -     return -EOPNOTSUPP;
> > > > +     struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
> > > > +     struct gve_sysclock_sample sysclock = {0};
> > > > +     struct gve_priv *priv = ptp->priv;
> > > > +     struct system_time_snapshot snap;
> > > > +     u64 nic_ts;
> > > > +     int err;
> > > > +
> > > > +     /* Take system clock snapshot before sampling cycle counters */
> > > > +     if (sts)
> > > > +             ktime_get_snapshot(&snap);
> > > > +
> > > > +     err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);
> > >
> > > Could the system time snapshot become stale here?
> > >
> > > If priv->nic_ts_read_lock is contended, or if gve_adminq_read_timestamp()
> > > retries and sleeps, the snapshot might be taken significantly before the
> > > actual hardware read.
> > >
> > > Would it be more accurate to take the snapshot immediately before reading the
> > > hardware, inside the mutex?
> >
> > Ack, will fix.
> >
> > >
> > > > +     if (err)
> > > > +             return err;
> > > > +
> > > > +     if (sts) {
> > > > +             /* Reject samples with out of order system clock values */
> > > > +             if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
> > > > +                   sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
> > > > +                   sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
> > > > +                     dev_err_ratelimited(&priv->pdev->dev,
> > > > +                                         "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
> > > > +                                         (u64)sysclock.host_pre_cycles,
> > > > +                                         sysclock.nic_pre_cycles,
> > > > +                                         sysclock.nic_post_cycles,
> > > > +                                         (u64)sysclock.host_post_cycles);
> > > > +                     return -EBADMSG;
> > >
> > > How does this behave on architectures where get_cycles() is not implemented
> > > and returns 0, or with older firmware that leaves nic_pre_cycles and
> > > nic_post_cycles as 0?
> >
> > Firmware is not allowed to return 0. 0 should trigger the check.
> >
> > > It looks like the bounds check would unconditionally fail and return -EBADMSG.
> > > Since userspace tools like chrony usually expect -EOPNOTSUPP for unsupported
> > > extended ioctls to fall back to basic PTP ioctls, will returning -EBADMSG
> > > cause a hard failure for clock synchronization?
> >
> > Will fix. GVE is only supported on x86_64 and arm64, and will fail on
> > other architectures with -EOPNOTSUPP due to the check in
> > gve_cycles_to_clock_fn(). However, if an architecture returns
> > get_cycles() as 0, it will fail here first with -EBADMSG. I will add
> > an explicit check for x86 || arm64 earlier in this function.
> >
> > >
> > > [ ... ]
> > >
> > > > @@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
> > > >               goto release_ptp;
> > > >       }
> > > >       mutex_init(&priv->nic_ts_read_lock);
> > > > -     err = gve_clock_nic_ts_read(priv, &nic_raw);
> > > > +     err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
> > > >       if (err) {
> > > >               dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
> > > >               goto release_nic_ts_report;
> > >
> > > Is there a race condition exposing the PTP device to userspace before its
> > > resources are fully initialized?
> > >
> > > Looking at gve_init_clock(), gve_ptp_init() is called before allocating
> > > nic_ts_report and initializing nic_ts_read_lock:
> > >
> > >     err = gve_ptp_init(priv);
> > >     if (err)
> > >         return err;
> > >
> > >     priv->nic_ts_report = dma_alloc_coherent(...);
> > >     ...
> > >     mutex_init(&priv->nic_ts_read_lock);
> > >
> > > If a concurrent userspace process immediately invokes the
> > > PTP_SYS_OFFSET_EXTENDED ioctl after gve_ptp_init() registers the /dev/ptpX
> > > device, could it call gve_ptp_gettimex64() and attempt to lock the
> > > uninitialized mutex or dereference the NULL nic_ts_report pointer?
> > >
> > > Additionally, in the error path for gve_init_clock():
> > >
> > >     release_nic_ts_report:
> > >         mutex_destroy(&priv->nic_ts_read_lock);
> > >         dma_free_coherent(...);
> > >         priv->nic_ts_report = NULL;
> > >     release_ptp:
> > >         gve_ptp_release(priv);
> > >
> > > Could destroying the mutex and freeing the memory before gve_ptp_release()
> > > create a use-after-free window if an ioctl is currently running?
> >
> > Will be fixed in the previous patch in the series.
Re: [PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Naman Gulati 3 hours ago
On Thu, Apr 2, 2026 at 8:38 AM Jordan Rhee <jordanrhee@google.com> wrote:
>
> We know that gve_adminq_report_nic_ts() will cause a VM exit, which
> will act as a full barrier. Additional barriers are not strictly
> necessary from a functional perspective. Can we rely on this fact or
> do we need explicit barriers?

I don't think that's enough. Without a barrier get_cycles can be
speculatively executed before gve_adminq_report_nic_ts.

>
>
> On Tue, Mar 31, 2026 at 11:05 AM Naman Gulati <namangulati@google.com> wrote:
> >
> > On Sun, Mar 29, 2026 at 9:31 PM Jordan Rhee <jordanrhee@google.com> wrote:
> > >
> > > On Sun, Mar 29, 2026 at 2:20 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > > >
> > > > This is an AI-generated review of your patch. The human sending this
> > > > email has considered the AI review valid, or at least plausible.
> > > > ---
> > > > Subject: gve: implement PTP gettimex64
> > > >
> > > > This commit implements the PTP gettimex64 function to enable chrony and
> > > > phc2sys to synchronize the system clock to the NIC clock. It reads the
> > > > NIC timestamp along with pre and post system cycle counters to minimize
> > > > the uncertainty window.
> > > >
> > > > > diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > > index 140b8fbce4f4..bea794541f30 100644
> > > > > --- a/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > > +++ b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > > @@ -10,28 +10,203 @@
> > > >
> > > > [ ... ]
> > > >
> > > > > +static int gve_adminq_read_timestamp(struct gve_priv *priv,
> > > > > +                                  cycles_t *pre_cycles,
> > > > > +                                  cycles_t *post_cycles)
> > > > > +{
> > > > > +     unsigned long delay_us = 1000;
> > > > > +     int retry_count = 0;
> > > > > +     int err;
> > > > > +
> > > > > +     lockdep_assert_held(&priv->nic_ts_read_lock);
> > > > > +
> > > > > +     do {
> > > > > +             *pre_cycles = get_cycles();
> > > > > +             err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
> > > > > +
> > > > > +             /* Ensure cycle counter is sampled after AdminQ cmd returns */
> > > > > +             rmb();
> > > > > +             *post_cycles = get_cycles();
> > > >
> > > > Are execution barriers needed around get_cycles() here? rmb() orders
> > > > memory accesses, but the CPU might still speculatively execute the cycle
> > > > counter reads out of order with respect to the AdminQ commands.
> > > >
> > > > Could this speculative reordering cause the strict bounds validation in
> > > > gve_ptp_gettimex64() to fail spuriously?
> > >
> > > I will add mb() barriers to be totally safe.
> >
> > I'm not convinced that rmb() is insufficient here; it may already be enough. In x86_64 rmb() is an
> > lfence. lfence is used to prevent speculative execution (per the x86
> > barrier_nospec implementation) and rdtsc_ordered, which is called by
> > the ktime* helpers, also prefaces rdtsc with an lfence. On ARM the
> > get_cycles implementation adds an isb instruction before reading
> > cntvct_el0.
> >
> >
> > >
> > > > [ ... ]
> > > >
> > > > >  static int gve_ptp_gettimex64(struct ptp_clock_info *info,
> > > > >                             struct timespec64 *ts,
> > > > >                             struct ptp_system_timestamp *sts)
> > > > >  {
> > > > > -     return -EOPNOTSUPP;
> > > > > +     struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
> > > > > +     struct gve_sysclock_sample sysclock = {0};
> > > > > +     struct gve_priv *priv = ptp->priv;
> > > > > +     struct system_time_snapshot snap;
> > > > > +     u64 nic_ts;
> > > > > +     int err;
> > > > > +
> > > > > +     /* Take system clock snapshot before sampling cycle counters */
> > > > > +     if (sts)
> > > > > +             ktime_get_snapshot(&snap);
> > > > > +
> > > > > +     err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);
> > > >
> > > > Could the system time snapshot become stale here?
> > > >
> > > > If priv->nic_ts_read_lock is contended, or if gve_adminq_read_timestamp()
> > > > retries and sleeps, the snapshot might be taken significantly before the
> > > > actual hardware read.
> > > >
> > > > Would it be more accurate to take the snapshot immediately before reading the
> > > > hardware, inside the mutex?
> > >
> > > Ack, will fix.
> > >
> > > >
> > > > > +     if (err)
> > > > > +             return err;
> > > > > +
> > > > > +     if (sts) {
> > > > > +             /* Reject samples with out of order system clock values */
> > > > > +             if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
> > > > > +                   sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
> > > > > +                   sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
> > > > > +                     dev_err_ratelimited(&priv->pdev->dev,
> > > > > +                                         "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
> > > > > +                                         (u64)sysclock.host_pre_cycles,
> > > > > +                                         sysclock.nic_pre_cycles,
> > > > > +                                         sysclock.nic_post_cycles,
> > > > > +                                         (u64)sysclock.host_post_cycles);
> > > > > +                     return -EBADMSG;
> > > >
> > > > How does this behave on architectures where get_cycles() is not implemented
> > > > and returns 0, or with older firmware that leaves nic_pre_cycles and
> > > > nic_post_cycles as 0?
> > >
> > > Firmware is not allowed to return 0. 0 should trigger the check.
> > >
> > > > It looks like the bounds check would unconditionally fail and return -EBADMSG.
> > > > Since userspace tools like chrony usually expect -EOPNOTSUPP for unsupported
> > > > extended ioctls to fall back to basic PTP ioctls, will returning -EBADMSG
> > > > cause a hard failure for clock synchronization?
> > >
> > > Will fix. GVE is only supported on x86_64 and arm64, and will fail on
> > > other architectures with -EOPNOTSUPP due to the check in
> > > gve_cycles_to_clock_fn(). However, if an architecture returns
> > > get_cycles() as 0, it will fail here first with -EBADMSG. I will add
> > > an explicit check for x86 || arm64 earlier in this function.
> > >
> > > >
> > > > [ ... ]
> > > >
> > > > > @@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
> > > > >               goto release_ptp;
> > > > >       }
> > > > >       mutex_init(&priv->nic_ts_read_lock);
> > > > > -     err = gve_clock_nic_ts_read(priv, &nic_raw);
> > > > > +     err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
> > > > >       if (err) {
> > > > >               dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
> > > > >               goto release_nic_ts_report;
> > > >
> > > > Is there a race condition exposing the PTP device to userspace before its
> > > > resources are fully initialized?
> > > >
> > > > Looking at gve_init_clock(), gve_ptp_init() is called before allocating
> > > > nic_ts_report and initializing nic_ts_read_lock:
> > > >
> > > >     err = gve_ptp_init(priv);
> > > >     if (err)
> > > >         return err;
> > > >
> > > >     priv->nic_ts_report = dma_alloc_coherent(...);
> > > >     ...
> > > >     mutex_init(&priv->nic_ts_read_lock);
> > > >
> > > > If a concurrent userspace process immediately invokes the
> > > > PTP_SYS_OFFSET_EXTENDED ioctl after gve_ptp_init() registers the /dev/ptpX
> > > > device, could it call gve_ptp_gettimex64() and attempt to lock the
> > > > uninitialized mutex or dereference the NULL nic_ts_report pointer?
> > > >
> > > > Additionally, in the error path for gve_init_clock():
> > > >
> > > >     release_nic_ts_report:
> > > >         mutex_destroy(&priv->nic_ts_read_lock);
> > > >         dma_free_coherent(...);
> > > >         priv->nic_ts_report = NULL;
> > > >     release_ptp:
> > > >         gve_ptp_release(priv);
> > > >
> > > > Could destroying the mutex and freeing the memory before gve_ptp_release()
> > > > create a use-after-free window if an ioctl is currently running?
> > >
> > > Will be fixed in the previous patch in the series.
Re: [PATCH net-next v2 3/3] gve: implement PTP gettimex64
Posted by Jordan Rhee 2 hours ago
The iowrite32be() inside gve_adminq_report_nic_ts() will prevent
reordering by the compiler because it uses volatile, and the VM exit
will prevent speculative execution by the CPU.

On Thu, Apr 2, 2026 at 8:53 AM Naman Gulati <namangulati@google.com> wrote:
>
> On Thu, Apr 2, 2026 at 8:38 AM Jordan Rhee <jordanrhee@google.com> wrote:
> >
> > We know that gve_adminq_report_nic_ts() will cause a VM exit, which
> > will act as a full barrier. Additional barriers are not strictly
> > necessary from a functional perspective. Can we rely on this fact or
> > do we need explicit barriers?
>
> I don't think that's enough. Without a barrier get_cycles can be
> speculatively executed before gve_adminq_report_nic_ts.
>
> >
> >
> > On Tue, Mar 31, 2026 at 11:05 AM Naman Gulati <namangulati@google.com> wrote:
> > >
> > > On Sun, Mar 29, 2026 at 9:31 PM Jordan Rhee <jordanrhee@google.com> wrote:
> > > >
> > > > On Sun, Mar 29, 2026 at 2:20 PM Jakub Kicinski <kuba@kernel.org> wrote:
> > > > >
> > > > > This is an AI-generated review of your patch. The human sending this
> > > > > email has considered the AI review valid, or at least plausible.
> > > > > ---
> > > > > Subject: gve: implement PTP gettimex64
> > > > >
> > > > > This commit implements the PTP gettimex64 function to enable chrony and
> > > > > phc2sys to synchronize the system clock to the NIC clock. It reads the
> > > > > NIC timestamp along with pre and post system cycle counters to minimize
> > > > > the uncertainty window.
> > > > >
> > > > > > diff --git a/drivers/net/ethernet/google/gve/gve_ptp.c b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > > > index 140b8fbce4f4..bea794541f30 100644
> > > > > > --- a/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > > > +++ b/drivers/net/ethernet/google/gve/gve_ptp.c
> > > > > > @@ -10,28 +10,203 @@
> > > > >
> > > > > [ ... ]
> > > > >
> > > > > > +static int gve_adminq_read_timestamp(struct gve_priv *priv,
> > > > > > +                                  cycles_t *pre_cycles,
> > > > > > +                                  cycles_t *post_cycles)
> > > > > > +{
> > > > > > +     unsigned long delay_us = 1000;
> > > > > > +     int retry_count = 0;
> > > > > > +     int err;
> > > > > > +
> > > > > > +     lockdep_assert_held(&priv->nic_ts_read_lock);
> > > > > > +
> > > > > > +     do {
> > > > > > +             *pre_cycles = get_cycles();
> > > > > > +             err = gve_adminq_report_nic_ts(priv, priv->nic_ts_report_bus);
> > > > > > +
> > > > > > +             /* Ensure cycle counter is sampled after AdminQ cmd returns */
> > > > > > +             rmb();
> > > > > > +             *post_cycles = get_cycles();
> > > > >
> > > > > Are execution barriers needed around get_cycles() here? rmb() orders
> > > > > memory accesses, but the CPU might still speculatively execute the cycle
> > > > > counter reads out of order with respect to the AdminQ commands.
> > > > >
> > > > > Could this speculative reordering cause the strict bounds validation in
> > > > > gve_ptp_gettimex64() to fail spuriously?
> > > >
> > > > I will add mb() barriers to be totally safe.
> > >
> > > I'm not convinced that rmb() is insufficient here; it may already be enough. In x86_64 rmb() is an
> > > lfence. lfence is used to prevent speculative execution (per the x86
> > > barrier_nospec implementation) and rdtsc_ordered, which is called by
> > > the ktime* helpers, also prefaces rdtsc with an lfence. On ARM the
> > > get_cycles implementation adds an isb instruction before reading
> > > cntvct_el0.
> > >
> > >
> > > >
> > > > > [ ... ]
> > > > >
> > > > > >  static int gve_ptp_gettimex64(struct ptp_clock_info *info,
> > > > > >                             struct timespec64 *ts,
> > > > > >                             struct ptp_system_timestamp *sts)
> > > > > >  {
> > > > > > -     return -EOPNOTSUPP;
> > > > > > +     struct gve_ptp *ptp = container_of(info, struct gve_ptp, info);
> > > > > > +     struct gve_sysclock_sample sysclock = {0};
> > > > > > +     struct gve_priv *priv = ptp->priv;
> > > > > > +     struct system_time_snapshot snap;
> > > > > > +     u64 nic_ts;
> > > > > > +     int err;
> > > > > > +
> > > > > > +     /* Take system clock snapshot before sampling cycle counters */
> > > > > > +     if (sts)
> > > > > > +             ktime_get_snapshot(&snap);
> > > > > > +
> > > > > > +     err = gve_clock_nic_ts_read(priv, &nic_ts, &sysclock);
> > > > >
> > > > > Could the system time snapshot become stale here?
> > > > >
> > > > > If priv->nic_ts_read_lock is contended, or if gve_adminq_read_timestamp()
> > > > > retries and sleeps, the snapshot might be taken significantly before the
> > > > > actual hardware read.
> > > > >
> > > > > Would it be more accurate to take the snapshot immediately before reading the
> > > > > hardware, inside the mutex?
> > > >
> > > > Ack, will fix.
> > > >
> > > > >
> > > > > > +     if (err)
> > > > > > +             return err;
> > > > > > +
> > > > > > +     if (sts) {
> > > > > > +             /* Reject samples with out of order system clock values */
> > > > > > +             if (!(sysclock.host_pre_cycles <= sysclock.nic_pre_cycles &&
> > > > > > +                   sysclock.nic_pre_cycles  <= sysclock.nic_post_cycles &&
> > > > > > +                   sysclock.nic_post_cycles <= sysclock.host_post_cycles)) {
> > > > > > +                     dev_err_ratelimited(&priv->pdev->dev,
> > > > > > +                                         "AdminQ system clock cycle counts out of order. Expecting %llu <= %llu <= %llu <= %llu\n",
> > > > > > +                                         (u64)sysclock.host_pre_cycles,
> > > > > > +                                         sysclock.nic_pre_cycles,
> > > > > > +                                         sysclock.nic_post_cycles,
> > > > > > +                                         (u64)sysclock.host_post_cycles);
> > > > > > +                     return -EBADMSG;
> > > > >
> > > > > How does this behave on architectures where get_cycles() is not implemented
> > > > > and returns 0, or with older firmware that leaves nic_pre_cycles and
> > > > > nic_post_cycles as 0?
> > > >
> > > > Firmware is not allowed to return 0. 0 should trigger the check.
> > > >
> > > > > It looks like the bounds check would unconditionally fail and return -EBADMSG.
> > > > > Since userspace tools like chrony usually expect -EOPNOTSUPP for unsupported
> > > > > extended ioctls to fall back to basic PTP ioctls, will returning -EBADMSG
> > > > > cause a hard failure for clock synchronization?
> > > >
> > > > Will fix. GVE is only supported on x86_64 and arm64, and will fail on
> > > > other architectures with -EOPNOTSUPP due to the check in
> > > > gve_cycles_to_clock_fn(). However, if an architecture returns
> > > > get_cycles() as 0, it will fail here first with -EBADMSG. I will add
> > > > an explicit check for x86 || arm64 earlier in this function.
> > > >
> > > > >
> > > > > [ ... ]
> > > > >
> > > > > > @@ -132,7 +307,7 @@ int gve_init_clock(struct gve_priv *priv)
> > > > > >               goto release_ptp;
> > > > > >       }
> > > > > >       mutex_init(&priv->nic_ts_read_lock);
> > > > > > -     err = gve_clock_nic_ts_read(priv, &nic_raw);
> > > > > > +     err = gve_clock_nic_ts_read(priv, &nic_raw, NULL);
> > > > > >       if (err) {
> > > > > >               dev_err(&priv->pdev->dev, "failed to read NIC clock %d\n", err);
> > > > > >               goto release_nic_ts_report;
> > > > >
> > > > > Is there a race condition exposing the PTP device to userspace before its
> > > > > resources are fully initialized?
> > > > >
> > > > > Looking at gve_init_clock(), gve_ptp_init() is called before allocating
> > > > > nic_ts_report and initializing nic_ts_read_lock:
> > > > >
> > > > >     err = gve_ptp_init(priv);
> > > > >     if (err)
> > > > >         return err;
> > > > >
> > > > >     priv->nic_ts_report = dma_alloc_coherent(...);
> > > > >     ...
> > > > >     mutex_init(&priv->nic_ts_read_lock);
> > > > >
> > > > > If a concurrent userspace process immediately invokes the
> > > > > PTP_SYS_OFFSET_EXTENDED ioctl after gve_ptp_init() registers the /dev/ptpX
> > > > > device, could it call gve_ptp_gettimex64() and attempt to lock the
> > > > > uninitialized mutex or dereference the NULL nic_ts_report pointer?
> > > > >
> > > > > Additionally, in the error path for gve_init_clock():
> > > > >
> > > > >     release_nic_ts_report:
> > > > >         mutex_destroy(&priv->nic_ts_read_lock);
> > > > >         dma_free_coherent(...);
> > > > >         priv->nic_ts_report = NULL;
> > > > >     release_ptp:
> > > > >         gve_ptp_release(priv);
> > > > >
> > > > > Could destroying the mutex and freeing the memory before gve_ptp_release()
> > > > > create a use-after-free window if an ioctl is currently running?
> > > >
> > > > Will be fixed in the previous patch in the series.