When a watchdog fires, the domain is crashed and can't dump any state.
Xen allows a domain to have two separate watchdogs. Therefore, for a
domain running multiple watchdogs (e.g. one based around network, one
for disk), it is important for diagnostics to know which watchdog
fired.
As the printk() is in a timer callback, this is a bit awkward to
arrange, but there are 12 spare bits in the bottom of the domain
pointer owing to its alignment.
Reuse these bits to encode the watchdog id too, so the one which fired
is identified when the domain is crashed.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Anthony PERARD <anthony.perard@vates.tech>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Jan Beulich <jbeulich@suse.com>
CC: Julien Grall <julien@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Dario Faggioli <dfaggioli@suse.com>
CC: Juergen Gross <jgross@suse.com>
CC: George Dunlap <gwd@xenproject.org>
v2:
* BUILD_BUG_ON() against the alignment of d.
---
xen/common/sched/core.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index d6296d99fdb9..3db0fe32ccd8 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -1534,12 +1534,19 @@ long vcpu_yield(void)
static void cf_check domain_watchdog_timeout(void *data)
{
- struct domain *d = data;
+ /*
+ * The data parameter encodes the watchdog id in the low bits of
+ * the domain pointer.
+ */
+ struct domain *d = _p((unsigned long)data & PAGE_MASK);
+ unsigned int id = (unsigned long)data & ~PAGE_MASK;
+
+ BUILD_BUG_ON(!(alignof(d) < PAGE_SIZE));
if ( d->is_shutting_down || d->is_dying )
return;
- printk("Watchdog timer fired for domain %u\n", d->domain_id);
+ printk("Watchdog timer %u fired for %pd\n", id, d);
domain_shutdown(d, SHUTDOWN_watchdog);
}
@@ -1593,7 +1600,17 @@ void watchdog_domain_init(struct domain *d)
d->watchdog_inuse_map = 0;
for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
- init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
+ {
+ void *data = d;
+
+ BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(d));
+
+ /*
+ * For the timer callback parameter, encode the watchdog id in
+ * the low bits of the domain pointer.
+ */
+ init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, data + i, 0);
+ }
}
void watchdog_domain_destroy(struct domain *d)
--
2.39.5
On 04.03.2025 00:29, Andrew Cooper wrote: > --- a/xen/common/sched/core.c > +++ b/xen/common/sched/core.c > @@ -1534,12 +1534,19 @@ long vcpu_yield(void) > > static void cf_check domain_watchdog_timeout(void *data) > { > - struct domain *d = data; > + /* > + * The data parameter encodes the watchdog id in the low bits of > + * the domain pointer. > + */ > + struct domain *d = _p((unsigned long)data & PAGE_MASK); > + unsigned int id = (unsigned long)data & ~PAGE_MASK; > + > + BUILD_BUG_ON(!(alignof(d) < PAGE_SIZE)); BUILD_BUG_ON(alignof(*d) < PAGE_SIZE); > @@ -1593,7 +1600,17 @@ void watchdog_domain_init(struct domain *d) > d->watchdog_inuse_map = 0; > > for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) > - init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); > + { > + void *data = d; > + > + BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(d)); BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(*d)); Jan
On 05/03/2025 9:27 am, Jan Beulich wrote: > On 04.03.2025 00:29, Andrew Cooper wrote: >> --- a/xen/common/sched/core.c >> +++ b/xen/common/sched/core.c >> @@ -1534,12 +1534,19 @@ long vcpu_yield(void) >> >> static void cf_check domain_watchdog_timeout(void *data) >> { >> - struct domain *d = data; >> + /* >> + * The data parameter encodes the watchdog id in the low bits of >> + * the domain pointer. >> + */ >> + struct domain *d = _p((unsigned long)data & PAGE_MASK); >> + unsigned int id = (unsigned long)data & ~PAGE_MASK; >> + >> + BUILD_BUG_ON(!(alignof(d) < PAGE_SIZE)); > BUILD_BUG_ON(alignof(*d) < PAGE_SIZE); > >> @@ -1593,7 +1600,17 @@ void watchdog_domain_init(struct domain *d) >> d->watchdog_inuse_map = 0; >> >> for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) >> - init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); >> + { >> + void *data = d; >> + >> + BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(d)); > BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(*d)); Well, that's embarrassing. That explains why I was having issues, and I was clearly asleep when writing... ~Andrew
On Mon, 3 Mar 2025, Andrew Cooper wrote: > When a watchdog fires, the domain is crashed and can't dump any state. > > Xen allows a domain to have two separate watchdogs. Therefore, for a > domain running multiple watchdogs (e.g. one based around network, one > for disk), it is important for diagnostics to know which watchdog > fired. > > As the printk() is in a timer callback, this is a bit awkward to > arrange, but there are 12 spare bits in the bottom of the domain > pointer owing to its alignment. > > Reuse these bits to encode the watchdog id too, so the one which fired > is identified when the domain is crashed. > > Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> The code looks like it would work as intended. I checked with the MISRA rules and it looks like it would fall under the allowed exception. Please have a run through ECLAIR to make sure it doesn't cause regressions (especially R11.2). > --- > CC: Anthony PERARD <anthony.perard@vates.tech> > CC: Michal Orzel <michal.orzel@amd.com> > CC: Jan Beulich <jbeulich@suse.com> > CC: Julien Grall <julien@xen.org> > CC: Roger Pau Monné <roger.pau@citrix.com> > CC: Stefano Stabellini <sstabellini@kernel.org> > CC: Dario Faggioli <dfaggioli@suse.com> > CC: Juergen Gross <jgross@suse.com> > CC: George Dunlap <gwd@xenproject.org> > > v2: > * BUILD_BUG_ON() against the alignment of d. > --- > xen/common/sched/core.c | 23 ++++++++++++++++++++--- > 1 file changed, 20 insertions(+), 3 deletions(-) > > diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c > index d6296d99fdb9..3db0fe32ccd8 100644 > --- a/xen/common/sched/core.c > +++ b/xen/common/sched/core.c > @@ -1534,12 +1534,19 @@ long vcpu_yield(void) > > static void cf_check domain_watchdog_timeout(void *data) > { > - struct domain *d = data; > + /* > + * The data parameter encodes the watchdog id in the low bits of > + * the domain pointer. > + */ > + struct domain *d = _p((unsigned long)data & PAGE_MASK); > + unsigned int id = (unsigned long)data & ~PAGE_MASK; > + > + BUILD_BUG_ON(!(alignof(d) < PAGE_SIZE)); > > if ( d->is_shutting_down || d->is_dying ) > return; > > - printk("Watchdog timer fired for domain %u\n", d->domain_id); > + printk("Watchdog timer %u fired for %pd\n", id, d); > domain_shutdown(d, SHUTDOWN_watchdog); > } > > @@ -1593,7 +1600,17 @@ void watchdog_domain_init(struct domain *d) > d->watchdog_inuse_map = 0; > > for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) > - init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); > + { > + void *data = d; > + > + BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(d)); > + > + /* > + * For the timer callback parameter, encode the watchdog id in > + * the low bits of the domain pointer. > + */ > + init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, data + i, 0); > + } > } > > void watchdog_domain_destroy(struct domain *d) > -- > 2.39.5 >
On 04/03/2025 10:47 pm, Stefano Stabellini wrote: > On Mon, 3 Mar 2025, Andrew Cooper wrote: >> When a watchdog fires, the domain is crashed and can't dump any state. >> >> Xen allows a domain to have two separate watchdogs. Therefore, for a >> domain running multiple watchdogs (e.g. one based around network, one >> for disk), it is important for diagnostics to know which watchdog >> fired. >> >> As the printk() is in a timer callback, this is a bit awkward to >> arrange, but there are 12 spare bits in the bottom of the domain >> pointer owing to its alignment. >> >> Reuse these bits to encode the watchdog id too, so the one which fired >> is identified when the domain is crashed. >> >> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> > The code looks like it would work as intended. It's already deployed into production[1]. (Guess /why/ we need to distinguish the two watchdogs.) > I checked with the MISRA rules and it looks like it would fall under the > allowed exception. Please have a run through ECLAIR to make sure it > doesn't cause regressions (especially R11.2). Eclair is happy: https://gitlab.com/xen-project/people/andyhhp/xen/-/pipelines/1700050949 ~Andrew [1] Actually, the version that took me 2 minute to write one evening over beer after work when the initial request for help came in. Followed shortly by an explanation of how alignment works.
When a watchdog fires, the domain is crashed and can't dump any state.
Xen allows a domain to have two separate watchdogs. Therefore, for a
domain running multiple watchdogs (e.g. one based around network, one
for disk), it is important for diagnostics to know which watchdog
fired.
As the printk() is in a timer callback, this is a bit awkward to
arrange, but there are 12 spare bits in the bottom of the domain
pointer owing to its alignment.
Reuse these bits to encode the watchdog id too, so the one which fired
is identified when the domain is crashed.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Anthony PERARD <anthony.perard@vates.tech>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Jan Beulich <jbeulich@suse.com>
CC: Julien Grall <julien@xen.org>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Dario Faggioli <dfaggioli@suse.com>
CC: Juergen Gross <jgross@suse.com>
CC: George Dunlap <gwd@xenproject.org>
v3:
* Fix alignof() expressions.
v2:
* BUILD_BUG_ON() against the alignment of d.
---
xen/common/sched/core.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index d6296d99fdb9..9043414290a8 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -1534,12 +1534,19 @@ long vcpu_yield(void)
static void cf_check domain_watchdog_timeout(void *data)
{
- struct domain *d = data;
+ /*
+ * The data parameter encodes the watchdog id in the low bits of
+ * the domain pointer.
+ */
+ struct domain *d = _p((unsigned long)data & PAGE_MASK);
+ unsigned int id = (unsigned long)data & ~PAGE_MASK;
+
+ BUILD_BUG_ON(alignof(*d) < PAGE_SIZE);
if ( d->is_shutting_down || d->is_dying )
return;
- printk("Watchdog timer fired for domain %u\n", d->domain_id);
+ printk("Watchdog timer %u fired for %pd\n", id, d);
domain_shutdown(d, SHUTDOWN_watchdog);
}
@@ -1593,7 +1600,17 @@ void watchdog_domain_init(struct domain *d)
d->watchdog_inuse_map = 0;
for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
- init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
+ {
+ void *data = d;
+
+ BUILD_BUG_ON(NR_DOMAIN_WATCHDOG_TIMERS > alignof(*d));
+
+ /*
+ * For the timer callback parameter, encode the watchdog id in
+ * the low bits of the domain pointer.
+ */
+ init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, data + i, 0);
+ }
}
void watchdog_domain_destroy(struct domain *d)
base-commit: 4d4725901e474fb3a61814e3c7bcb563cbdd857a
prerequisite-patch-id: abf66c7a76178c5cf225f0c67efd6c430e95974a
--
2.39.5
© 2016 - 2025 Red Hat, Inc.