The current logic splits the update of the amount of available memory in
the system (total_avail_pages) and pending claims into two separately
locked regions. This leads to a window between counter adjustments where
the result of total_avail_pages - outstanding_claims doesn't reflect the
real amount of free memory available, and can be negative due
to total_avail_pages having been updated ahead of outstanding_claims.
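Schematically, the pre-patch ordering is (a simplified sketch, not the
literal code):

    /* alloc_heap_pages(), while holding heap_lock: */
    total_avail_pages -= request;    /* available memory drops here... */

    /* heap_lock is dropped and later re-acquired */

    /* domain_adjust_tot_pages(), while holding heap_lock again: */
    outstanding_claims -= claimed;   /* ...but claims only drop here */

    /*
     * Anything computing total_avail_pages - outstanding_claims between
     * the two regions under-estimates free memory, possibly ending up
     * with a negative value.
     */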
Fix by adjusting outstanding_claims and d->outstanding_pages in the same
place where total_avail_pages is updated. Note that accesses to
d->outstanding_pages are protected by the global heap_lock, just like
total_avail_pages or outstanding_claims. Add a comment to the field
declaration, and also adjust the comment at the top of
domain_set_outstanding_pages() to be clearer in that regard.
Note that failures in assign_pages() cause the claimed amount that has
been allocated to be lost, as the amount is not added back to the domain
quota once pages are freed. Given the intended usage of claims is limited
to initial physmap population, and the current failure paths in
assign_pages() would lead to the domain being destroyed anyway, don't
add extra logic to recover the claimed amount on failure - it's just adding
complexity for no real benefit.
Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
Changes since v2:
- Revert back to the approach in v1.
- Add extra comments and justification in commit message.
Changes since v1:
- Regain the claim if allocated page cannot be assigned to the domain.
- Adjust comments regarding d->outstanding_pages being protected by the
heap_lock (instead of the d->page_alloc_lock).
---
xen/common/page_alloc.c | 59 ++++++++++++++++++++++-------------------
xen/include/xen/sched.h | 3 ++-
2 files changed, 33 insertions(+), 29 deletions(-)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 1f67b88a8933..49ca70334458 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -515,30 +515,6 @@ unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
ASSERT(rspin_is_locked(&d->page_alloc_lock));
d->tot_pages += pages;
- /*
- * can test d->outstanding_pages race-free because it can only change
- * if d->page_alloc_lock and heap_lock are both held, see also
- * domain_set_outstanding_pages below
- */
- if ( !d->outstanding_pages || pages <= 0 )
- goto out;
-
- spin_lock(&heap_lock);
- BUG_ON(outstanding_claims < d->outstanding_pages);
- if ( d->outstanding_pages < pages )
- {
- /* `pages` exceeds the domain's outstanding count. Zero it out. */
- outstanding_claims -= d->outstanding_pages;
- d->outstanding_pages = 0;
- }
- else
- {
- outstanding_claims -= pages;
- d->outstanding_pages -= pages;
- }
- spin_unlock(&heap_lock);
-
-out:
return d->tot_pages;
}
@@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
unsigned long claim, avail_pages;
/*
- * take the domain's page_alloc_lock, else all d->tot_page adjustments
- * must always take the global heap_lock rather than only in the much
- * rarer case that d->outstanding_pages is non-zero
+ * Two locks are needed here:
+ * - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
+ * - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
+ * and outstanding_claims.
*/
nrspin_lock(&d->page_alloc_lock);
spin_lock(&heap_lock);
@@ -999,7 +976,7 @@ static struct page_info *alloc_heap_pages(
{
nodeid_t node;
unsigned int i, buddy_order, zone, first_dirty;
- unsigned long request = 1UL << order;
+ unsigned int request = 1UL << order;
struct page_info *pg;
bool need_tlbflush = false;
uint32_t tlbflush_timestamp = 0;
@@ -1008,6 +985,8 @@ static struct page_info *alloc_heap_pages(
/* Make sure there are enough bits in memflags for nodeID. */
BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
+ /* Make sure max order doesn't overflow the local storage type. */
+ BUILD_BUG_ON(MAX_ORDER >= sizeof(request) * 8);
ASSERT(zone_lo <= zone_hi);
ASSERT(zone_hi < NR_ZONES);
@@ -1071,6 +1050,30 @@ static struct page_info *alloc_heap_pages(
total_avail_pages -= request;
ASSERT(total_avail_pages >= 0);
+ if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
+ {
+ /*
+ * Adjust claims in the same locked region where total_avail_pages is
+ * adjusted; not doing so would lead to a window where the amount of
+ * free memory (avail - claimed) would be incorrect.
+ *
+ * Note that by adjusting the claimed amount here it's possible for
+ * pages to fail to be assigned to the claiming domain while already
+ * having been subtracted from d->outstanding_pages. Such claimed
+ * amount is then lost, as the pages that fail to be assigned to the
+ * domain are freed without replenishing the claim. This is fine given
+ * claims are only to be used during physmap population as part of
+ * domain build, and any failure in assign_pages() there will result in
+ * the domain being destroyed before creation is finished. Losing part
+ * of the claim makes no difference.
+ */
+ unsigned int outstanding = min(d->outstanding_pages, request);
+
+ outstanding_claims -= outstanding;
+ BUG_ON(outstanding > d->outstanding_pages);
+ d->outstanding_pages -= outstanding;
+ }
+
check_low_mem_virq();
if ( d != NULL )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1f77e0869b5d..d922c908c29f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -413,7 +413,8 @@ struct domain
unsigned int tot_pages;
unsigned int xenheap_pages; /* pages allocated from Xen heap */
- unsigned int outstanding_pages; /* pages claimed but not possessed */
+ /* Pages claimed but not possessed, protected by global heap_lock. */
+ unsigned int outstanding_pages;
unsigned int max_pages; /* maximum value for domain_tot_pages() */
unsigned int extra_pages; /* pages not included in domain_tot_pages() */
--
2.51.0
On 07.01.2026 18:56, Roger Pau Monne wrote:
> The current logic splits the update of the amount of available memory in
> the system (total_avail_pages) and pending claims into two separately
> locked regions. This leads to a window between counter adjustments where
> the result of total_avail_pages - outstanding_claims doesn't reflect the
> real amount of free memory available, and can be negative due
> to total_avail_pages having been updated ahead of outstanding_claims.
>
> Fix by adjusting outstanding_claims and d->outstanding_pages in the same
> place where total_avail_pages is updated. Note that accesses to
> d->outstanding_pages are protected by the global heap_lock, just like
> total_avail_pages or outstanding_claims. Add a comment to the field
> declaration, and also adjust the comment at the top of
> domain_set_outstanding_pages() to be clearer in that regard.
>
> Note that failures in assign_pages() cause the claimed amount that has
> been allocated to be lost, as the amount is not added back to the domain
> quota once pages are freed. Given the intended usage of claims is limited
> to initial physmap population, and the current failure paths in
> assign_pages() would lead to the domain being destroyed anyway, don't
> add extra logic to recover the claimed amount on failure - it's just adding
> complexity for no real benefit.
>
> Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> Changes since v2:
> - Revert back to the approach in v1.
You didn't fully go back to v1. While ...
> @@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
> unsigned long claim, avail_pages;
>
> /*
> - * take the domain's page_alloc_lock, else all d->tot_page adjustments
> - * must always take the global heap_lock rather than only in the much
> - * rarer case that d->outstanding_pages is non-zero
> + * Two locks are needed here:
> + * - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
> + * - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
> + * and outstanding_claims.
> */
> nrspin_lock(&d->page_alloc_lock);
> spin_lock(&heap_lock);
... the comment improvement is of course okay to keep, ...
> @@ -999,7 +976,7 @@ static struct page_info *alloc_heap_pages(
> {
> nodeid_t node;
> unsigned int i, buddy_order, zone, first_dirty;
> - unsigned long request = 1UL << order;
> + unsigned int request = 1UL << order;
... this I'm less certain about (and if it was to be kept, it should also
become 1U). For one, this bounds check:
if ( (outstanding_claims + request > total_avail_pages) &&
ends up still being okay (perhaps except on Arm32, but the wrapping issue
there is pre-existing, albeit possibly benign due to other constraints),
but just because outstanding_claims is "long" (and it's unclear to me why
it isn't "unsigned long").
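For illustration, the usual arithmetic conversions at work here (a
hypothetical sketch, not code from the tree):

    long outstanding_claims;    /* 64-bit on LP64, 32-bit on Arm32 */
    unsigned int request;       /* 32-bit everywhere */

    /*
     * LP64: long can represent every unsigned int value, so request is
     * converted to (64-bit) long and the sum cannot wrap.  ILP32/Arm32:
     * long cannot represent every unsigned int value, so both operands
     * are converted to the 32-bit unsigned long, and a large enough sum
     * wraps - the pre-existing issue mentioned above.
     */
    if ( outstanding_claims + request > total_avail_pages )
        /* ... */;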
And then, what exactly is it that you want the more narrow type for (the
description says nothing in that regard)? The other relevant uses of the
variable are
avail[node][zone] -= request;
total_avail_pages -= request;
where both avail[][] and total_avail_pages are (unsigned) long (again
unclear to me why for total_avail_pages it's plain long).
Oh, wait, it is ...
> @@ -1071,6 +1050,30 @@ static struct page_info *alloc_heap_pages(
> total_avail_pages -= request;
> ASSERT(total_avail_pages >= 0);
>
> + if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
> + {
> + /*
> + * Adjust claims in the same locked region where total_avail_pages is
> + * adjusted; not doing so would lead to a window where the amount of
> + * free memory (avail - claimed) would be incorrect.
> + *
> + * Note that by adjusting the claimed amount here it's possible for
> + * pages to fail to be assigned to the claiming domain while already
> + * having been subtracted from d->outstanding_pages. Such claimed
> + * amount is then lost, as the pages that fail to be assigned to the
> + * domain are freed without replenishing the claim. This is fine given
> + * claims are only to be used during physmap population as part of
> + * domain build, and any failure in assign_pages() there will result in
> + * the domain being destroyed before creation is finished. Losing part
> + * of the claim makes no difference.
> + */
> + unsigned int outstanding = min(d->outstanding_pages, request);
... the desire to avoid use of min_t() here which wants "request" to be
"unsigned int". At some point we'll want to change the struct domain fields
to unsigned long anyway, at which point the above would need adjustment. It's
well possible that such an adjustment would end up being to then use min_t().
Imo we'd be better off using e.g.
unsigned int outstanding = min(d->outstanding_pages + 0UL, request);
or even
typeof(d->outstanding_pages) outstanding =
min(d->outstanding_pages + 0UL, request);
right away. In the latter case the decl wouldn't even need touching when the
struct domain fields are promoted.
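For reference, it's the type check in Xen's min() that makes the
difference here (a rough sketch of the macro, not its exact definition):

    #define min(x, y) ({                                     \
        const typeof(x) _x = (x);                            \
        const typeof(y) _y = (y);                            \
        (void)(&_x == &_y);  /* diagnoses mismatched types */\
        _x < _y ? _x : _y;                                   \
    })

    unsigned int pages;     /* today's d->outstanding_pages */
    unsigned long request;  /* with the variable kept unsigned long */

    min(pages, request);                 /* refused: differing types */
    min_t(unsigned int, pages, request); /* works, but hard-codes a type */
    min(pages + 0UL, request);           /* fine: + 0UL widens pages */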
> + BUG_ON(outstanding > d->outstanding_pages);
Unlike in v1, where the min() was different, this is now dead code.
Jan
On Thu, Jan 08, 2026 at 09:24:51AM +0100, Jan Beulich wrote:
> On 07.01.2026 18:56, Roger Pau Monne wrote:
> > The current logic splits the update of the amount of available memory in
> > the system (total_avail_pages) and pending claims into two separately
> > locked regions. This leads to a window between counter adjustments where
> > the result of total_avail_pages - outstanding_claims doesn't reflect the
> > real amount of free memory available, and can be negative due
> > to total_avail_pages having been updated ahead of outstanding_claims.
> >
> > Fix by adjusting outstanding_claims and d->outstanding_pages in the same
> > place where total_avail_pages is updated. Note that accesses to
> > d->outstanding_pages are protected by the global heap_lock, just like
> > total_avail_pages or outstanding_claims. Add a comment to the field
> > declaration, and also adjust the comment at the top of
> > domain_set_outstanding_pages() to be clearer in that regard.
> >
> > Note that failures in assign_pages() cause the claimed amount that has
> > been allocated to be lost, as the amount is not added back to the domain
> > quota once pages are freed. Given the intended usage of claims is limited
> > to initial physmap population, and the current failure paths in
> > assign_pages() would lead to the domain being destroyed anyway, don't
> > add extra logic to recover the claimed amount on failure - it's just adding
> > complexity for no real benefit.
> >
> > Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > ---
> > Changes since v2:
> > - Revert back to the approach in v1.
>
> You didn't fully go back to v1. While ...
>
> > @@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
> > unsigned long claim, avail_pages;
> >
> > /*
> > - * take the domain's page_alloc_lock, else all d->tot_page adjustments
> > - * must always take the global heap_lock rather than only in the much
> > - * rarer case that d->outstanding_pages is non-zero
> > + * Two locks are needed here:
> > + * - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
> > + * - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
> > + * and outstanding_claims.
> > */
> > nrspin_lock(&d->page_alloc_lock);
> > spin_lock(&heap_lock);
>
> ... the comment improvement is of course okay to keep, ...
>
> > @@ -999,7 +976,7 @@ static struct page_info *alloc_heap_pages(
> > {
> > nodeid_t node;
> > unsigned int i, buddy_order, zone, first_dirty;
> > - unsigned long request = 1UL << order;
> > + unsigned int request = 1UL << order;
>
> ... this I'm less certain about (and if it was to be kept, it should also
> become 1U). For one, this bounds check:
>
> if ( (outstanding_claims + request > total_avail_pages) &&
>
> ends up still being okay (perhaps except on Arm32, but the wrapping issue
> there is pre-existing, albeit possibly benign due to other constraints),
> but just because outstanding_claims is "long" (and it's unclear to me why
> it isn't "unsigned long").
>
> And then, what exactly is it that you want the more narrow type for (the
> description says nothing in that regard)? The other relevant uses of the
> variable are
>
> avail[node][zone] -= request;
> total_avail_pages -= request;
>
> where both avail[][] and total_avail_pages are (unsigned) long (again
> unclear to me why for total_avail_pages it's plain long).
>
> Oh, wait, it is ...
>
> > @@ -1071,6 +1050,30 @@ static struct page_info *alloc_heap_pages(
> > total_avail_pages -= request;
> > ASSERT(total_avail_pages >= 0);
> >
> > + if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
> > + {
> > + /*
> > + * Adjust claims in the same locked region where total_avail_pages is
> > + * adjusted; not doing so would lead to a window where the amount of
> > + * free memory (avail - claimed) would be incorrect.
> > + *
> > + * Note that by adjusting the claimed amount here it's possible for
> > + * pages to fail to be assigned to the claiming domain while already
> > + * having been subtracted from d->outstanding_pages. Such claimed
> > + * amount is then lost, as the pages that fail to be assigned to the
> > + * domain are freed without replenishing the claim. This is fine given
> > + * claims are only to be used during physmap population as part of
> > + * domain build, and any failure in assign_pages() there will result in
> > + * the domain being destroyed before creation is finished. Losing part
> > + * of the claim makes no difference.
> > + */
> > + unsigned int outstanding = min(d->outstanding_pages, request);
>
> ... the desire to avoid use of min_t() here which wants "request" to be
> "unsigned int". At some point we'll want to change the struct domain fields
> to unsigned long anyway, at which point the above would need adjustment. It's
> well possible that such an adjustment would end up being to then use min_t().
> Imo we'd be better off using e.g.
>
> unsigned int outstanding = min(d->outstanding_pages + 0UL, request);
>
> or even
>
> typeof(d->outstanding_pages) outstanding =
> min(d->outstanding_pages + 0UL, request);
>
> right away. In the latter case the decl wouldn't even need touching when the
> struct domain fields are promoted.
My preference would be:
unsigned long outstanding = min(d->outstanding_pages + 0UL, request);
If that's fine with you.
> > + BUG_ON(outstanding > d->outstanding_pages);
>
> Unlike in v1, where the min() was different, this is now dead code.
Oh, I need to adjust this so it's outstanding > outstanding_claims
instead.
Thanks, Roger.
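Putting the two agreed adjustments together, the block would presumably
end up as (a sketch of the expected next revision, not committed code):

    unsigned long outstanding = min(d->outstanding_pages + 0UL, request);

    BUG_ON(outstanding > outstanding_claims);
    outstanding_claims -= outstanding;
    d->outstanding_pages -= outstanding;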
On 08.01.2026 09:44, Roger Pau Monné wrote:
> On Thu, Jan 08, 2026 at 09:24:51AM +0100, Jan Beulich wrote:
>> On 07.01.2026 18:56, Roger Pau Monne wrote:
>>> The current logic splits the update of the amount of available memory in
>>> the system (total_avail_pages) and pending claims into two separately
>>> locked regions. This leads to a window between counter adjustments where
>>> the result of total_avail_pages - outstanding_claims doesn't reflect the
>>> real amount of free memory available, and can be negative due
>>> to total_avail_pages having been updated ahead of outstanding_claims.
>>>
>>> Fix by adjusting outstanding_claims and d->outstanding_pages in the same
>>> place where total_avail_pages is updated. Note that accesses to
>>> d->outstanding_pages are protected by the global heap_lock, just like
>>> total_avail_pages or outstanding_claims. Add a comment to the field
>>> declaration, and also adjust the comment at the top of
>>> domain_set_outstanding_pages() to be clearer in that regard.
>>>
>>> Note that failures in assign_pages() cause the claimed amount that has
>>> been allocated to be lost, as the amount is not added back to the domain
>>> quota once pages are freed. Given the intended usage of claims is limited
>>> to initial physmap population, and the current failure paths in
>>> assign_pages() would lead to the domain being destroyed anyway, don't
>>> add extra logic to recover the claimed amount on failure - it's just adding
>>> complexity for no real benefit.
>>>
>>> Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>>> ---
>>> Changes since v2:
>>> - Revert back to the approach in v1.
>>
>> You didn't fully go back to v1. While ...
>>
>>> @@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
>>> unsigned long claim, avail_pages;
>>>
>>> /*
>>> - * take the domain's page_alloc_lock, else all d->tot_page adjustments
>>> - * must always take the global heap_lock rather than only in the much
>>> - * rarer case that d->outstanding_pages is non-zero
>>> + * Two locks are needed here:
>>> + * - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
>>> + * - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
>>> + * and outstanding_claims.
>>> */
>>> nrspin_lock(&d->page_alloc_lock);
>>> spin_lock(&heap_lock);
>>
>> ... the comment improvement is of course okay to keep, ...
>>
>>> @@ -999,7 +976,7 @@ static struct page_info *alloc_heap_pages(
>>> {
>>> nodeid_t node;
>>> unsigned int i, buddy_order, zone, first_dirty;
>>> - unsigned long request = 1UL << order;
>>> + unsigned int request = 1UL << order;
>>
>> ... this I'm less certain about (and if it was to be kept, it should also
>> become 1U). For one, this bounds check:
>>
>> if ( (outstanding_claims + request > total_avail_pages) &&
>>
>> ends up still being okay (perhaps except on Arm32, but the wrapping issue
>> there is pre-existing, albeit possibly benign due to other constraints),
>> but just because outstanding_claims is "long" (and it's unclear to me why
>> it isn't "unsigned long").
>>
>> And then, what exactly is it that you want the more narrow type for (the
>> description says nothing in that regard)? The other relevant uses of the
>> variable are
>>
>> avail[node][zone] -= request;
>> total_avail_pages -= request;
>>
>> where both avail[][] and total_avail_pages are (unsigned) long (again
>> unclear to me why for total_avail_pages it's plain long).
>>
>> Oh, wait, it is ...
>>
>>> @@ -1071,6 +1050,30 @@ static struct page_info *alloc_heap_pages(
>>> total_avail_pages -= request;
>>> ASSERT(total_avail_pages >= 0);
>>>
>>> + if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
>>> + {
>>> + /*
>>> + * Adjust claims in the same locked region where total_avail_pages is
>>> + * adjusted; not doing so would lead to a window where the amount of
>>> + * free memory (avail - claimed) would be incorrect.
>>> + *
>>> + * Note that by adjusting the claimed amount here it's possible for
>>> + * pages to fail to be assigned to the claiming domain while already
>>> + * having been subtracted from d->outstanding_pages. Such claimed
>>> + * amount is then lost, as the pages that fail to be assigned to the
>>> + * domain are freed without replenishing the claim. This is fine given
>>> + * claims are only to be used during physmap population as part of
>>> + * domain build, and any failure in assign_pages() there will result in
>>> + * the domain being destroyed before creation is finished. Losing part
>>> + * of the claim makes no difference.
>>> + */
>>> + unsigned int outstanding = min(d->outstanding_pages, request);
>>
>> ... the desire to avoid use of min_t() here which wants "request" to be
>> "unsigned int". At some point we'll want to change the struct domain fields
>> to unsigned long anyway, at which point the above would need adjustment. It's
>> well possible that such an adjustment would end up being to then use min_t().
>> Imo we'd be better off using e.g.
>>
>> unsigned int outstanding = min(d->outstanding_pages + 0UL, request);
>>
>> or even
>>
>> typeof(d->outstanding_pages) outstanding =
>> min(d->outstanding_pages + 0UL, request);
>>
>> right away. In the latter case the decl wouldn't even need touching when the
>> struct domain fields are promoted.
>
> My preference would be:
>
> unsigned long outstanding = min(d->outstanding_pages + 0UL, request);
>
> If that's fine with you.
It is.
>>> + BUG_ON(outstanding > d->outstanding_pages);
>>
>> Unlike in v1, where the min() was different, this is now dead code.
>
> Oh, I need to adjust this so it's outstanding > outstanding_claims
> instead.
And then:
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Jan