[PATCH v3 1/9] x86/PVH: improve Dom0 memory size calculation

Posted by Jan Beulich 3 years, 2 months ago
Assuming that the accounting for IOMMU page tables will also take care
of the P2M needs was wrong: dom0_paging_pages() can determine a far
higher value, high enough for the system to run out of memory while
setting up Dom0. Hence in the case of shared page tables the larger of
the two values needs to be used (without shared page tables the sum of
both continues to be applicable).

While there, also account for two further aspects in the PV case: With
"iommu=dom0-passthrough" no IOMMU page tables would get allocated, so
none need accounting for. And if shadow mode is to be enabled, setting
aside a suitable amount for the P2M pool to get populated is also
necessary (i.e. similar to the non-shared-page-tables case of PVH).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
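
As an illustrative aside (not part of the patch): the rule described
above amounts to reserving max(IOMMU estimate, P2M estimate) when the
page tables are shared, and their sum otherwise. A minimal stand-alone
sketch of that arithmetic, with purely hypothetical numbers:

/* Illustrative sketch only; all numbers are made up. */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    unsigned long max_pdx = 1UL << 22;    /* ~16GiB of RAM in 4k pages */
    unsigned long p2m_pages = 1UL << 15;  /* assumed dom0_paging_pages() outcome */
    unsigned long iommu_pages = 0, reserve;
    bool shared_pt = true;                /* IOMMU re-uses the CPU page tables */
    unsigned int s;

    /* Rough IOMMU page table estimate: one table page per 512 entries per level. */
    for ( s = 9; s < 64; s += 9 )
        iommu_pages += max_pdx >> s;

    /*
     * Shared page tables: one set of tables serves CPU and IOMMU, so the
     * larger of the two estimates suffices.  Separate page tables: both
     * sets get allocated, so their sum needs setting aside.
     */
    reserve = shared_pt ? (p2m_pages > iommu_pages ? p2m_pages : iommu_pages)
                        : p2m_pages + iommu_pages;

    printf("iommu=%lu p2m=%lu reserve=%lu pages\n",
           iommu_pages, p2m_pages, reserve);

    return 0;
}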

--- a/xen/arch/x86/dom0_build.c
+++ b/xen/arch/x86/dom0_build.c
@@ -318,7 +318,7 @@ unsigned long __init dom0_compute_nr_pag
     struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
 {
     nodeid_t node;
-    unsigned long avail = 0, nr_pages, min_pages, max_pages;
+    unsigned long avail = 0, nr_pages, min_pages, max_pages, iommu_pages = 0;
     bool need_paging;
 
     /* The ordering of operands is to work around a clang5 issue. */
@@ -337,18 +337,23 @@ unsigned long __init dom0_compute_nr_pag
         avail -= d->max_vcpus - 1;
 
     /* Reserve memory for iommu_dom0_init() (rough estimate). */
-    if ( is_iommu_enabled(d) )
+    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
     {
         unsigned int s;
 
         for ( s = 9; s < BITS_PER_LONG; s += 9 )
-            avail -= max_pdx >> s;
+            iommu_pages += max_pdx >> s;
+
+        avail -= iommu_pages;
     }
 
-    need_paging = is_hvm_domain(d) &&
-        (!iommu_use_hap_pt(d) || !paging_mode_hap(d));
+    need_paging = is_hvm_domain(d)
+                  ? !iommu_use_hap_pt(d) || !paging_mode_hap(d)
+                  : opt_dom0_shadow;
     for ( ; ; need_paging = false )
     {
+        unsigned long paging_pages;
+
         nr_pages = get_memsize(&dom0_size, avail);
         min_pages = get_memsize(&dom0_min_size, avail);
         max_pages = get_memsize(&dom0_max_size, avail);
@@ -377,11 +382,20 @@ unsigned long __init dom0_compute_nr_pag
         nr_pages = min(nr_pages, max_pages);
         nr_pages = min(nr_pages, avail);
 
-        if ( !need_paging )
-            break;
+        paging_pages = paging_mode_enabled(d) || need_paging
+                       ? dom0_paging_pages(d, nr_pages) : 0;
 
         /* Reserve memory for shadow or HAP. */
-        avail -= dom0_paging_pages(d, nr_pages);
+        if ( !need_paging )
+        {
+            if ( paging_pages <= iommu_pages )
+                break;
+
+            avail -= paging_pages - iommu_pages;
+        }
+        else
+            avail -= paging_pages;
+        iommu_pages = paging_pages;
     }
 
     if ( is_pv_domain(d) &&


Re: [PATCH v3 1/9] x86/PVH: improve Dom0 memory size calculation
Posted by Roger Pau Monné 3 years, 2 months ago
On Tue, Sep 21, 2021 at 09:16:44AM +0200, Jan Beulich wrote:
> Assuming that the accounting for IOMMU page tables will also take care
> of the P2M needs was wrong: dom0_paging_pages() can determine a far
> higher value, high enough for the system to run out of memory while
> setting up Dom0. Hence in the case of shared page tables the larger of
> the two values needs to be used (without shared page tables the sum of
> both continues to be applicable).
> 
> While there, also account for two further aspects in the PV case: With
> "iommu=dom0-passthrough" no IOMMU page tables would get allocated, so
> none need accounting for. And if shadow mode is to be enabled, setting
> aside a suitable amount for the P2M pool to get populated is also
> necessary (i.e. similar to the non-shared-page-tables case of PVH).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/dom0_build.c
> +++ b/xen/arch/x86/dom0_build.c
> @@ -318,7 +318,7 @@ unsigned long __init dom0_compute_nr_pag
>      struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
>  {
>      nodeid_t node;
> -    unsigned long avail = 0, nr_pages, min_pages, max_pages;
> +    unsigned long avail = 0, nr_pages, min_pages, max_pages, iommu_pages = 0;
>      bool need_paging;
>  
>      /* The ordering of operands is to work around a clang5 issue. */
> @@ -337,18 +337,23 @@ unsigned long __init dom0_compute_nr_pag
>          avail -= d->max_vcpus - 1;
>  
>      /* Reserve memory for iommu_dom0_init() (rough estimate). */
> -    if ( is_iommu_enabled(d) )
> +    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
>      {
>          unsigned int s;
>  
>          for ( s = 9; s < BITS_PER_LONG; s += 9 )
> -            avail -= max_pdx >> s;
> +            iommu_pages += max_pdx >> s;
> +
> +        avail -= iommu_pages;
>      }
>  
> -    need_paging = is_hvm_domain(d) &&
> -        (!iommu_use_hap_pt(d) || !paging_mode_hap(d));
> +    need_paging = is_hvm_domain(d)
> +                  ? !iommu_use_hap_pt(d) || !paging_mode_hap(d)
> +                  : opt_dom0_shadow;
>      for ( ; ; need_paging = false )
>      {
> +        unsigned long paging_pages;
> +
>          nr_pages = get_memsize(&dom0_size, avail);
>          min_pages = get_memsize(&dom0_min_size, avail);
>          max_pages = get_memsize(&dom0_max_size, avail);
> @@ -377,11 +382,20 @@ unsigned long __init dom0_compute_nr_pag
>          nr_pages = min(nr_pages, max_pages);
>          nr_pages = min(nr_pages, avail);
>  
> -        if ( !need_paging )
> -            break;
> +        paging_pages = paging_mode_enabled(d) || need_paging
> +                       ? dom0_paging_pages(d, nr_pages) : 0;
>  
>          /* Reserve memory for shadow or HAP. */
> -        avail -= dom0_paging_pages(d, nr_pages);
> +        if ( !need_paging )
> +        {
> +            if ( paging_pages <= iommu_pages )
> +                break;
> +
> +            avail -= paging_pages - iommu_pages;
> +        }
> +        else
> +            avail -= paging_pages;
> +        iommu_pages = paging_pages;
>      }

I always found this loop extremely confusing to reason about. Now that
we account for the iommu page tables using separate logic, do we
really need a loop here?

In fact I would suggest something like:

unsigned long cpu_pages = 0;

if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
{
    unsigned int s;

    for ( s = 9; s < BITS_PER_LONG; s += 9 )
        iommu_pages += max_pdx >> s;
}

[perform all the nr_pages adjustments]

if ( paging_mode_enabled(d) ||
     opt_dom0_shadow /* shadow paging gets enabled later for PV dom0. */ )
    cpu_pages = dom0_paging_pages(d, nr_pages);

if ( is_hvm_domain(d) && iommu_use_hap_pt(d) && paging_mode_hap(d) )
    avail -= max(iommu_pages, cpu_pages);
else
    avail -= cpu_pages + iommu_pages;

There will be a slight overestimation of cpu_pages, as the value
passed in doesn't account for the iommu pages in case they are used,
but it's still better to overestimate than to underestimate.

Roger.

Re: [PATCH v3 1/9] x86/PVH: improve Dom0 memory size calculation
Posted by Jan Beulich 3 years, 2 months ago
On 22.09.2021 13:59, Roger Pau Monné wrote:
> On Tue, Sep 21, 2021 at 09:16:44AM +0200, Jan Beulich wrote:
>> @@ -337,18 +337,23 @@ unsigned long __init dom0_compute_nr_pag
>>          avail -= d->max_vcpus - 1;
>>  
>>      /* Reserve memory for iommu_dom0_init() (rough estimate). */
>> -    if ( is_iommu_enabled(d) )
>> +    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
>>      {
>>          unsigned int s;
>>  
>>          for ( s = 9; s < BITS_PER_LONG; s += 9 )
>> -            avail -= max_pdx >> s;
>> +            iommu_pages += max_pdx >> s;
>> +
>> +        avail -= iommu_pages;
>>      }
>>  
>> -    need_paging = is_hvm_domain(d) &&
>> -        (!iommu_use_hap_pt(d) || !paging_mode_hap(d));
>> +    need_paging = is_hvm_domain(d)
>> +                  ? !iommu_use_hap_pt(d) || !paging_mode_hap(d)
>> +                  : opt_dom0_shadow;
>>      for ( ; ; need_paging = false )
>>      {
>> +        unsigned long paging_pages;
>> +
>>          nr_pages = get_memsize(&dom0_size, avail);
>>          min_pages = get_memsize(&dom0_min_size, avail);
>>          max_pages = get_memsize(&dom0_max_size, avail);
>> @@ -377,11 +382,20 @@ unsigned long __init dom0_compute_nr_pag
>>          nr_pages = min(nr_pages, max_pages);
>>          nr_pages = min(nr_pages, avail);
>>  
>> -        if ( !need_paging )
>> -            break;
>> +        paging_pages = paging_mode_enabled(d) || need_paging
>> +                       ? dom0_paging_pages(d, nr_pages) : 0;
>>  
>>          /* Reserve memory for shadow or HAP. */
>> -        avail -= dom0_paging_pages(d, nr_pages);
>> +        if ( !need_paging )
>> +        {
>> +            if ( paging_pages <= iommu_pages )
>> +                break;
>> +
>> +            avail -= paging_pages - iommu_pages;
>> +        }
>> +        else
>> +            avail -= paging_pages;
>> +        iommu_pages = paging_pages;
>>      }
> 
> I always found this loop extremely confusing to reason about. Now that
> we account for the iommu page tables using separate logic, do we
> really need a loop here?
> 
> In fact I would suggest something like:
> 
> unsigned long cpu_pages = 0;
> 
> if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
> {
>     unsigned int s;
> 
>     for ( s = 9; s < BITS_PER_LONG; s += 9 )
>         iommu_pages += max_pdx >> s;
> }
> 
> [perform all the nr_pages adjustments]
> 
> if ( paging_mode_enabled(d) ||
>      opt_dom0_shadow /* shadow paging gets enabled later for PV dom0. */ )
>     cpu_pages = dom0_paging_pages(d, nr_pages);
> 
> if ( is_hvm_domain(d) && iommu_use_hap_pt(d) && paging_mode_hap(d) )
>     avail -= max(iommu_pages, cpu_pages);
> else
>     avail -= cpu_pages + iommu_pages;
> 
> There will be a slight over estimation of cpu_pages, as the value
> passed in doesn't account for the iommu pages in case they are used,
> but still it's better to over estimate than to under estimate.

Overestimating cpu_pages isn't a primary concern, I agree. But we
want to get the min/max clamping reasonably close. Therefore, while
dropping the loop as you suggest, I've introduced two instances of
that logic, before and after calculating cpu_pages. I'll see to
sending out v4 soonish (provided the result actually works for, in
particular, the case that I needed the fix for in the first place).
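
Roughly the shape I have in mind (untested sketch of the structure
only; the actual v4 may differ):

    unsigned long cpu_pages = 0, iommu_pages = 0;

    /* Reserve memory for iommu_dom0_init() (rough estimate). */
    if ( is_iommu_enabled(d) && !iommu_hwdom_passthrough )
    {
        unsigned int s;

        for ( s = 9; s < BITS_PER_LONG; s += 9 )
            iommu_pages += max_pdx >> s;

        avail -= iommu_pages;
    }

    /* ... first instance of the dom0_size / min / max clamping of nr_pages ... */

    if ( paging_mode_enabled(d) || opt_dom0_shadow )
        cpu_pages = dom0_paging_pages(d, nr_pages);

    if ( is_hvm_domain(d) && iommu_use_hap_pt(d) && paging_mode_hap(d) )
    {
        /* Shared page tables: iommu_pages was already set aside above. */
        if ( cpu_pages > iommu_pages )
            avail -= cpu_pages - iommu_pages;
    }
    else
        avail -= cpu_pages;

    /* ... second instance of the clamping, against the reduced "avail" ... */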

Jan