[PATCH 0/3] introduce and use xvmalloc() et al / shrink x86 xstate area

Jan Beulich posted 3 patches 3 weeks, 1 day ago
Only 0 patches received!

[PATCH 0/3] introduce and use xvmalloc() et al / shrink x86 xstate area

Posted by Jan Beulich 3 weeks, 1 day ago
While these may seem somewhat unrelated, the connection between them
is the middle of the patches

1: mm: introduce xvmalloc() et al and use for grant table allocations
2: x86/xstate: use xvzalloc() for save area allocation
3: x86/xstate: re-size save area when CPUID policy changes

Jan

[PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Jan Beulich 3 weeks, 1 day ago
All of the array allocations in grant_table_init() can exceed a page's
worth of memory, which xmalloc()-based interfaces aren't really suitable
for after boot. Introduce interfaces dynamically switching between
xmalloc() et al and vmalloc() et al, based on requested size, and use
them instead.

All the wrappers in the new header get cloned mostly verbatim from
xmalloc.h, with the sole adjustment to switch unsigned long to size_t
for sizes and to unsigned int for alignments.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -37,7 +37,7 @@
 #include <xen/iommu.h>
 #include <xen/paging.h>
 #include <xen/keyhandler.h>
-#include <xen/vmap.h>
+#include <xen/xvmalloc.h>
 #include <xen/nospec.h>
 #include <xsm/xsm.h>
 #include <asm/flushtlb.h>
@@ -1925,27 +1925,28 @@ int grant_table_init(struct domain *d, i
     d->grant_table = gt;
 
     /* Active grant table. */
-    gt->active = xzalloc_array(struct active_grant_entry *,
-                               max_nr_active_grant_frames(gt));
+    gt->active = xvzalloc_array(struct active_grant_entry *,
+                                max_nr_active_grant_frames(gt));
     if ( gt->active == NULL )
         goto out;
 
     /* Tracking of mapped foreign frames table */
     if ( gt->max_maptrack_frames )
     {
-        gt->maptrack = vzalloc(gt->max_maptrack_frames * sizeof(*gt->maptrack));
+        gt->maptrack = xvzalloc_array(struct grant_mapping *,
+                                      gt->max_maptrack_frames);
         if ( gt->maptrack == NULL )
             goto out;
     }
 
     /* Shared grant table. */
-    gt->shared_raw = xzalloc_array(void *, gt->max_grant_frames);
+    gt->shared_raw = xvzalloc_array(void *, gt->max_grant_frames);
     if ( gt->shared_raw == NULL )
         goto out;
 
     /* Status pages for grant table - for version 2 */
-    gt->status = xzalloc_array(grant_status_t *,
-                               grant_to_status_frames(gt->max_grant_frames));
+    gt->status = xvzalloc_array(grant_status_t *,
+                                grant_to_status_frames(gt->max_grant_frames));
     if ( gt->status == NULL )
         goto out;
 
@@ -3870,19 +3871,19 @@ grant_table_destroy(
 
     for ( i = 0; i < nr_grant_frames(t); i++ )
         free_xenheap_page(t->shared_raw[i]);
-    xfree(t->shared_raw);
+    xvfree(t->shared_raw);
 
     for ( i = 0; i < nr_maptrack_frames(t); i++ )
         free_xenheap_page(t->maptrack[i]);
-    vfree(t->maptrack);
+    xvfree(t->maptrack);
 
     for ( i = 0; i < nr_active_grant_frames(t); i++ )
         free_xenheap_page(t->active[i]);
-    xfree(t->active);
+    xvfree(t->active);
 
     for ( i = 0; i < nr_status_frames(t); i++ )
         free_xenheap_page(t->status[i]);
-    xfree(t->status);
+    xvfree(t->status);
 
     xfree(t);
     d->grant_table = NULL;
--- a/xen/common/vmap.c
+++ b/xen/common/vmap.c
@@ -7,6 +7,7 @@
 #include <xen/spinlock.h>
 #include <xen/types.h>
 #include <xen/vmap.h>
+#include <xen/xvmalloc.h>
 #include <asm/page.h>
 
 static DEFINE_SPINLOCK(vm_lock);
@@ -299,11 +300,29 @@ void *vzalloc(size_t size)
     return p;
 }
 
-void vfree(void *va)
+static void _vfree(const void *va, unsigned int pages, enum vmap_region type)
 {
-    unsigned int i, pages;
+    unsigned int i;
     struct page_info *pg;
     PAGE_LIST_HEAD(pg_list);
+
+    ASSERT(pages);
+
+    for ( i = 0; i < pages; i++ )
+    {
+        pg = vmap_to_page(va + i * PAGE_SIZE);
+        ASSERT(pg);
+        page_list_add(pg, &pg_list);
+    }
+    vunmap(va);
+
+    while ( (pg = page_list_remove_head(&pg_list)) != NULL )
+        free_domheap_page(pg);
+}
+
+void vfree(void *va)
+{
+    unsigned int pages;
     enum vmap_region type = VMAP_DEFAULT;
 
     if ( !va )
@@ -315,18 +334,71 @@ void vfree(void *va)
         type = VMAP_XEN;
         pages = vm_size(va, type);
     }
-    ASSERT(pages);
 
-    for ( i = 0; i < pages; i++ )
+    _vfree(va, pages, type);
+}
+
+void xvfree(void *va)
+{
+    unsigned int pages = vm_size(va, VMAP_DEFAULT);
+
+    if ( pages )
+        _vfree(va, pages, VMAP_DEFAULT);
+    else
+        xfree(va);
+}
+
+void *_xvmalloc(size_t size, unsigned int align)
+{
+    ASSERT(align <= PAGE_SIZE);
+    return size <= PAGE_SIZE ? _xmalloc(size, align) : vmalloc(size);
+}
+
+void *_xvzalloc(size_t size, unsigned int align)
+{
+    ASSERT(align <= PAGE_SIZE);
+    return size <= PAGE_SIZE ? _xzalloc(size, align) : vzalloc(size);
+}
+
+void *_xvrealloc(void *va, size_t size, unsigned int align)
+{
+    size_t pages = vm_size(va, VMAP_DEFAULT);
+    void *ptr;
+
+    ASSERT(align <= PAGE_SIZE);
+
+    if ( !pages )
     {
-        struct page_info *page = vmap_to_page(va + i * PAGE_SIZE);
+        if ( size <= PAGE_SIZE )
+            return _xrealloc(va, size, align);
 
-        ASSERT(page);
-        page_list_add(page, &pg_list);
+        ptr = vmalloc(size);
+        if ( ptr && va && va != ZERO_BLOCK_PTR )
+        {
+            /*
+             * xmalloc-based allocations up to PAGE_SIZE don't cross page
+             * boundaries. Therefore, without needing to know the exact
+             * prior allocation size, simply copy the entire tail of the
+             * page containing the earlier allocation.
+             */
+            memcpy(ptr, va, PAGE_SIZE - PAGE_OFFSET(va));
+            xfree(va);
+        }
+    }
+    else if ( pages == PFN_UP(size) )
+        ptr = va;
+    else
+    {
+        ptr = _xvmalloc(size, align);
+        if ( ptr )
+        {
+            memcpy(ptr, va, min(size, pages << PAGE_SHIFT));
+            vfree(va);
+        }
+        else if ( pages > PFN_UP(size) )
+            ptr = va;
     }
-    vunmap(va);
 
-    while ( (pg = page_list_remove_head(&pg_list)) != NULL )
-        free_domheap_page(pg);
+    return ptr;
 }
 #endif
--- /dev/null
+++ b/xen/include/xen/xvmalloc.h
@@ -0,0 +1,70 @@
+
+#ifndef __XVMALLOC_H__
+#define __XVMALLOC_H__
+
+#include <xen/cache.h>
+#include <xen/types.h>
+
+/*
+ * Xen malloc/free-style interface.
+ */
+
+/* Allocate space for typed object. */
+#define xvmalloc(_type) ((_type *)_xvmalloc(sizeof(_type), __alignof__(_type)))
+#define xvzalloc(_type) ((_type *)_xvzalloc(sizeof(_type), __alignof__(_type)))
+
+/* Allocate space for array of typed objects. */
+#define xvmalloc_array(_type, _num) \
+    ((_type *)_xvmalloc_array(sizeof(_type), __alignof__(_type), _num))
+#define xvzalloc_array(_type, _num) \
+    ((_type *)_xvzalloc_array(sizeof(_type), __alignof__(_type), _num))
+
+/* Allocate space for a structure with a flexible array of typed objects. */
+#define xvzalloc_flex_struct(type, field, nr) \
+    ((type *)_xvzalloc(offsetof(type, field[nr]), __alignof__(type)))
+
+#define xvmalloc_flex_struct(type, field, nr) \
+    ((type *)_xvmalloc(offsetof(type, field[nr]), __alignof__(type)))
+
+/* Re-allocate space for a structure with a flexible array of typed objects. */
+#define xvrealloc_flex_struct(ptr, field, nr)                          \
+    ((typeof(ptr))_xvrealloc(ptr, offsetof(typeof(*(ptr)), field[nr]), \
+                             __alignof__(typeof(*(ptr)))))
+
+/* Allocate untyped storage. */
+#define xvmalloc_bytes(_bytes) _xvmalloc(_bytes, SMP_CACHE_BYTES)
+#define xvzalloc_bytes(_bytes) _xvzalloc(_bytes, SMP_CACHE_BYTES)
+
+/* Free any of the above. */
+extern void xvfree(void *);
+
+/* Free an allocation, and zero the pointer to it. */
+#define XVFREE(p) do { \
+    xvfree(p);         \
+    (p) = NULL;        \
+} while ( false )
+
+/* Underlying functions */
+extern void *_xvmalloc(size_t size, unsigned int align);
+extern void *_xvzalloc(size_t size, unsigned int align);
+extern void *_xvrealloc(void *ptr, size_t size, unsigned int align);
+
+static inline void *_xvmalloc_array(
+    size_t size, unsigned int align, unsigned long num)
+{
+    /* Check for overflow. */
+    if ( size && num > UINT_MAX / size )
+        return NULL;
+    return _xvmalloc(size * num, align);
+}
+
+static inline void *_xvzalloc_array(
+    size_t size, unsigned int align, unsigned long num)
+{
+    /* Check for overflow. */
+    if ( size && num > UINT_MAX / size )
+        return NULL;
+    return _xvzalloc(size * num, align);
+}
+
+#endif /* __XVMALLOC_H__ */


Re: [PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Julien Grall 1 week, 3 days ago
Hi Jan,

On 06/11/2020 07:11, Jan Beulich wrote:
> All of the array allocations in grant_table_init() can exceed a page's
> worth of memory, which xmalloc()-based interfaces aren't really suitable
> for after boot. 

I can see a few reasons why they are not suitable:
   - xmalloc() will try to using an order and then throw memory. This is 
pretty inneficient.
   - xmalloc() will allocate physically contiguous memory

It would be good to clarify which one you refer because none of them are 
really a problem only after boot...

One thing to be aware thought is xv*() are going to be more inefficient 
because they involve touching the page-tables (at least until the work 
to map on-demand the direct map is not merged). In addition, on Arm, 
they will also use only 4K mappings (I have a TODO to fix that).

So I think we will need to be careful when to use xmalloc() vs 
xvalloc(). It might be worth outlining that in the documentation of xv*().

The current use in the grant-table code looks fine to me.

[...]

> --- a/xen/common/vmap.c
> +++ b/xen/common/vmap.c
> @@ -7,6 +7,7 @@
>   #include <xen/spinlock.h>
>   #include <xen/types.h>
>   #include <xen/vmap.h>
> +#include <xen/xvmalloc.h>
>   #include <asm/page.h>
>   
>   static DEFINE_SPINLOCK(vm_lock);
> @@ -299,11 +300,29 @@ void *vzalloc(size_t size)
>       return p;
>   }
>   
> -void vfree(void *va)
> +static void _vfree(const void *va, unsigned int pages, enum vmap_region type)

I don't think "unsigned int" is sufficient to cover big size. AFAICT, 
this is not in a new problem in this code and seems to be a latent issue 
so far.

However, I feel that it is wrong to introduce a new set of allocation 
helpers that contained a flaw fixed in xm*alloc() recently (see  commit 
cf38b4926e2b "xmalloc: guard against integer overflow").

> -    unsigned int i, pages;
> +    unsigned int i;
>       struct page_info *pg;
>       PAGE_LIST_HEAD(pg_list);
> +
> +    ASSERT(pages);
> +
> +    for ( i = 0; i < pages; i++ )
> +    {
> +        pg = vmap_to_page(va + i * PAGE_SIZE);
> +        ASSERT(pg);
> +        page_list_add(pg, &pg_list);
> +    }
> +    vunmap(va);
> +
> +    while ( (pg = page_list_remove_head(&pg_list)) != NULL )
> +        free_domheap_page(pg);
> +}
> +
> +void vfree(void *va)
> +{
> +    unsigned int pages;
>       enum vmap_region type = VMAP_DEFAULT;
>   
>       if ( !va )
> @@ -315,18 +334,71 @@ void vfree(void *va)
>           type = VMAP_XEN;
>           pages = vm_size(va, type);
>       }
> -    ASSERT(pages);
>   
> -    for ( i = 0; i < pages; i++ )
> +    _vfree(va, pages, type);
> +}
> +

[...]

> --- /dev/null
> +++ b/xen/include/xen/xvmalloc.h
> @@ -0,0 +1,70 @@
> +
> +#ifndef __XVMALLOC_H__
> +#define __XVMALLOC_H__
> +
> +#include <xen/cache.h>
> +#include <xen/types.h>
> +
> +/*
> + * Xen malloc/free-style interface.

It would be useful to emphase that they should only be used if the 
caller does *not* need physically contiguous memory.

> + */
> +
> +/* Allocate space for typed object. */
> +#define xvmalloc(_type) ((_type *)_xvmalloc(sizeof(_type), __alignof__(_type)))
> +#define xvzalloc(_type) ((_type *)_xvzalloc(sizeof(_type), __alignof__(_type)))
> +
> +/* Allocate space for array of typed objects. */
> +#define xvmalloc_array(_type, _num) \
> +    ((_type *)_xvmalloc_array(sizeof(_type), __alignof__(_type), _num))
> +#define xvzalloc_array(_type, _num) \
> +    ((_type *)_xvzalloc_array(sizeof(_type), __alignof__(_type), _num))
> +
> +/* Allocate space for a structure with a flexible array of typed objects. */
> +#define xvzalloc_flex_struct(type, field, nr) \
> +    ((type *)_xvzalloc(offsetof(type, field[nr]), __alignof__(type)))
> +
> +#define xvmalloc_flex_struct(type, field, nr) \
> +    ((type *)_xvmalloc(offsetof(type, field[nr]), __alignof__(type)))
> +
> +/* Re-allocate space for a structure with a flexible array of typed objects. */
> +#define xvrealloc_flex_struct(ptr, field, nr)                          \
> +    ((typeof(ptr))_xvrealloc(ptr, offsetof(typeof(*(ptr)), field[nr]), \
> +                             __alignof__(typeof(*(ptr)))))
> +
> +/* Allocate untyped storage. */
> +#define xvmalloc_bytes(_bytes) _xvmalloc(_bytes, SMP_CACHE_BYTES)
> +#define xvzalloc_bytes(_bytes) _xvzalloc(_bytes, SMP_CACHE_BYTES)
> +
> +/* Free any of the above. */
> +extern void xvfree(void *);
> +
> +/* Free an allocation, and zero the pointer to it. */
> +#define XVFREE(p) do { \
> +    xvfree(p);         \
> +    (p) = NULL;        \
> +} while ( false )
> +
> +/* Underlying functions */
> +extern void *_xvmalloc(size_t size, unsigned int align);
> +extern void *_xvzalloc(size_t size, unsigned int align);
> +extern void *_xvrealloc(void *ptr, size_t size, unsigned int align);
> +
> +static inline void *_xvmalloc_array(
> +    size_t size, unsigned int align, unsigned long num)
> +{
> +    /* Check for overflow. */
> +    if ( size && num > UINT_MAX / size )
> +        return NULL;
> +    return _xvmalloc(size * num, align);
> +}
> +
> +static inline void *_xvzalloc_array(
> +    size_t size, unsigned int align, unsigned long num)
> +{
> +    /* Check for overflow. */
> +    if ( size && num > UINT_MAX / size )
> +        return NULL;
> +    return _xvzalloc(size * num, align);
> +}
> +
> +#endif /* __XVMALLOC_H__ */
> 

-- 
Julien Grall

Re: [PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Jan Beulich 1 week, 2 days ago
On 18.11.2020 20:10, Julien Grall wrote:
> On 06/11/2020 07:11, Jan Beulich wrote:
>> All of the array allocations in grant_table_init() can exceed a page's
>> worth of memory, which xmalloc()-based interfaces aren't really suitable
>> for after boot. 
> 
> I can see a few reasons why they are not suitable:
>    - xmalloc() will try to using an order and then throw memory. This is 
> pretty inneficient.

But addressing this inefficiency, while a nice side effect, is
not the goal here.

>    - xmalloc() will allocate physically contiguous memory

This aspect matters here only indirectly: What we care about
avoiding are runtime allocations of non-zero order. The assumption
of how I worded the description is that during boot non-zero-
order allocations can typically be fulfilled and hence aren't a
(general) problem.

> It would be good to clarify which one you refer because none of them are 
> really a problem only after boot...

Given the above, I'm not sure in which way you would see this be
clarified. Any suggestions welcome.

> One thing to be aware thought is xv*() are going to be more inefficient 
> because they involve touching the page-tables (at least until the work 
> to map on-demand the direct map is not merged). In addition, on Arm, 
> they will also use only 4K mappings (I have a TODO to fix that).
> 
> So I think we will need to be careful when to use xmalloc() vs 
> xvalloc(). It might be worth outlining that in the documentation of xv*().

The rule is quite simple and the inefficiencies you mention
shouldn't matter imo: Post-boot there should not be any
implicit allocations of non-zero order. "Implicit" here meaning
to still permit e.g. direct alloc_domheap_pages() invocations,
making apparent at the call site that the aspect of memory
fragmentation was (hopefully) taken into consideration. I'm
actually inclined to suggest (down the road) to have _xmalloc()
no longer fall back to multi-page allocations post-boot, but
instead return NULL.

If you think it is really needed, I can add something like "These
should be used in preference to xmalloc() et al whenever the size
is not known to be constrained to at most a single page" to the
comment at the top of the new header file.

Where the inefficiencies you mention would imo matter is in
(future) decisions whether to use vmalloc() et al vs xvmalloc()
et al: If the size _may_ be no more than a page, the latter may
want preferring.

>> --- a/xen/common/vmap.c
>> +++ b/xen/common/vmap.c
>> @@ -7,6 +7,7 @@
>>   #include <xen/spinlock.h>
>>   #include <xen/types.h>
>>   #include <xen/vmap.h>
>> +#include <xen/xvmalloc.h>
>>   #include <asm/page.h>
>>   
>>   static DEFINE_SPINLOCK(vm_lock);
>> @@ -299,11 +300,29 @@ void *vzalloc(size_t size)
>>       return p;
>>   }
>>   
>> -void vfree(void *va)
>> +static void _vfree(const void *va, unsigned int pages, enum vmap_region type)
> 
> I don't think "unsigned int" is sufficient to cover big size. AFAICT, 
> this is not in a new problem in this code and seems to be a latent issue 
> so far.
> 
> However, I feel that it is wrong to introduce a new set of allocation 
> helpers that contained a flaw fixed in xm*alloc() recently (see  commit 
> cf38b4926e2b "xmalloc: guard against integer overflow").

For _xmalloc() we're talking about bytes (and the guarding you
refer to is actually orthogonal to the limiting done by the
page allocator, as follows from the description of that change).
Here we're talking about pages. I hope it will be decades until we
have to consider allocating 16Tb all in one chunk (and we'd need
to have large enough vmap virtual address space set aside first).
Also note that
- the entire vmap.c right now uses unsigned int for page counts,
  so it would be outright inconsistent to use unsigned long here,
- at least on x86 passing around 64-bit function arguments is
  slightly less efficient than 32-bit ones, and hence I'd prefer
  to keep it like this.

>> --- /dev/null
>> +++ b/xen/include/xen/xvmalloc.h
>> @@ -0,0 +1,70 @@
>> +
>> +#ifndef __XVMALLOC_H__
>> +#define __XVMALLOC_H__
>> +
>> +#include <xen/cache.h>
>> +#include <xen/types.h>
>> +
>> +/*
>> + * Xen malloc/free-style interface.
> 
> It would be useful to emphase that they should only be used if the 
> caller does *not* need physically contiguous memory.

Actually first of all I shouldn't have copied to comment without
editing. I've now made it

/*
 * Xen malloc/free-style interface preferable for allocations possibly
 * exceeding a page's worth of memory, as long as there's no need to have
 * physically contiguous memory allocated.
 */

albeit I'm not sure the "physically discontiguous" really needs
pointing out, considering the 'v' in the function names.

Jan

Re: [PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Julien Grall 1 week, 2 days ago
Hi,

On 19/11/2020 09:46, Jan Beulich wrote:
> On 18.11.2020 20:10, Julien Grall wrote:
>> On 06/11/2020 07:11, Jan Beulich wrote:
>>> All of the array allocations in grant_table_init() can exceed a page's
>>> worth of memory, which xmalloc()-based interfaces aren't really suitable
>>> for after boot.
>>
>> I can see a few reasons why they are not suitable:
>>     - xmalloc() will try to using an order and then throw memory. This is
>> pretty inneficient.
> 
> But addressing this inefficiency, while a nice side effect, is
> not the goal here.
> 
>>     - xmalloc() will allocate physically contiguous memory
> 
> This aspect matters here only indirectly: What we care about
> avoiding are runtime allocations of non-zero order. The assumption
> of how I worded the description is that during boot non-zero-
> order allocations can typically be fulfilled and hence aren't a
> (general) problem.
Well... In the case of the grant table, if you can't find a small order 
of physically contiguous pages then you have bigger trouble on your 
platform. You will either not have enough space for the allocating the 
domain memory, or the performance will be awful because only 4K pages 
are used.

So while I agree that having xvmalloc() is a good move, I am not 
convinced of your argument regarding the boot vs runtime.

I think a better argument is the allocation doesn't need to be 
physically contiguous in memory. So better avoid it when we can.

> 
>> It would be good to clarify which one you refer because none of them are
>> really a problem only after boot...
> 
> Given the above, I'm not sure in which way you would see this be
> clarified. Any suggestions welcome.
> 
>> One thing to be aware thought is xv*() are going to be more inefficient
>> because they involve touching the page-tables (at least until the work
>> to map on-demand the direct map is not merged). In addition, on Arm,
>> they will also use only 4K mappings (I have a TODO to fix that).
>>
>> So I think we will need to be careful when to use xmalloc() vs
>> xvalloc(). It might be worth outlining that in the documentation of xv*().
> 
> The rule is quite simple and the inefficiencies you mention
> shouldn't matter imo: Post-boot there should not be any
> implicit allocations of non-zero order. "Implicit" here meaning
> to still permit e.g. direct alloc_domheap_pages() invocations,
> making apparent at the call site that the aspect of memory
> fragmentation was (hopefully) taken into consideration. I'm
> actually inclined to suggest (down the road) to have _xmalloc()
> no longer fall back to multi-page allocations post-boot, but
> instead return NULL.

One advantage of xmalloc() is it is able to allocate a suitable xenheap 
area. So it will not touch the page-tables and therefore useful for 
short-life allocation as the overhead will be more limited compare to 
xvalloc().

There is also the problem that alloc_{dom, xen}heap_pages() works using 
order. xmalloc() is handy because it will give back the unnecessary pages.

Maybe we should consider a version of alloc_*heap_pages() that will take 
the number of pages rather than order.

> 
> If you think it is really needed, I can add something like "These
> should be used in preference to xmalloc() et al whenever the size
> is not known to be constrained to at most a single page" to the
> comment at the top of the new header file.

There are quite a few users of xmalloc() with large allocations. Yet, 
they would not be suitable for xvalloc() because they require physically 
contiguous memory. So I think you would want to mention that in the 
sentence.

> Where the inefficiencies you mention would imo matter is in
> (future) decisions whether to use vmalloc() et al vs xvmalloc()
> et al: If the size _may_ be no more than a page, the latter may
> want preferring.
I am not sure to understand this... why would we want to keep vmalloc() 
extern when xvalloc() will be calling it for allocation over a PAGE_SIZE?

> 
>>> --- a/xen/common/vmap.c
>>> +++ b/xen/common/vmap.c
>>> @@ -7,6 +7,7 @@
>>>    #include <xen/spinlock.h>
>>>    #include <xen/types.h>
>>>    #include <xen/vmap.h>
>>> +#include <xen/xvmalloc.h>
>>>    #include <asm/page.h>
>>>    
>>>    static DEFINE_SPINLOCK(vm_lock);
>>> @@ -299,11 +300,29 @@ void *vzalloc(size_t size)
>>>        return p;
>>>    }
>>>    
>>> -void vfree(void *va)
>>> +static void _vfree(const void *va, unsigned int pages, enum vmap_region type)
>>
>> I don't think "unsigned int" is sufficient to cover big size. AFAICT,
>> this is not in a new problem in this code and seems to be a latent issue
>> so far.
>>
>> However, I feel that it is wrong to introduce a new set of allocation
>> helpers that contained a flaw fixed in xm*alloc() recently (see  commit
>> cf38b4926e2b "xmalloc: guard against integer overflow").
> 
> For _xmalloc() we're talking about bytes (and the guarding you
> refer to is actually orthogonal to the limiting done by the
> page allocator, as follows from the description of that change).
> Here we're talking about pages. I hope it will be decades until we
> have to consider allocating 16Tb all in one chunk (and we'd need
> to have large enough vmap virtual address space set aside first).

I think you misunderstood my point here. I am not suggesting that a 
normal user would ask to allocate 16TB but that a caller may pass by 
mistake an unsanitized value to xv*() functions.

IIRC, the overflow check in xm*() were added after we discovered that 
some callers where passing unsanitized values.

I would expect xv*() functions to be more used in the future, so I think 
it would be unwise to not guard against overflow.

I would be happy with just checking that nr always fit in a 32-bit value.

> Also note that
> - the entire vmap.c right now uses unsigned int for page counts,
>    so it would be outright inconsistent to use unsigned long here,

I didn't suggest this would be the only place (note that "new problem"). 
This was the best place I could find to mention an existing problem that 
is widened with the introduction of xv*() helpers.

> - at least on x86 passing around 64-bit function arguments is
>    slightly less efficient than 32-bit ones, and hence I'd prefer
>    to keep it like this.

Don't you have 64-bit registers on x86-64?

But, I am really surprised this is a concern to you when all the 
functions in this code will modify the pages tables. You dismissed this 
overhead in the same e-mail...

> 
>>> --- /dev/null
>>> +++ b/xen/include/xen/xvmalloc.h
>>> @@ -0,0 +1,70 @@
>>> +
>>> +#ifndef __XVMALLOC_H__
>>> +#define __XVMALLOC_H__
>>> +
>>> +#include <xen/cache.h>
>>> +#include <xen/types.h>
>>> +
>>> +/*
>>> + * Xen malloc/free-style interface.
>>
>> It would be useful to emphase that they should only be used if the
>> caller does *not* need physically contiguous memory.
> 
> Actually first of all I shouldn't have copied to comment without
> editing. I've now made it
> 
> /*
>   * Xen malloc/free-style interface preferable for allocations possibly
>   * exceeding a page's worth of memory, as long as there's no need to have
>   * physically contiguous memory allocated.
>   */
> 
> albeit I'm not sure the "physically discontiguous" really needs
> pointing out, considering the 'v' in the function names.

Verbosity never hurt. I would not say the same with figuring out what 
'v' means.

I am not ashame to say that when began to work on Linux/Xen, I had some 
trouble to figure out the difference between kmalloc() and vmalloc().

Cheers,

-- 
Julien Grall

Re: [PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Jan Beulich 1 week, 2 days ago
On 19.11.2020 11:59, Julien Grall wrote:
> On 19/11/2020 09:46, Jan Beulich wrote:
>> On 18.11.2020 20:10, Julien Grall wrote:
>>> On 06/11/2020 07:11, Jan Beulich wrote:
>>>> All of the array allocations in grant_table_init() can exceed a page's
>>>> worth of memory, which xmalloc()-based interfaces aren't really suitable
>>>> for after boot.
>>>
>>> I can see a few reasons why they are not suitable:
>>>     - xmalloc() will try to using an order and then throw memory. This is
>>> pretty inneficient.
>>
>> But addressing this inefficiency, while a nice side effect, is
>> not the goal here.
>>
>>>     - xmalloc() will allocate physically contiguous memory
>>
>> This aspect matters here only indirectly: What we care about
>> avoiding are runtime allocations of non-zero order. The assumption
>> of how I worded the description is that during boot non-zero-
>> order allocations can typically be fulfilled and hence aren't a
>> (general) problem.
> Well... In the case of the grant table, if you can't find a small order 
> of physically contiguous pages then you have bigger trouble on your 
> platform. You will either not have enough space for the allocating the 
> domain memory, or the performance will be awful because only 4K pages 
> are used.

Performance will be affected, yes, but I'm not sure I'd call this
"awful". 

> So while I agree that having xvmalloc() is a good move, I am not 
> convinced of your argument regarding the boot vs runtime.
> 
> I think a better argument is the allocation doesn't need to be 
> physically contiguous in memory. So better avoid it when we can.

Well... I've added a sentence.

>>> It would be good to clarify which one you refer because none of them are
>>> really a problem only after boot...
>>
>> Given the above, I'm not sure in which way you would see this be
>> clarified. Any suggestions welcome.
>>
>>> One thing to be aware thought is xv*() are going to be more inefficient
>>> because they involve touching the page-tables (at least until the work
>>> to map on-demand the direct map is not merged). In addition, on Arm,
>>> they will also use only 4K mappings (I have a TODO to fix that).
>>>
>>> So I think we will need to be careful when to use xmalloc() vs
>>> xvalloc(). It might be worth outlining that in the documentation of xv*().
>>
>> The rule is quite simple and the inefficiencies you mention
>> shouldn't matter imo: Post-boot there should not be any
>> implicit allocations of non-zero order. "Implicit" here meaning
>> to still permit e.g. direct alloc_domheap_pages() invocations,
>> making apparent at the call site that the aspect of memory
>> fragmentation was (hopefully) taken into consideration. I'm
>> actually inclined to suggest (down the road) to have _xmalloc()
>> no longer fall back to multi-page allocations post-boot, but
>> instead return NULL.
> 
> One advantage of xmalloc() is it is able to allocate a suitable xenheap 
> area. So it will not touch the page-tables and therefore useful for 
> short-life allocation as the overhead will be more limited compare to 
> xvalloc().

Yet it still shouldn't be used post-boot when the size may exceed
system page size, to avoid reporting -ENOMEM or alike when really
there's ample but fragmented memory available.

> There is also the problem that alloc_{dom, xen}heap_pages() works using 
> order. xmalloc() is handy because it will give back the unnecessary pages.
> 
> Maybe we should consider a version of alloc_*heap_pages() that will take 
> the number of pages rather than order.

Possibly, I've indeed been wondering more than once whether we should.

>> If you think it is really needed, I can add something like "These
>> should be used in preference to xmalloc() et al whenever the size
>> is not known to be constrained to at most a single page" to the
>> comment at the top of the new header file.
> 
> There are quite a few users of xmalloc() with large allocations. Yet, 
> they would not be suitable for xvalloc() because they require physically 
> contiguous memory.

Isn't this a mistake? I.e. am I unaware of some comment saying that
xmalloc() actually guarantees to return physically contiguous memory?
It's certainly against the "Xen malloc/free-style interface" comment
at the top of the header.

It was my understanding so far that with the removal of the direct-
map this property would go away anyway.

> So I think you would want to mention that in the 
> sentence.

Well, as you've seen further down the comment already mentions that
aspect.

>> Where the inefficiencies you mention would imo matter is in
>> (future) decisions whether to use vmalloc() et al vs xvmalloc()
>> et al: If the size _may_ be no more than a page, the latter may
>> want preferring.
> I am not sure to understand this... why would we want to keep vmalloc() 
> extern when xvalloc() will be calling it for allocation over a PAGE_SIZE?

Why force callers knowing they'll allocate more than a page to go
through the extra layer? If that was the plan, then we wouldn't
need a set of new functions, but could instead tweak vmalloc() etc.

>>>> --- a/xen/common/vmap.c
>>>> +++ b/xen/common/vmap.c
>>>> @@ -7,6 +7,7 @@
>>>>    #include <xen/spinlock.h>
>>>>    #include <xen/types.h>
>>>>    #include <xen/vmap.h>
>>>> +#include <xen/xvmalloc.h>
>>>>    #include <asm/page.h>
>>>>    
>>>>    static DEFINE_SPINLOCK(vm_lock);
>>>> @@ -299,11 +300,29 @@ void *vzalloc(size_t size)
>>>>        return p;
>>>>    }
>>>>    
>>>> -void vfree(void *va)
>>>> +static void _vfree(const void *va, unsigned int pages, enum vmap_region type)
>>>
>>> I don't think "unsigned int" is sufficient to cover big size. AFAICT,
>>> this is not in a new problem in this code and seems to be a latent issue
>>> so far.
>>>
>>> However, I feel that it is wrong to introduce a new set of allocation
>>> helpers that contained a flaw fixed in xm*alloc() recently (see  commit
>>> cf38b4926e2b "xmalloc: guard against integer overflow").
>>
>> For _xmalloc() we're talking about bytes (and the guarding you
>> refer to is actually orthogonal to the limiting done by the
>> page allocator, as follows from the description of that change).
>> Here we're talking about pages. I hope it will be decades until we
>> have to consider allocating 16Tb all in one chunk (and we'd need
>> to have large enough vmap virtual address space set aside first).
> 
> I think you misunderstood my point here. I am not suggesting that a 
> normal user would ask to allocate 16TB but that a caller may pass by 
> mistake an unsanitized value to xv*() functions.
> 
> IIRC, the overflow check in xm*() were added after we discovered that 
> some callers where passing unsanitized values.
> 
> I would expect xv*() functions to be more used in the future, so I think 
> it would be unwise to not guard against overflow.
> 
> I would be happy with just checking that nr always fit in a 32-bit value.

The two callers of the function obtain the value from vm_size(),
which returns unsigned int.

>> Also note that
>> - the entire vmap.c right now uses unsigned int for page counts,
>>    so it would be outright inconsistent to use unsigned long here,
> 
> I didn't suggest this would be the only place (note that "new problem"). 
> This was the best place I could find to mention an existing problem that 
> is widened with the introduction of xv*() helpers.

Oh, so you're talking of a separate and entirely unrelated patch
making sure the existing vmalloc() won't suffer such a problem.
Yes, vmalloc_type() could be fixed to this effect. But do you
realize we'd have a security issue much earlier if any guest
action could lead to such a gigantic vmalloc(), as the time to
both allocate and then map 4 billion pages is going to be way
longer than what we may tolerate without preempting?

And no, there's no overflowing calculation anywhere afaics which
would resemble the ones in xmalloc() you refer to.

>> - at least on x86 passing around 64-bit function arguments is
>>    slightly less efficient than 32-bit ones, and hence I'd prefer
>>    to keep it like this.
> 
> Don't you have 64-bit registers on x86-64?

Yes, yet to operate on them most insns need an extra prefix
byte.

> But, I am really surprised this is a concern to you when all the 
> functions in this code will modify the pages tables. You dismissed this 
> overhead in the same e-mail...

Entirely different considerations: The goal of limiting variable
(and parameter) types to 32 bits where possible is a generic one.
Which is, if for nothing else, to avoid introducing bad precedent.

Jan

Re: [PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Julien Grall 1 week, 2 days ago

On 19/11/2020 12:33, Jan Beulich wrote:
> On 19.11.2020 11:59, Julien Grall wrote:
>> On 19/11/2020 09:46, Jan Beulich wrote:
>>> On 18.11.2020 20:10, Julien Grall wrote:
>>>> On 06/11/2020 07:11, Jan Beulich wrote:
>>>>> All of the array allocations in grant_table_init() can exceed a page's
>>>>> worth of memory, which xmalloc()-based interfaces aren't really suitable
>>>>> for after boot.
>>>>
>>>> I can see a few reasons why they are not suitable:
>>>>      - xmalloc() will try to using an order and then throw memory. This is
>>>> pretty inneficient.
>>>
>>> But addressing this inefficiency, while a nice side effect, is
>>> not the goal here.
>>>
>>>>      - xmalloc() will allocate physically contiguous memory
>>>
>>> This aspect matters here only indirectly: What we care about
>>> avoiding are runtime allocations of non-zero order. The assumption
>>> of how I worded the description is that during boot non-zero-
>>> order allocations can typically be fulfilled and hence aren't a
>>> (general) problem.
>> Well... In the case of the grant table, if you can't find a small order
>> of physically contiguous pages then you have bigger trouble on your
>> platform. You will either not have enough space for the allocating the
>> domain memory, or the performance will be awful because only 4K pages
>> are used.
> 
> Performance will be affected, yes, but I'm not sure I'd call this
> "awful".

Performance is always subjective...

> 
>> So while I agree that having xvmalloc() is a good move, I am not
>> convinced of your argument regarding the boot vs runtime.
>>
>> I think a better argument is the allocation doesn't need to be
>> physically contiguous in memory. So better avoid it when we can.
> 
> Well... I've added a sentence.
> 
>>>> It would be good to clarify which one you refer because none of them are
>>>> really a problem only after boot...
>>>
>>> Given the above, I'm not sure in which way you would see this be
>>> clarified. Any suggestions welcome.
>>>
>>>> One thing to be aware thought is xv*() are going to be more inefficient
>>>> because they involve touching the page-tables (at least until the work
>>>> to map on-demand the direct map is not merged). In addition, on Arm,
>>>> they will also use only 4K mappings (I have a TODO to fix that).
>>>>
>>>> So I think we will need to be careful when to use xmalloc() vs
>>>> xvalloc(). It might be worth outlining that in the documentation of xv*().
>>>
>>> The rule is quite simple and the inefficiencies you mention
>>> shouldn't matter imo: Post-boot there should not be any
>>> implicit allocations of non-zero order. "Implicit" here meaning
>>> to still permit e.g. direct alloc_domheap_pages() invocations,
>>> making apparent at the call site that the aspect of memory
>>> fragmentation was (hopefully) taken into consideration. I'm
>>> actually inclined to suggest (down the road) to have _xmalloc()
>>> no longer fall back to multi-page allocations post-boot, but
>>> instead return NULL.
>>
>> One advantage of xmalloc() is it is able to allocate a suitable xenheap
>> area. So it will not touch the page-tables and therefore useful for
>> short-life allocation as the overhead will be more limited compare to
>> xvalloc().
> 
> Yet it still shouldn't be used post-boot when the size may exceed
> system page size, to avoid reporting -ENOMEM or alike when really
> there's ample but fragmented memory available.
> 
>> There is also the problem that alloc_{dom, xen}heap_pages() works using
>> order. xmalloc() is handy because it will give back the unnecessary pages.
>>
>> Maybe we should consider a version of alloc_*heap_pages() that will take
>> the number of pages rather than order.
> 
> Possibly, I've indeed been wondering more than once whether we should.
> 
>>> If you think it is really needed, I can add something like "These
>>> should be used in preference to xmalloc() et al whenever the size
>>> is not known to be constrained to at most a single page" to the
>>> comment at the top of the new header file.
>>
>> There are quite a few users of xmalloc() with large allocations. Yet,
>> they would not be suitable for xvalloc() because they require physically
>> contiguous memory.
> 
> Isn't this a mistake? I.e. am I unaware of some comment saying that
> xmalloc() actually guarantees to return physically contiguous memory?

Well, we have been pretty bad at documenting code... So some of us may 
have inferred some behavior from xmalloc().

This is also why I requested to make more explicit what 'v' means.

However...

> It's certainly against the "Xen malloc/free-style interface" comment
> at the top of the header.

... if you consider it as a mistaken, then why did you introduce 
xvalloc() rather than modifying the implementation of xmalloc()?

> 
> It was my understanding so far that with the removal of the direct-
> map this property would go away anyway.

Direct-map is going to disappear on x86, but there are no plan for that 
on Arm so far.

I am not saying I don't want to remove it, I just want to point out that 
decision should not be made solely based on what x86 is doing (see more 
below).

> 
>> So I think you would want to mention that in the
>> sentence.
> 
> Well, as you've seen further down the comment already mentions that
> aspect.
> 
>>> Where the inefficiencies you mention would imo matter is in
>>> (future) decisions whether to use vmalloc() et al vs xvmalloc()
>>> et al: If the size _may_ be no more than a page, the latter may
>>> want preferring.
>> I am not sure to understand this... why would we want to keep vmalloc()
>> extern when xvalloc() will be calling it for allocation over a PAGE_SIZE?
> 
> Why force callers knowing they'll allocate more than a page to go
> through the extra layer? If that was the plan, then we wouldn't
> need a set of new functions, but could instead tweak vmalloc() etc.

Two reasons:
   1) There are too many ways to allocate memory in Xen. This adds 
extra-confusion to use.
   2) The impact of the extra check is going to be insignificant compare 
to the cost of the function. Feel free to prove me otherwise with numbers.

> 
>>>>> --- a/xen/common/vmap.c
>>>>> +++ b/xen/common/vmap.c
>>>>> @@ -7,6 +7,7 @@
>>>>>     #include <xen/spinlock.h>
>>>>>     #include <xen/types.h>
>>>>>     #include <xen/vmap.h>
>>>>> +#include <xen/xvmalloc.h>
>>>>>     #include <asm/page.h>
>>>>>     
>>>>>     static DEFINE_SPINLOCK(vm_lock);
>>>>> @@ -299,11 +300,29 @@ void *vzalloc(size_t size)
>>>>>         return p;
>>>>>     }
>>>>>     
>>>>> -void vfree(void *va)
>>>>> +static void _vfree(const void *va, unsigned int pages, enum vmap_region type)
>>>>
>>>> I don't think "unsigned int" is sufficient to cover big size. AFAICT,
>>>> this is not in a new problem in this code and seems to be a latent issue
>>>> so far.
>>>>
>>>> However, I feel that it is wrong to introduce a new set of allocation
>>>> helpers that contained a flaw fixed in xm*alloc() recently (see  commit
>>>> cf38b4926e2b "xmalloc: guard against integer overflow").
>>>
>>> For _xmalloc() we're talking about bytes (and the guarding you
>>> refer to is actually orthogonal to the limiting done by the
>>> page allocator, as follows from the description of that change).
>>> Here we're talking about pages. I hope it will be decades until we
>>> have to consider allocating 16Tb all in one chunk (and we'd need
>>> to have large enough vmap virtual address space set aside first).
>>
>> I think you misunderstood my point here. I am not suggesting that a
>> normal user would ask to allocate 16TB but that a caller may pass by
>> mistake an unsanitized value to xv*() functions.
>>
>> IIRC, the overflow check in xm*() were added after we discovered that
>> some callers where passing unsanitized values.
>>
>> I would expect xv*() functions to be more used in the future, so I think
>> it would be unwise to not guard against overflow.
>>
>> I would be happy with just checking that nr always fit in a 32-bit value.
> 
> The two callers of the function obtain the value from vm_size(),
> which returns unsigned int.

I can't see a use of vm_size() in vmalloc_type(). I can only see a 
implicit downcast.

> 
>>> Also note that
>>> - the entire vmap.c right now uses unsigned int for page counts,
>>>     so it would be outright inconsistent to use unsigned long here,
>>
>> I didn't suggest this would be the only place (note that "new problem").
>> This was the best place I could find to mention an existing problem that
>> is widened with the introduction of xv*() helpers.
> 
> Oh, so you're talking of a separate and entirely unrelated patch
> making sure the existing vmalloc() won't suffer such a problem.
> Yes, vmalloc_type() could be fixed to this effect. But do you
> realize we'd have a security issue much earlier if any guest
> action could lead to such a gigantic vmalloc(), as the time to
> both allocate and then map 4 billion pages is going to be way
> longer than what we may tolerate without preempting?

Yes I missed that point. But I am not sure where you are trying to infer...

If it wasn't clear enough, I didn't suggest to fix in this patch. I am 
only pointed out that we hardened _xmalloc() and this looks like going 
backwards.

> 
> And no, there's no overflowing calculation anywhere afaics which
> would resemble the ones in xmalloc() you refer to.

"overflow" was probably the wrong word. It would be more a downcast when 
computing the number of pages.

__vmap() is taking an "unsigned int", yet the number of pages is 
computed using size_t.

>>> - at least on x86 passing around 64-bit function arguments is
>>>     slightly less efficient than 32-bit ones, and hence I'd prefer
>>>     to keep it like this.
>>
>> Don't you have 64-bit registers on x86-64?
> 
> Yes, yet to operate on them most insns need an extra prefix
> byte.

... Thankfully we have only 2 architectures to care... Otherwise we 
would never be able to change common code without bikeshedding on 
micro-optimization.

> 
>> But, I am really surprised this is a concern to you when all the
>> functions in this code will modify the pages tables. You dismissed this
>> overhead in the same e-mail...
> 
> Entirely different considerations: The goal of limiting variable
> (and parameter) types to 32 bits where possible is a generic one.

At the cost of introducing multiple implicit downcast that one day or 
another is going to bite us.

> Which is, if for nothing else, to avoid introducing bad precedent.

I am ok with 32-bit internal value, but please at least check the 
downcasting is going to be harmless.

Cheers,

-- 
Julien Grall

Re: [PATCH 1/3] mm: introduce xvmalloc() et al and use for grant table allocations

Posted by Jan Beulich 1 week, 2 days ago
On 19.11.2020 14:19, Julien Grall wrote:
> On 19/11/2020 12:33, Jan Beulich wrote:
>> It's certainly against the "Xen malloc/free-style interface" comment
>> at the top of the header.
> 
> ... if you consider it as a mistaken, then why did you introduce 
> xvalloc() rather than modifying the implementation of xmalloc()?

(a) it didn't occur to me as an option and (b) even if it did, I wouldn't
have wanted to go audit all use sites.

>> It was my understanding so far that with the removal of the direct-
>> map this property would go away anyway.
> 
> Direct-map is going to disappear on x86, but there are no plan for that 
> on Arm so far.
> 
> I am not saying I don't want to remove it, I just want to point out that 
> decision should not be made solely based on what x86 is doing (see more 
> below).

I didn't mean to imply anything like this. I was merely tryin to point
out that there may be a point in time, not too far in the future,
when any such assumption may turn out broken. You said there are a
number of such uses; I don't think I'm aware of any. Is what you're
aware of all in Arm code?

>>>> Where the inefficiencies you mention would imo matter is in
>>>> (future) decisions whether to use vmalloc() et al vs xvmalloc()
>>>> et al: If the size _may_ be no more than a page, the latter may
>>>> want preferring.
>>> I am not sure to understand this... why would we want to keep vmalloc()
>>> extern when xvalloc() will be calling it for allocation over a PAGE_SIZE?
>>
>> Why force callers knowing they'll allocate more than a page to go
>> through the extra layer? If that was the plan, then we wouldn't
>> need a set of new functions, but could instead tweak vmalloc() etc.
> 
> Two reasons:
>    1) There are too many ways to allocate memory in Xen. This adds 
> extra-confusion to use.

Maybe; I wouldn't have thought so.

>    2) The impact of the extra check is going to be insignificant compare 
> to the cost of the function. Feel free to prove me otherwise with numbers.

My point wasn't primarily (if at all) about performance, but about
call layering in general.

>>>>>> --- a/xen/common/vmap.c
>>>>>> +++ b/xen/common/vmap.c
>>>>>> @@ -7,6 +7,7 @@
>>>>>>     #include <xen/spinlock.h>
>>>>>>     #include <xen/types.h>
>>>>>>     #include <xen/vmap.h>
>>>>>> +#include <xen/xvmalloc.h>
>>>>>>     #include <asm/page.h>
>>>>>>     
>>>>>>     static DEFINE_SPINLOCK(vm_lock);
>>>>>> @@ -299,11 +300,29 @@ void *vzalloc(size_t size)
>>>>>>         return p;
>>>>>>     }
>>>>>>     
>>>>>> -void vfree(void *va)
>>>>>> +static void _vfree(const void *va, unsigned int pages, enum vmap_region type)
>>>>>
>>>>> I don't think "unsigned int" is sufficient to cover big size. AFAICT,
>>>>> this is not in a new problem in this code and seems to be a latent issue
>>>>> so far.
>>>>>
>>>>> However, I feel that it is wrong to introduce a new set of allocation
>>>>> helpers that contained a flaw fixed in xm*alloc() recently (see  commit
>>>>> cf38b4926e2b "xmalloc: guard against integer overflow").
>>>>
>>>> For _xmalloc() we're talking about bytes (and the guarding you
>>>> refer to is actually orthogonal to the limiting done by the
>>>> page allocator, as follows from the description of that change).
>>>> Here we're talking about pages. I hope it will be decades until we
>>>> have to consider allocating 16Tb all in one chunk (and we'd need
>>>> to have large enough vmap virtual address space set aside first).
>>>
>>> I think you misunderstood my point here. I am not suggesting that a
>>> normal user would ask to allocate 16TB but that a caller may pass by
>>> mistake an unsanitized value to xv*() functions.
>>>
>>> IIRC, the overflow check in xm*() were added after we discovered that
>>> some callers where passing unsanitized values.
>>>
>>> I would expect xv*() functions to be more used in the future, so I think
>>> it would be unwise to not guard against overflow.
>>>
>>> I would be happy with just checking that nr always fit in a 32-bit value.
>>
>> The two callers of the function obtain the value from vm_size(),
>> which returns unsigned int.
> 
> I can't see a use of vm_size() in vmalloc_type(). I can only see a 
> implicit downcast.

My "the function" was still referring to your initial comment, which
was about _vfree()'s parameter type.

>>>> Also note that
>>>> - the entire vmap.c right now uses unsigned int for page counts,
>>>>     so it would be outright inconsistent to use unsigned long here,
>>>
>>> I didn't suggest this would be the only place (note that "new problem").
>>> This was the best place I could find to mention an existing problem that
>>> is widened with the introduction of xv*() helpers.
>>
>> Oh, so you're talking of a separate and entirely unrelated patch
>> making sure the existing vmalloc() won't suffer such a problem.
>> Yes, vmalloc_type() could be fixed to this effect. But do you
>> realize we'd have a security issue much earlier if any guest
>> action could lead to such a gigantic vmalloc(), as the time to
>> both allocate and then map 4 billion pages is going to be way
>> longer than what we may tolerate without preempting?
> 
> Yes I missed that point. But I am not sure where you are trying to infer...
> 
> If it wasn't clear enough, I didn't suggest to fix in this patch.

Okay, this hadn't become clear to me at all. It was my understanding
that you're requesting changes to be made in this patch.

> I am 
> only pointed out that we hardened _xmalloc() and this looks like going 
> backwards.

I'm not seeing any step backwards. If there's an issue with vmalloc()
that we think needs addressing, let's address it. The patch here only
layers around existing xmalloc() / vmalloc(); I haven't managed to
spot any overflow or truncation issue in it, and I don't think I've
seen you point out any. Quite the contrary, I think I could relax
the checking in _xv{m,z}alloc_array() (but I guess I won't for now).

>> And no, there's no overflowing calculation anywhere afaics which
>> would resemble the ones in xmalloc() you refer to.
> 
> "overflow" was probably the wrong word. It would be more a downcast when 
> computing the number of pages.
> 
> __vmap() is taking an "unsigned int", yet the number of pages is 
> computed using size_t.

Yes, that's what I assume you referred to further up as implicit
downcast. And that's also the one place I spotted when trying to
understand your earlier comments.

>>> But, I am really surprised this is a concern to you when all the
>>> functions in this code will modify the pages tables. You dismissed this
>>> overhead in the same e-mail...
>>
>> Entirely different considerations: The goal of limiting variable
>> (and parameter) types to 32 bits where possible is a generic one.
> 
> At the cost of introducing multiple implicit downcast that one day or 
> another is going to bite us.
> 
>> Which is, if for nothing else, to avoid introducing bad precedent.
> 
> I am ok with 32-bit internal value, but please at least check the 
> downcasting is going to be harmless.

So here things get confusing again: Further up you've just said 
"I didn't suggest to fix in this patch". Here it sounds again as
if you were expecting this to be fixed here and now. So I guess
you may have meant to ask that I add a prereq patch addressing
this?

Jan

[PATCH 2/3] x86/xstate: use xvzalloc() for save area allocation

Posted by Jan Beulich 3 weeks, 1 day ago
This is in preparation for the area size exceeding a page's worth of
space, as will happen with AMX as well as Architectural LBR.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -8,6 +8,7 @@
 #include <xen/param.h>
 #include <xen/percpu.h>
 #include <xen/sched.h>
+#include <xen/xvmalloc.h>
 #include <asm/current.h>
 #include <asm/processor.h>
 #include <asm/hvm/support.h>
@@ -522,7 +523,7 @@ int xstate_alloc_save_area(struct vcpu *
 
     /* XSAVE/XRSTOR requires the save area be 64-byte-boundary aligned. */
     BUILD_BUG_ON(__alignof(*save_area) < 64);
-    save_area = _xzalloc(size, __alignof(*save_area));
+    save_area = _xvzalloc(size, __alignof(*save_area));
     if ( save_area == NULL )
         return -ENOMEM;
 
@@ -543,8 +544,7 @@ int xstate_alloc_save_area(struct vcpu *
 
 void xstate_free_save_area(struct vcpu *v)
 {
-    xfree(v->arch.xsave_area);
-    v->arch.xsave_area = NULL;
+    XVFREE(v->arch.xsave_area);
 }
 
 static unsigned int _xstate_ctxt_size(u64 xcr0)

[PATCH 3/3] x86/xstate: re-size save area when CPUID policy changes

Posted by Jan Beulich 3 weeks, 1 day ago
vCPU-s get maximum size areas allocated initially. Hidden (and in
particular default-off) features may allow for a smaller size area to
suffice.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Seeing that both vcpu_init_fpu() and cpuid_policy_updated() get called
from arch_vcpu_create(), I'm not sure we really need this two-stage
approach - the slightly longer period of time during which
v->arch.xsave_area would remain NULL doesn't look all that problematic.
But since xstate_alloc_save_area() gets called for idle vCPU-s, it has
to stay anyway in some form, so the extra code churn may not be worth
it.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -294,7 +294,21 @@ void update_guest_memory_policy(struct v
     }
 }
 
-void domain_cpu_policy_changed(struct domain *d)
+/*
+ * Called during vcpu construction, and each time the toolstack changes the
+ * CPUID configuration for the domain.
+ */
+static int __must_check cpuid_policy_updated(struct vcpu *v)
+{
+    int rc = xstate_update_save_area(v);
+
+    if ( !rc && is_hvm_vcpu(v) )
+        hvm_cpuid_policy_changed(v);
+
+    return rc;
+}
+
+int domain_cpu_policy_changed(struct domain *d)
 {
     const struct cpuid_policy *p = d->arch.cpuid;
     struct vcpu *v;
@@ -452,13 +466,18 @@ void domain_cpu_policy_changed(struct do
 
     for_each_vcpu ( d, v )
     {
-        cpuid_policy_updated(v);
+        int rc = cpuid_policy_updated(v);
+
+        if ( rc )
+            return rc;
 
         /* If PMU version is zero then the guest doesn't have VPMU */
         if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
              p->basic.pmu_version == 0 )
             vpmu_destroy(v);
     }
+
+    return 0;
 }
 
 #ifndef CONFIG_BIGMEM
@@ -597,7 +616,7 @@ int arch_vcpu_create(struct vcpu *v)
     {
         vpmu_initialise(v);
 
-        cpuid_policy_updated(v);
+        rc = cpuid_policy_updated(v);
     }
 
     return rc;
@@ -841,9 +860,9 @@ int arch_domain_create(struct domain *d,
      */
     d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8;
 
-    domain_cpu_policy_changed(d);
-
-    return 0;
+    rc = domain_cpu_policy_changed(d);
+    if ( !rc )
+        return 0;
 
  fail:
     d->is_dying = DOMDYING_dead;
@@ -2434,16 +2453,6 @@ int domain_relinquish_resources(struct d
     return 0;
 }
 
-/*
- * Called during vcpu construction, and each time the toolstack changes the
- * CPUID configuration for the domain.
- */
-void cpuid_policy_updated(struct vcpu *v)
-{
-    if ( is_hvm_vcpu(v) )
-        hvm_cpuid_policy_changed(v);
-}
-
 void arch_dump_domain_info(struct domain *d)
 {
     paging_dump_domain_info(d);
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -91,7 +91,7 @@ static int update_domain_cpu_policy(stru
     recalculate_cpuid_policy(d);
 
     /* Recalculate relevant dom/vcpu state now the policy has changed. */
-    domain_cpu_policy_changed(d);
+    ret = domain_cpu_policy_changed(d);
 
  out:
     /* Free whichever cpuid/msr structs are not installed in struct domain. */
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -541,6 +541,41 @@ int xstate_alloc_save_area(struct vcpu *
 
     return 0;
 }
+
+int xstate_update_save_area(struct vcpu *v)
+{
+    unsigned int i, size, old;
+    struct xsave_struct *save_area;
+    uint64_t xcr0_max = cpuid_policy_xcr0_max(v->domain->arch.cpuid);
+
+    ASSERT(!is_idle_vcpu(v));
+
+    if ( !cpu_has_xsave )
+        return 0;
+
+    if ( v->arch.xcr0_accum & ~xcr0_max )
+        return -EBUSY;
+
+    for ( size = old = XSTATE_AREA_MIN_SIZE, i = 2; i < xstate_features; ++i )
+    {
+        if ( xcr0_max & (1ull << i) )
+            size = max(size, xstate_offsets[i] + xstate_sizes[i]);
+        if ( v->arch.xcr0_accum & (1ull << i) )
+            old = max(old, xstate_offsets[i] + xstate_sizes[i]);
+    }
+
+    save_area = _xvrealloc(v->arch.xsave_area, size, __alignof(*save_area));
+    if ( !save_area )
+        return -ENOMEM;
+
+    ASSERT(old <= size);
+    memset((void *)save_area + old, 0, size - old);
+
+    v->arch.xsave_area = save_area;
+    v->arch.fpu_ctxt = &v->arch.xsave_area->fpu_sse;
+
+    return 0;
+}
 
 void xstate_free_save_area(struct vcpu *v)
 {
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -78,8 +78,6 @@ void toggle_guest_mode(struct vcpu *);
 /* x86/64: toggle guest page tables between kernel and user modes. */
 void toggle_guest_pt(struct vcpu *);
 
-void cpuid_policy_updated(struct vcpu *v);
-
 /*
  * Initialise a hypercall-transfer page. The given pointer must be mapped
  * in Xen virtual address space (accesses are not validated or checked).
@@ -667,7 +665,7 @@ struct guest_memory_policy
 void update_guest_memory_policy(struct vcpu *v,
                                 struct guest_memory_policy *policy);
 
-void domain_cpu_policy_changed(struct domain *d);
+int __must_check domain_cpu_policy_changed(struct domain *d);
 
 bool update_runstate_area(struct vcpu *);
 bool update_secondary_system_time(struct vcpu *,
--- a/xen/include/asm-x86/xstate.h
+++ b/xen/include/asm-x86/xstate.h
@@ -106,6 +106,7 @@ void compress_xsave_states(struct vcpu *
 /* extended state init and cleanup functions */
 void xstate_free_save_area(struct vcpu *v);
 int xstate_alloc_save_area(struct vcpu *v);
+int xstate_update_save_area(struct vcpu *v);
 void xstate_init(struct cpuinfo_x86 *c);
 unsigned int xstate_ctxt_size(u64 xcr0);
 


Re: [PATCH 3/3] x86/xstate: re-size save area when CPUID policy changes

Posted by Jan Beulich 3 weeks, 1 day ago
On 06.11.2020 08:13, Jan Beulich wrote:
> --- a/xen/arch/x86/xstate.c
> +++ b/xen/arch/x86/xstate.c
> @@ -541,6 +541,41 @@ int xstate_alloc_save_area(struct vcpu *
>  
>      return 0;
>  }
> +
> +int xstate_update_save_area(struct vcpu *v)
> +{
> +    unsigned int i, size, old;
> +    struct xsave_struct *save_area;
> +    uint64_t xcr0_max = cpuid_policy_xcr0_max(v->domain->arch.cpuid);
> +
> +    ASSERT(!is_idle_vcpu(v));
> +
> +    if ( !cpu_has_xsave )
> +        return 0;
> +
> +    if ( v->arch.xcr0_accum & ~xcr0_max )
> +        return -EBUSY;
> +
> +    for ( size = old = XSTATE_AREA_MIN_SIZE, i = 2; i < xstate_features; ++i )
> +    {
> +        if ( xcr0_max & (1ull << i) )
> +            size = max(size, xstate_offsets[i] + xstate_sizes[i]);
> +        if ( v->arch.xcr0_accum & (1ull << i) )
> +            old = max(old, xstate_offsets[i] + xstate_sizes[i]);
> +    }

This could be further shrunk if we used XSAVEC / if we really
used XSAVES, as then we don't need to also cover the holes. But
since we currently use neither of the two in reality, this would
require more work than just adding the alternative size
calculation here.

Jan