[v1] mm/pdx: Add comments throughout the codebase for pdx

[PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Alejandro Vallejo 10 months, 2 weeks ago

Document the behaviour of the pdx machinery in Xen. Some logic is fairly
opaque and hard to follow without it being documented anywhere. This
explains the rationale behind compression and its relationship to
frametable indexing and directmap management.

While modifying the file:
  * Convert u64 -> uint64_t
  * Remove extern keyword from function prototypes

No functional change.

Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>
---
 xen/common/pdx.c      |  66 ++++++++++++++++++++--
 xen/include/xen/mm.h  |  16 ++++++
 xen/include/xen/pdx.h | 128 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 202 insertions(+), 8 deletions(-)

diff --git a/xen/common/pdx.c b/xen/common/pdx.c
index c91875fabe..0121ab1915 100644
--- a/xen/common/pdx.c
+++ b/xen/common/pdx.c
@@ -20,13 +20,55 @@
 #include <xen/bitops.h>
 #include <xen/nospec.h>
 
-/* Parameters for PFN/MADDR compression. */
+/*
+ * Diagram to make sense of the following variables. The masks and shifts
+ * are done on mfn values in order to convert to/from pdx:
+ *
+ *                      pfn_hole_mask
+ *                      pfn_pdx_hole_shift (mask bitsize)
+ *                      |
+ *                 |---------|
+ *                 |         |
+ *                 V         V
+ *         --------------------------
+ *         |HHHHHHH|000000000|LLLLLL| <--- mfn
+ *         --------------------------
+ *         ^       ^         ^      ^
+ *         |       |         |------|
+ *         |       |             |
+ *         |       |             pfn_pdx_bottom_mask
+ *         |       |
+ *         |-------|
+ *             |
+ *             pfn_top_mask
+ *
+ * ma_{top,bottom}_mask is simply a shifted pfn_{top,pdx_bottom}_mask where the
+ * bottom one shifts in 1s rather than 0s.
+ */
+
+/** Maximum (non-inclusive) usable pdx */
 unsigned long __read_mostly max_pdx;
+
+/** Mask for the lower non-compressible bits of an mfn */
 unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL;
+
+/** Mask for the lower non-compressible bits of an maddr or vaddr */
 unsigned long __read_mostly ma_va_bottom_mask = ~0UL;
+
+/** Mask for the higher non-compressible bits of an mfn */
 unsigned long __read_mostly pfn_top_mask = 0;
+
+/** Mask for the higher non-compressible bits of an maddr or vaddr */
 unsigned long __read_mostly ma_top_mask = 0;
+
+/**
+ * Mask for a pdx compression bit slice.
+ *
+ *  Invariant: valid(mfn) implies (mfn & pfn_hole_mask) == 0
+ */
 unsigned long __read_mostly pfn_hole_mask = 0;
+
+/** Number of bits of the "compressible" bit slice of an mfn */
 unsigned int __read_mostly pfn_pdx_hole_shift = 0;
 
 unsigned long __read_mostly pdx_group_valid[BITS_TO_LONGS(
@@ -42,7 +84,7 @@ bool __mfn_valid(unsigned long mfn)
 }
 
 /* Sets all bits from the most-significant 1-bit down to the LSB */
-static u64 __init fill_mask(u64 mask)
+static uint64_t __init fill_mask(uint64_t mask)
 {
     while (mask & (mask + 1))
         mask |= mask + 1;
@@ -57,9 +99,25 @@ uint64_t __init pdx_init_mask(uint64_t base_addr)
                          (uint64_t)1 << (MAX_ORDER + PAGE_SHIFT)) - 1);
 }
 
-u64 __init pdx_region_mask(u64 base, u64 len)
+uint64_t __init pdx_region_mask(uint64_t base, uint64_t len)
 {
-    return fill_mask(base ^ (base + len - 1));
+    uint64_t last = base + len - 1;
+    /*
+     * The only bit that matters in base^last is the MSB. There are 2 cases.
+     *
+     * case msb(base) < msb(last):
+     *     then fill_mask(base^last) == fill_mask(last). This is non
+     *     compressible.
+     * case msb(base) == msb(last):
+     *     This means that there _may_ be a sequence of compressible zeroes
+     *     for all addresses between `base` and `last` iff `base` has enough
+     *     trailing zeroes. That is, it's compressible when
+     *     fill_mask(base^last) < fill_mask(last)
+     *
+     * The resulting mask is effectively the moving bits between `base` and
+     * `last`
+     */
+    return fill_mask(base ^ last);
 }
 
 void set_pdx_range(unsigned long smfn, unsigned long emfn)
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index b0dc3ba9c9..568cc396ab 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -31,6 +31,22 @@
  *   (i.e. all devices assigned to) a guest share a single DMA address space
  *   and, by default, Xen will ensure dfn == pfn.
  *
+ * pdx: Page InDeX
+ *   Indices into the frame table holding the per-page's book-keeping
+ *   metadata. A compression scheme is used and there's a non-identity
+ *   mapping between valid(mfn) <-> valid(pdx) See the comments in pdx.c
+ *   for an in-depth explanation of that mapping.
+ *
+ * maddr: Machine Address
+ *   The physical address that corresponds to an mfn
+ *
+ * vaddr: Xen Virtual Address
+ *   A virtual address of memory accesible to Xen. It is typically either
+ *   an address into the direct map or to Xen's own code/data. The direct
+ *   map implements several compression tricks to save memory, so an offset
+ *   into it does _not_ necessarily correspond to an maddr due to pdx
+ *   compression.
+ *
  * WARNING: Some of these terms have changed over time while others have been
  * used inconsistently, meaning that a lot of existing code does not match the
  * definitions above.  New code should use these terms as described here, and
diff --git a/xen/include/xen/pdx.h b/xen/include/xen/pdx.h
index 9fcfb0ce52..adc5707e88 100644
--- a/xen/include/xen/pdx.h
+++ b/xen/include/xen/pdx.h
@@ -1,6 +1,62 @@
 #ifndef __XEN_PDX_H__
 #define __XEN_PDX_H__
 
+/*
+ * PDX (Page inDeX)
+ *
+ * This file deals with optimisations pertaining frame table indexing,
+ * A pdx is an index into the frame table. However, having an identity
+ * relationship between mfn and pdx could waste copious amounts of memory
+ * in empty frame table entries. There are some techniques to bring memory
+ * wastage down.
+ *
+ * ## PDX grouping
+ *
+ * The frame table may have some sparsity even on systems where the memory
+ * banks are tightly packed. This is due to system quirks (like the PCI
+ * hole) which might introduce several GiB of unused page frame numbers
+ * that uselessly waste memory in the frame table. PDX grouping addresses
+ * this by keeping a bitmap of the ranges in the frame table containing
+ * invalid entries and not allocating backing memory for them.
+ *
+ * ## PDX compression
+ *
+ * This is a technique to avoid wasting memory on machines known to have
+ * split their machine address space in two big discontinuous and highly
+ * disjoint chunks.
+ *
+ * In its uncompressed form the frame table must have book-keeping metadata
+ * structures for every page between [0, max_mfn) (whether they exist or
+ * not), and a similar condition exists for the direct map. We know some
+ * architectures, however, that have some sparsity in their address space,
+ * leading to a lot of wastage in the form of unused frame table entries.
+ *
+ * This is where compression becomes useful. The idea is to note that if
+ * you have 2 big chunks of memory sufficiently far apart you can ignore
+ * the middle part of the address because it will always contain zeroes as
+ * long as the base address is sufficiently well aligned and the length of
+ * the region is much smaller than the base address.
+ *
+ * i.e:
+ *   Consider 2 regions of memory. One starts at 0 while the other starts
+ *   at offset 2^off_h. Furthermore, let's assume both regions are smaller
+ *   than 2^off_l. This means that all addresses between [2^off_l, 2^off_h)
+ *   are invalid and we can assume them to be zero on all valid addresses
+ *
+ *                 off_h     off_l
+ *                 |         |
+ *                 V         V
+ *         --------------------------
+ *         |HHHHHHH|000000000|LLLLLL| <--- mfn
+ *         --------------------------
+ *           ^ |
+ *           | | (de)compression by adding/removing "useless" zeroes
+ *           | V
+ *         ---------------
+ *         |HHHHHHHLLLLLL| <--- pdx
+ *         ---------------
+ */
+
 #ifdef CONFIG_HAS_PDX
 
 extern unsigned long max_pdx;
@@ -13,22 +69,77 @@ extern unsigned long pfn_top_mask, ma_top_mask;
                          (sizeof(*frame_table) & -sizeof(*frame_table)))
 extern unsigned long pdx_group_valid[];
 
-extern uint64_t pdx_init_mask(u64 base_addr);
-extern u64 pdx_region_mask(u64 base, u64 len);
+/**
+ * Calculates a mask covering "moving" bits of all addresses of a region
+ *
+ * e.g:
+ *       base=0x1B00000000
+ *   len+base=0x1B0008200;
+ *
+ *   ought to return 0x00000FFFFF;
+ *
+ * @param base Base address of the region
+ * @param len  Size in octets of the region
+ * @return Mask of moving bits at the bottom of all the region addresses
+ */
+uint64_t pdx_init_mask(u64 base_addr);
 
-extern void set_pdx_range(unsigned long smfn, unsigned long emfn);
+/**
+ * Calculates a mask covering "moving" bits of all addresses of a region
+ *
+ * e.g:
+ *       base=0x1B00000000
+ *   len+base=0x1B0008200;
+ *
+ *   ought to return 0x00000FFFFF;
+ *
+ * @param base Base address of the region
+ * @param len  Size in octets of the region
+ * @return Mask of moving bits at the bottom of all the region addresses
+ */
+uint64_t pdx_region_mask(uint64_t base, uint64_t len);
+
+/**
+ * Mark range between smfn and emfn is allocatable in the frame table
+ *
+ * @param smfn Start mfn
+ * @param emfn End mfn
+ */
+void set_pdx_range(unsigned long smfn, unsigned long emfn);
 
 #define page_to_pdx(pg)  ((pg) - frame_table)
 #define pdx_to_page(pdx) gcc11_wrap(frame_table + (pdx))
 
+/**
+ * Invoked to determine if an mfn maps to a legal pdx
+ *
+ * In order for it to be legal it must pass bounds, grouping and
+ * compression sanity checks.
+ *
+ * @param smfn Start mfn
+ * @param emfn End mfn
+ * @return True iff all checks pass
+ */
 bool __mfn_valid(unsigned long mfn);
 
+/**
+ * Map pfn to its corresponding pdx
+ *
+ * @param pfn Frame number
+ * @return Obtained pdx after compressing the pfn
+ */
 static inline unsigned long pfn_to_pdx(unsigned long pfn)
 {
     return (pfn & pfn_pdx_bottom_mask) |
            ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift);
 }
 
+/**
+ * Map a pdx to its corresponding pfn
+ *
+ * @param pdx Page index
+ * @return Obtained pfn after decompressing the pdx
+ */
 static inline unsigned long pdx_to_pfn(unsigned long pdx)
 {
     return (pdx & pfn_pdx_bottom_mask) |
@@ -38,7 +149,16 @@ static inline unsigned long pdx_to_pfn(unsigned long pdx)
 #define mfn_to_pdx(mfn) pfn_to_pdx(mfn_x(mfn))
 #define pdx_to_mfn(pdx) _mfn(pdx_to_pfn(pdx))
 
-extern void pfn_pdx_hole_setup(unsigned long);
+/**
+ * Initializes global variables with information about the compressible
+ * range of the current memory regions.
+ *
+ * @param mask This mask is the biggest pdx_mask of every region in the
+ *             system ORed with all base addresses of every region in the
+ *             system. The result is a mask where every sequence of zeroes
+ *             surrounded by ones is compressible.
+ */
+void pfn_pdx_hole_setup(unsigned long mask);
 
 #endif /* HAS_PDX */
 #endif /* __XEN_PDX_H__ */
-- 
2.34.1

Re: [PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Jan Beulich 10 months, 1 week ago

On 15.06.2023 18:27, Alejandro Vallejo wrote:
> --- a/xen/common/pdx.c
> +++ b/xen/common/pdx.c
> @@ -20,13 +20,55 @@
>  #include <xen/bitops.h>
>  #include <xen/nospec.h>
>  
> -/* Parameters for PFN/MADDR compression. */
> +/*
> + * Diagram to make sense of the following variables. The masks and shifts
> + * are done on mfn values in order to convert to/from pdx:
> + *
> + *                      pfn_hole_mask
> + *                      pfn_pdx_hole_shift (mask bitsize)
> + *                      |
> + *                 |---------|
> + *                 |         |
> + *                 V         V
> + *         --------------------------
> + *         |HHHHHHH|000000000|LLLLLL| <--- mfn
> + *         --------------------------
> + *         ^       ^         ^      ^
> + *         |       |         |------|
> + *         |       |             |
> + *         |       |             pfn_pdx_bottom_mask
> + *         |       |
> + *         |-------|
> + *             |
> + *             pfn_top_mask
> + *
> + * ma_{top,bottom}_mask is simply a shifted pfn_{top,pdx_bottom}_mask where the
> + * bottom one shifts in 1s rather than 0s.
> + */

Nit: That 2nd bottom variable is ma_va_bottom_mask.

> @@ -57,9 +99,25 @@ uint64_t __init pdx_init_mask(uint64_t base_addr)
>                           (uint64_t)1 << (MAX_ORDER + PAGE_SHIFT)) - 1);
>  }
>  
> -u64 __init pdx_region_mask(u64 base, u64 len)
> +uint64_t __init pdx_region_mask(uint64_t base, uint64_t len)
>  {
> -    return fill_mask(base ^ (base + len - 1));
> +    uint64_t last = base + len - 1;
> +    /*
> +     * The only bit that matters in base^last is the MSB. There are 2 cases.
> +     *
> +     * case msb(base) < msb(last):
> +     *     then fill_mask(base^last) == fill_mask(last). This is non
> +     *     compressible.
> +     * case msb(base) == msb(last):
> +     *     This means that there _may_ be a sequence of compressible zeroes
> +     *     for all addresses between `base` and `last` iff `base` has enough
> +     *     trailing zeroes. That is, it's compressible when
> +     *     fill_mask(base^last) < fill_mask(last)
> +     *
> +     * The resulting mask is effectively the moving bits between `base` and
> +     * `last`
> +     */
> +    return fill_mask(base ^ last);
>  }

I don't see a need for you to actually change the code here. You can
as well introduce "last" as shorthand just for the comment. What I
dislike in your way of putting it is the use of fill_mask(last) when
such a call never occurs in code. Starting from the first sentence,
can't you explain things just in terms of said MSB (where the two
cases are "set" and "clear")?

> --- a/xen/include/xen/mm.h
> +++ b/xen/include/xen/mm.h
> @@ -31,6 +31,22 @@
>   *   (i.e. all devices assigned to) a guest share a single DMA address space
>   *   and, by default, Xen will ensure dfn == pfn.
>   *
> + * pdx: Page InDeX
> + *   Indices into the frame table holding the per-page's book-keeping
> + *   metadata. A compression scheme is used and there's a non-identity
> + *   mapping between valid(mfn) <-> valid(pdx) See the comments in pdx.c
> + *   for an in-depth explanation of that mapping.

The mapping may very well be (and on x86 typically is) an identity
one. IOW you want to describe not only the compression case, but also
the "no compression possible" one.

PDXes also aren't just indexes to the frame table, but also to the
direct mapping.

> + * maddr: Machine Address
> + *   The physical address that corresponds to an mfn
> + *
> + * vaddr: Xen Virtual Address
> + *   A virtual address of memory accesible to Xen. It is typically either
> + *   an address into the direct map or to Xen's own code/data. The direct
> + *   map implements several compression tricks to save memory, so an offset
> + *   into it does _not_ necessarily correspond to an maddr due to pdx
> + *   compression.

We need to be careful here: If I'm not mistaken at least Arm uses vaddr
also for guest addresses. In fact I'm not sure vaddr (and perhaps even
maddr) need explaining here, the more that nothing in this header uses
either term.

> --- a/xen/include/xen/pdx.h
> +++ b/xen/include/xen/pdx.h
> @@ -1,6 +1,62 @@
>  #ifndef __XEN_PDX_H__
>  #define __XEN_PDX_H__
>  
> +/*
> + * PDX (Page inDeX)
> + *
> + * This file deals with optimisations pertaining frame table indexing,
> + * A pdx is an index into the frame table. However, having an identity
> + * relationship between mfn and pdx could waste copious amounts of memory
> + * in empty frame table entries. There are some techniques to bring memory
> + * wastage down.
> + *
> + * ## PDX grouping
> + *
> + * The frame table may have some sparsity even on systems where the memory
> + * banks are tightly packed. This is due to system quirks (like the PCI
> + * hole) which might introduce several GiB of unused page frame numbers
> + * that uselessly waste memory in the frame table. PDX grouping addresses
> + * this by keeping a bitmap of the ranges in the frame table containing
> + * invalid entries and not allocating backing memory for them.
> + *
> + * ## PDX compression
> + *
> + * This is a technique to avoid wasting memory on machines known to have
> + * split their machine address space in two big discontinuous and highly
> + * disjoint chunks.

Why two? There can be any number, and in fact on the system I originally
had data from for reference (when first writing this code) iirc there
were 8 nodes, each with a chunk of memory far away from the other chunks.
The compression scheme used merely requires that some "inner" bits are
unused (always zero) in all of those ranges.

> + * In its uncompressed form the frame table must have book-keeping metadata
> + * structures for every page between [0, max_mfn) (whether they exist or

s/they exist/there is RAM/ ?

> + * not), and a similar condition exists for the direct map. We know some
> + * architectures, however, that have some sparsity in their address space,
> + * leading to a lot of wastage in the form of unused frame table entries.

Hmm, "architectures" suggests e.g. Arm might have such, but x86 won't.
Perhaps "systems", "designs", or "system designs"?

> @@ -13,22 +69,77 @@ extern unsigned long pfn_top_mask, ma_top_mask;
>                           (sizeof(*frame_table) & -sizeof(*frame_table)))
>  extern unsigned long pdx_group_valid[];
>  
> -extern uint64_t pdx_init_mask(u64 base_addr);
> -extern u64 pdx_region_mask(u64 base, u64 len);
> +/**
> + * Calculates a mask covering "moving" bits of all addresses of a region
> + *
> + * e.g:
> + *       base=0x1B00000000
> + *   len+base=0x1B0008200;
> + *
> + *   ought to return 0x00000FFFFF;
> + *
> + * @param base Base address of the region
> + * @param len  Size in octets of the region
> + * @return Mask of moving bits at the bottom of all the region addresses
> + */

This looks to be a copy-and-paste of pdx_region_mask()'s comment, when
the function has neither a "base" parameter, nor a and one at all.

> +uint64_t pdx_init_mask(u64 base_addr);

No u64 -> uint64_t here?

> -extern void set_pdx_range(unsigned long smfn, unsigned long emfn);
> +/**
> + * Calculates a mask covering "moving" bits of all addresses of a region
> + *
> + * e.g:
> + *       base=0x1B00000000
> + *   len+base=0x1B0008200;
> + *
> + *   ought to return 0x00000FFFFF;

I think it would help if you actually said how the return value actually
derives. The term "moving" may be understood differently be different
people, and hence such an explanation actually would also clarify what
"moving" means.

I also thing there's a 0 missing in the len+base value, without which
the result would be quite a bit different.

> + * @param base Base address of the region
> + * @param len  Size in octets of the region
> + * @return Mask of moving bits at the bottom of all the region addresses
> + */
> +uint64_t pdx_region_mask(uint64_t base, uint64_t len);
> +
> +/**
> + * Mark range between smfn and emfn is allocatable in the frame table
> + *
> + * @param smfn Start mfn
> + * @param emfn End mfn
> + */
> +void set_pdx_range(unsigned long smfn, unsigned long emfn);

This could do with mathematically expressing the range in the comment,
such that (in|ex)clusiveness of, in particular, emfn is clarified.

>  #define page_to_pdx(pg)  ((pg) - frame_table)
>  #define pdx_to_page(pdx) gcc11_wrap(frame_table + (pdx))
>  
> +/**
> + * Invoked to determine if an mfn maps to a legal pdx

I wouldn't use "pdx" here, but refer to frame_table[] instead.

> + * In order for it to be legal it must pass bounds, grouping and
> + * compression sanity checks.
> + *
> + * @param smfn Start mfn
> + * @param emfn End mfn
> + * @return True iff all checks pass
> + */
>  bool __mfn_valid(unsigned long mfn);

Comment again mentions inapplicable parameters.

> @@ -38,7 +149,16 @@ static inline unsigned long pdx_to_pfn(unsigned long pdx)
>  #define mfn_to_pdx(mfn) pfn_to_pdx(mfn_x(mfn))
>  #define pdx_to_mfn(pdx) _mfn(pdx_to_pfn(pdx))
>  
> -extern void pfn_pdx_hole_setup(unsigned long);
> +/**
> + * Initializes global variables with information about the compressible
> + * range of the current memory regions.
> + *
> + * @param mask This mask is the biggest pdx_mask of every region in the
> + *             system ORed with all base addresses of every region in the
> + *             system. The result is a mask where every sequence of zeroes
> + *             surrounded by ones is compressible.
> + */
> +void pfn_pdx_hole_setup(unsigned long mask);

With the function returning void, I find "The result" problematic. How about
"This results in ..."?

Btw, "surrounded by ones" isn't really necessary. We could compress shorter
sequences of zeros, so this may want re-wording a little to be as precise
as possible.

Jan

Re: [PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Alejandro Vallejo 10 months, 1 week ago

On Mon, Jun 19, 2023 at 05:30:20PM +0200, Jan Beulich wrote:
> > + * ma_{top,bottom}_mask is simply a shifted pfn_{top,pdx_bottom}_mask where the
> > + * bottom one shifts in 1s rather than 0s.
> > + */
> 
> Nit: That 2nd bottom variable is ma_va_bottom_mask.
Sure

> 
> > @@ -57,9 +99,25 @@ uint64_t __init pdx_init_mask(uint64_t base_addr)
> >                           (uint64_t)1 << (MAX_ORDER + PAGE_SHIFT)) - 1);
> >  }
> >  
> > -u64 __init pdx_region_mask(u64 base, u64 len)
> > +uint64_t __init pdx_region_mask(uint64_t base, uint64_t len)
> >  {
> > -    return fill_mask(base ^ (base + len - 1));
> > +    uint64_t last = base + len - 1;
> > +    /*
> > +     * The only bit that matters in base^last is the MSB. There are 2 cases.
> > +     *
> > +     * case msb(base) < msb(last):
> > +     *     then fill_mask(base^last) == fill_mask(last). This is non
> > +     *     compressible.
> > +     * case msb(base) == msb(last):
> > +     *     This means that there _may_ be a sequence of compressible zeroes
> > +     *     for all addresses between `base` and `last` iff `base` has enough
> > +     *     trailing zeroes. That is, it's compressible when
> > +     *     fill_mask(base^last) < fill_mask(last)
> > +     *
> > +     * The resulting mask is effectively the moving bits between `base` and
> > +     * `last`
> > +     */
> > +    return fill_mask(base ^ last);
> >  }
> 
> I don't see a need for you to actually change the code here. You can
> as well introduce "last" as shorthand just for the comment.
I thought as you did initially and wrote it as such. In the end it felt
wrong to have an explanation in terms of a token not present in the code.
Furthermore, explaining what the shorthand is in the comment takes more
space than introducing `last` in the code itself.

```
   uint64_t last = base + len - 1;
  /*
   * The only bit that matters in base^last is the MSB. There are 2 cases.
```
                              vs
```
  /*
   * Let `last = base + len -1` out of convenience.
   * The only bit that matters in base^last is the MSB. There are 2 cases.
```

TL;DR: I didn't factor out `last` due to aesthetics (I'd rather not touch
the code in this patch, in fact), but it seems warranted in order to reduce
the impedance mismatch between this big-ish comment and the call it
describes. I'll post v2 without that adjustment in case I managed to
convince you. Otherwise feel free to adjust it on commit.

> What I dislike in your way of putting it is the use of fill_mask(last) when
> such a call never occurs in code. Starting from the first sentence,
> can't you explain things just in terms of said MSB
I see. I can refer to the MSBs instead. Works equally well.

e.g:
  fill_mask(base^last) == fill_mask(last)
                 |
                 V
  msb(fill_mask(base^last)) == msb(last)

> (where the two cases are "set" and "clear")?
I'm not sure I understand what you mean here.

> 
> > --- a/xen/include/xen/mm.h
> > +++ b/xen/include/xen/mm.h
> > @@ -31,6 +31,22 @@
> >   *   (i.e. all devices assigned to) a guest share a single DMA address space
> >   *   and, by default, Xen will ensure dfn == pfn.
> >   *
> > + * pdx: Page InDeX
> > + *   Indices into the frame table holding the per-page's book-keeping
> > + *   metadata. A compression scheme is used and there's a non-identity
> > + *   mapping between valid(mfn) <-> valid(pdx) See the comments in pdx.c
> > + *   for an in-depth explanation of that mapping.
> 
> The mapping may very well be (and on x86 typically is) an identity
> one. IOW you want to describe not only the compression case, but also
> the "no compression possible" one.
Point taken. I'll rephrase it slightly as "possibly non-identity" and
explicitly state the "no compression is possible" case.

> 
> PDXes also aren't just indexes to the frame table, but also to the
> direct mapping.
I had something to that effect earlier on, but I removed it because it
doesn't seem to be the case on ARM. There's a directmap_base_pdx global
that states the first pdx to be mapped on the directmap.

> 
> > + * maddr: Machine Address
> > + *   The physical address that corresponds to an mfn
> > + *
> > + * vaddr: Xen Virtual Address
> > + *   A virtual address of memory accesible to Xen. It is typically either
> > + *   an address into the direct map or to Xen's own code/data. The direct
> > + *   map implements several compression tricks to save memory, so an offset
> > + *   into it does _not_ necessarily correspond to an maddr due to pdx
> > + *   compression.
> 
> We need to be careful here: If I'm not mistaken at least Arm uses vaddr
> also for guest addresses. In fact I'm not sure vaddr (and perhaps even
> maddr) need explaining here
I'd like to have at least maddr. It's sufficiently project-specific to be
otherwise confusing to find unexplained elsewhere. i.e: In other bare-metal
projects that would be a paddr instead.

vaddr might be trying too hard to boil the ocean as far as definitions go.
I can get rid of it.

> the more that nothing in this header uses either term.
True. But it should be somewhere and this is the main memory-management
header, where the frame number definitions are. In general, things that
change together ought to stay together.

> > + * ## PDX compression
> > + *
> > + * This is a technique to avoid wasting memory on machines known to have
> > + * split their machine address space in two big discontinuous and highly
> > + * disjoint chunks.
> 
> Why two? There can be any number, and in fact on the system I originally
> had data from for reference (when first writing this code) iirc there
> were 8 nodes, each with a chunk of memory far away from the other chunks.
> The compression scheme used merely requires that some "inner" bits are
> unused (always zero) in all of those ranges.
Well, our implementation only supports two and I didn't see any obvious
hints about intending to increasing that number. I see where you're coming
from, though. I can make it more general so it's not outdated if the
pfn_to_pdx()/pdx_to_pfn() pair ever increases in scope to do several holes.

Out of curiosity (and for posterity's sake), what was/is that system?

> 
> > + * In its uncompressed form the frame table must have book-keeping metadata
> > + * structures for every page between [0, max_mfn) (whether they exist or
> 
> s/they exist/there is RAM/ ?
They exist is ambiguous, true. Rewrote it as "they are backed by RAM"

> 
> > + * not), and a similar condition exists for the direct map. We know some
> > + * architectures, however, that have some sparsity in their address space,
> > + * leading to a lot of wastage in the form of unused frame table entries.
> 
> Hmm, "architectures" suggests e.g. Arm might have such, but x86 won't.
> Perhaps "systems", "designs", or "system designs"?
I like `systems` better. Sure.

> 
> > @@ -13,22 +69,77 @@ extern unsigned long pfn_top_mask, ma_top_mask;
> >                           (sizeof(*frame_table) & -sizeof(*frame_table)))
> >  extern unsigned long pdx_group_valid[];
> >  
> > -extern uint64_t pdx_init_mask(u64 base_addr);
> > -extern u64 pdx_region_mask(u64 base, u64 len);
> > +/**
> > + * Calculates a mask covering "moving" bits of all addresses of a region
> > + *
> > + * e.g:
> > + *       base=0x1B00000000
> > + *   len+base=0x1B0008200;
> > + *
> > + *   ought to return 0x00000FFFFF;
> > + *
> > + * @param base Base address of the region
> > + * @param len  Size in octets of the region
> > + * @return Mask of moving bits at the bottom of all the region addresses
> > + */
> 
> This looks to be a copy-and-paste of pdx_region_mask()'s comment, when
> the function has neither a "base" parameter, nor a and one at all.
Oops. A victim of incompatible rebases. I extracted these comments from an
ongoing patch series I'm working on. I'll (try to) write an actual comment
for it.

> 
> > +uint64_t pdx_init_mask(u64 base_addr);
> 
> No u64 -> uint64_t here?
> 
> > -extern void set_pdx_range(unsigned long smfn, unsigned long emfn);
> > +/**
> > + * Calculates a mask covering "moving" bits of all addresses of a region
> > + *
> > + * e.g:
> > + *       base=0x1B00000000
> > + *   len+base=0x1B0008200;
> > + *
> > + *   ought to return 0x00000FFFFF;
> 
> I think it would help if you actually said how the return value actually
> derives. The term "moving" may be understood differently be different
> people, and hence such an explanation actually would also clarify what
> "moving" means.
Hmmm, I'd rather not explicitly state the XOR here though. I'm adding a
couple more lines explaining things in terms of the i-th bit of the mask
and all the region addresses. Ideally this comment ought to explain the
intuition, while the comment in pdx.c explains the implementation.

> 
> I also thing there's a 0 missing in the len+base value, without which
> the result would be quite a bit different.
Indeed.

> > +/**
> > + * Mark range between smfn and emfn is allocatable in the frame table
> > + *
> > + * @param smfn Start mfn
> > + * @param emfn End mfn
> > + */
> > +void set_pdx_range(unsigned long smfn, unsigned long emfn);
> 
> This could do with mathematically expressing the range in the comment,
> such that (in|ex)clusiveness of, in particular, emfn is clarified.
Good point. Sure.


> > +/**
> > + * Invoked to determine if an mfn maps to a legal pdx
> 
> I wouldn't use "pdx" here, but refer to frame_table[] instead.
I can rewrite it as something along those lines, sure.

> 
> > + * In order for it to be legal it must pass bounds, grouping and
> > + * compression sanity checks.
> > + *
> > + * @param smfn Start mfn
> > + * @param emfn End mfn
> > + * @return True iff all checks pass
> > + */
> >  bool __mfn_valid(unsigned long mfn);
> 
> Comment again mentions inapplicable parameters.
Ack.

> 
> > @@ -38,7 +149,16 @@ static inline unsigned long pdx_to_pfn(unsigned long pdx)
> >  #define mfn_to_pdx(mfn) pfn_to_pdx(mfn_x(mfn))
> >  #define pdx_to_mfn(pdx) _mfn(pdx_to_pfn(pdx))
> >  
> > -extern void pfn_pdx_hole_setup(unsigned long);
> > +/**
> > + * Initializes global variables with information about the compressible
> > + * range of the current memory regions.
> > + *
> > + * @param mask This mask is the biggest pdx_mask of every region in the
> > + *             system ORed with all base addresses of every region in the
> > + *             system. The result is a mask where every sequence of zeroes
> > + *             surrounded by ones is compressible.
> > + */
> > +void pfn_pdx_hole_setup(unsigned long mask);
> 
> With the function returning void, I find "The result" problematic. How about
> "This results in ..."?
Sounds better. Sure.

> 
> Btw, "surrounded by ones" isn't really necessary. We could compress shorter
> sequences of zeros, so this may want re-wording a little to be as precise
> as possible.
> 
> Jan
Fair. I'll tweak the definition.

Alejandro

Re: [PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Jan Beulich 10 months, 1 week ago

On 21.06.2023 16:25, Alejandro Vallejo wrote:
> On Mon, Jun 19, 2023 at 05:30:20PM +0200, Jan Beulich wrote:
>>> @@ -57,9 +99,25 @@ uint64_t __init pdx_init_mask(uint64_t base_addr)
>>>                           (uint64_t)1 << (MAX_ORDER + PAGE_SHIFT)) - 1);
>>>  }
>>>  
>>> -u64 __init pdx_region_mask(u64 base, u64 len)
>>> +uint64_t __init pdx_region_mask(uint64_t base, uint64_t len)
>>>  {
>>> -    return fill_mask(base ^ (base + len - 1));
>>> +    uint64_t last = base + len - 1;
>>> +    /*
>>> +     * The only bit that matters in base^last is the MSB. There are 2 cases.
>>> +     *
>>> +     * case msb(base) < msb(last):
>>> +     *     then fill_mask(base^last) == fill_mask(last). This is non
>>> +     *     compressible.
>>> +     * case msb(base) == msb(last):
>>> +     *     This means that there _may_ be a sequence of compressible zeroes
>>> +     *     for all addresses between `base` and `last` iff `base` has enough
>>> +     *     trailing zeroes. That is, it's compressible when
>>> +     *     fill_mask(base^last) < fill_mask(last)
>>> +     *
>>> +     * The resulting mask is effectively the moving bits between `base` and
>>> +     * `last`
>>> +     */
>>> +    return fill_mask(base ^ last);
>>>  }
>>
>> I don't see a need for you to actually change the code here. You can
>> as well introduce "last" as shorthand just for the comment.
> I thought as you did initially and wrote it as such. In the end it felt
> wrong to have an explanation in terms of a token not present in the code.
> Furthermore, explaining what the shorthand is in the comment takes more
> space than introducing `last` in the code itself.
> 
> ```
>    uint64_t last = base + len - 1;
>   /*
>    * The only bit that matters in base^last is the MSB. There are 2 cases.
> ```
>                               vs
> ```
>   /*
>    * Let `last = base + len -1` out of convenience.
>    * The only bit that matters in base^last is the MSB. There are 2 cases.
> ```
> 
> TL;DR: I didn't factor out `last` due to aesthetics (I'd rather not touch
> the code in this patch, in fact), but it seems warranted in order to reduce
> the impedance mismatch between this big-ish comment and the call it
> describes. I'll post v2 without that adjustment in case I managed to
> convince you. Otherwise feel free to adjust it on commit.
> 
>> What I dislike in your way of putting it is the use of fill_mask(last) when
>> such a call never occurs in code. Starting from the first sentence,
>> can't you explain things just in terms of said MSB
> I see. I can refer to the MSBs instead. Works equally well.
> 
> e.g:
>   fill_mask(base^last) == fill_mask(last)
>                  |
>                  V
>   msb(fill_mask(base^last)) == msb(last)
> 
>> (where the two cases are "set" and "clear")?
> I'm not sure I understand what you mean here.

Hmm, yes, I think I was getting confused here.

>>> --- a/xen/include/xen/mm.h
>>> +++ b/xen/include/xen/mm.h
>>> @@ -31,6 +31,22 @@
>>>   *   (i.e. all devices assigned to) a guest share a single DMA address space
>>>   *   and, by default, Xen will ensure dfn == pfn.
>>>   *
>>> + * pdx: Page InDeX
>>> + *   Indices into the frame table holding the per-page's book-keeping
>>> + *   metadata. A compression scheme is used and there's a non-identity
>>> + *   mapping between valid(mfn) <-> valid(pdx) See the comments in pdx.c
>>> + *   for an in-depth explanation of that mapping.
>>
>> The mapping may very well be (and on x86 typically is) an identity
>> one. IOW you want to describe not only the compression case, but also
>> the "no compression possible" one.
> Point taken. I'll rephrase it slightly as "possibly non-identity" and
> explicitly state the "no compression is possible" case.
> 
>>
>> PDXes also aren't just indexes to the frame table, but also to the
>> direct mapping.
> I had something to that effect earlier on, but I removed it because it
> doesn't seem to be the case on ARM. There's a directmap_base_pdx global
> that states the first pdx to be mapped on the directmap.

Which would merely make it a biased index. I very much hope they
eliminate holes (and not just unused leading space) from the directmap
as well.

>>> + * ## PDX compression
>>> + *
>>> + * This is a technique to avoid wasting memory on machines known to have
>>> + * split their machine address space in two big discontinuous and highly
>>> + * disjoint chunks.
>>
>> Why two? There can be any number, and in fact on the system I originally
>> had data from for reference (when first writing this code) iirc there
>> were 8 nodes, each with a chunk of memory far away from the other chunks.
>> The compression scheme used merely requires that some "inner" bits are
>> unused (always zero) in all of those ranges.
> Well, our implementation only supports two and I didn't see any obvious
> hints about intending to increasing that number.

Where are you taking that "supports two" from? When I first wrote this code,
it was tested against a system with 8 (maybe it was 4, but certainly more
than 2) discontiguous regions (not counting the hole below 4G).

> I see where you're coming
> from, though. I can make it more general so it's not outdated if the
> pfn_to_pdx()/pdx_to_pfn() pair ever increases in scope to do several holes.
> 
> Out of curiosity (and for posterity's sake), what was/is that system?

I'm not sure I'm permitted to mention that. I'm pretty sure I carefully
avoided mentioning the partner of ours back at the time.

Jan

Re: [PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Alejandro Vallejo 10 months, 1 week ago

On Thu, Jun 22, 2023 at 11:15:17AM +0200, Jan Beulich wrote:
> >>> --- a/xen/include/xen/mm.h
> >>> +++ b/xen/include/xen/mm.h
> >>> @@ -31,6 +31,22 @@
> >>>   *   (i.e. all devices assigned to) a guest share a single DMA address space
> >>>   *   and, by default, Xen will ensure dfn == pfn.
> >>>   *
> >>> + * pdx: Page InDeX
> >>> + *   Indices into the frame table holding the per-page's book-keeping
> >>> + *   metadata. A compression scheme is used and there's a non-identity
> >>> + *   mapping between valid(mfn) <-> valid(pdx) See the comments in pdx.c
> >>> + *   for an in-depth explanation of that mapping.
> >>
> >> The mapping may very well be (and on x86 typically is) an identity
> >> one. IOW you want to describe not only the compression case, but also
> >> the "no compression possible" one.
> > Point taken. I'll rephrase it slightly as "possibly non-identity" and
> > explicitly state the "no compression is possible" case.
> > 
> >>
> >> PDXes also aren't just indexes to the frame table, but also to the
> >> direct mapping.
> > I had something to that effect earlier on, but I removed it because it
> > doesn't seem to be the case on ARM. There's a directmap_base_pdx global
> > that states the first pdx to be mapped on the directmap.
> 
> Which would merely make it a biased index. I very much hope they
> eliminate holes (and not just unused leading space) from the directmap
> as well.
Yes, the directmap offset is still just a shifted pdx (albeit biased, as
you said), so the holes are naturally removed. Regardless, having to
explain a port-specific quirk in the main mm header seems like too much, so
I sticked with the common denominator: "A pdx is the frame table index of a
valid entry", and that holds for every port.

> >>> + * This is a technique to avoid wasting memory on machines known to have
> >>> + * split their machine address space in two big discontinuous and highly
> >>> + * disjoint chunks.
> >>
> >> Why two? There can be any number, and in fact on the system I originally
> >> had data from for reference (when first writing this code) iirc there
> >> were 8 nodes, each with a chunk of memory far away from the other chunks.
> >> The compression scheme used merely requires that some "inner" bits are
> >> unused (always zero) in all of those ranges.
> > Well, our implementation only supports two and I didn't see any obvious
> > hints about intending to increasing that number.
> 
> Where are you taking that "supports two" from? When I first wrote this code,
> it was tested against a system with 8 (maybe it was 4, but certainly more
> than 2) discontiguous regions (not counting the hole below 4G).
You can have any number, but there's a single contiguous bit slice being
removed, as far as I can see. The adaptor functions in
xen/include/xen/pdx.h perform a single shift.

    static inline unsigned long pfn_to_pdx(unsigned long pfn)
    {
        return (pfn & pfn_pdx_bottom_mask) |
               ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift);
    }

    static inline unsigned long pdx_to_pfn(unsigned long pdx)
    {
        return (pdx & pfn_pdx_bottom_mask) |
               ((pdx << pfn_pdx_hole_shift) & pfn_top_mask);
    }

Unless I'm missing some non-obvious piece of the puzzle, I'd say that for a
truly general compressor we'd need some kind of loop over the hole mask.

> > Out of curiosity (and for posterity's sake), what was/is that system?
> 
> I'm not sure I'm permitted to mention that. I'm pretty sure I carefully
> avoided mentioning the partner of ours back at the time.
Fair enough.

Alejandro

Re: [PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Jan Beulich 10 months, 1 week ago

On 22.06.2023 14:39, Alejandro Vallejo wrote:
> On Thu, Jun 22, 2023 at 11:15:17AM +0200, Jan Beulich wrote:
>>>>> + * This is a technique to avoid wasting memory on machines known to have
>>>>> + * split their machine address space in two big discontinuous and highly
>>>>> + * disjoint chunks.
>>>>
>>>> Why two? There can be any number, and in fact on the system I originally
>>>> had data from for reference (when first writing this code) iirc there
>>>> were 8 nodes, each with a chunk of memory far away from the other chunks.
>>>> The compression scheme used merely requires that some "inner" bits are
>>>> unused (always zero) in all of those ranges.
>>> Well, our implementation only supports two and I didn't see any obvious
>>> hints about intending to increasing that number.
>>
>> Where are you taking that "supports two" from? When I first wrote this code,
>> it was tested against a system with 8 (maybe it was 4, but certainly more
>> than 2) discontiguous regions (not counting the hole below 4G).
> You can have any number, but there's a single contiguous bit slice being
> removed, as far as I can see. The adaptor functions in
> xen/include/xen/pdx.h perform a single shift.
> 
>     static inline unsigned long pfn_to_pdx(unsigned long pfn)
>     {
>         return (pfn & pfn_pdx_bottom_mask) |
>                ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift);
>     }
> 
>     static inline unsigned long pdx_to_pfn(unsigned long pdx)
>     {
>         return (pdx & pfn_pdx_bottom_mask) |
>                ((pdx << pfn_pdx_hole_shift) & pfn_top_mask);
>     }
> 
> Unless I'm missing some non-obvious piece of the puzzle, I'd say that for a
> truly general compressor we'd need some kind of loop over the hole mask.

Well, further compression might be possible that way, yes, but that's
entirely orthogonal to the number of discontiguous regions we're
talking about. Consider

0x0000100000000000-0x00001000ffffffff
0x0000200000000000-0x00002000ffffffff
0x0000300000000000-0x00003000ffffffff
0x0000400000000000-0x00004000ffffffff

The reference system's arrangement was slightly more complex (first and
foremost because of the memory below 4G that node 0 had), but came close
to the above conceptually.

Jan

Re: [PATCH] mm/pdx: Add comments throughout the codebase for pdx

Posted by Alejandro Vallejo 10 months, 1 week ago

On Thu, Jun 22, 2023 at 03:28:14PM +0200, Jan Beulich wrote:
> > Unless I'm missing some non-obvious piece of the puzzle, I'd say that for a
> > truly general compressor we'd need some kind of loop over the hole mask.
> 
> Well, further compression might be possible that way, yes, but that's
> entirely orthogonal to the number of discontiguous regions we're
> talking about. Consider
> 
> 0x0000100000000000-0x00001000ffffffff
> 0x0000200000000000-0x00002000ffffffff
> 0x0000300000000000-0x00003000ffffffff
> 0x0000400000000000-0x00004000ffffffff
> 
> The reference system's arrangement was slightly more complex (first and
> foremost because of the memory below 4G that node 0 had), but came close
> to the above conceptually.
> 
> Jan
Ah, I now see what you mean. A single hole can compress several regions if
they are all share sequences of zeroes. Fair point. I'll reflect that in that
paragraph.

Alejandro