Xen Security Advisory 409 v3 (CVE-2022-33747) - Arm: unbounded memory consumption for 2nd-level page tables

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA256

            Xen Security Advisory CVE-2022-33747 / XSA-409
                               version 3

      Arm: unbounded memory consumption for 2nd-level page tables

UPDATES IN VERSION 3
====================

Public release.

ISSUE DESCRIPTION
=================

Certain actions require e.g. removing pages from a guest's P2M
(Physical-to-Machine) mapping.  When large pages are in use to map guest
pages in the 2nd-stage page tables, such a removal operation may incur a
memory allocation (to replace a large mapping with individual smaller
ones).

These memory allocations are taken from the global memory pool. A
malicious guest might be able to cause the global memory pool to be
exhausted by manipulating its own P2M mappings.
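
To illustrate where the allocation comes from: removing one 4KB page
that is currently covered by a large (e.g. 2MB) mapping forces that
mapping to be split into 512 small entries, which requires a fresh
page-table page.  The sketch below is a simplified model of this path
with assumed helper names, not the actual Xen code:

    /* Simplified model (hypothetical helpers), before the fix. */
    static int remove_one_page(struct p2m *p2m, gfn_t gfn)
    {
        /* Allocation drawn from the global pool -- unbounded pre-XSA-409 */
        void *table = alloc_page_from_global_pool();

        if ( !table )
            return -ENOMEM;

        split_superpage_into(table, gfn); /* 512 individual 4KB entries */
        clear_entry(table, gfn);          /* drop the one requested page */
        install_table(p2m, gfn, table);

        return 0;
    }

A guest that repeatedly triggers such splits can thus drive a large
number of allocations from the global pool.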

IMPACT
======

A malicious guest could cause a Denial of Service, preventing any system
operation requiring further allocation of Xen memory, including creating
new guests.  NB however that memory exhaustion by itself shouldn’t cause
either Xen or properly-written guests to crash.

VULNERABLE SYSTEMS
==================

All versions of Xen are affected.

Only Arm systems are vulnerable.  x86 systems are not vulnerable.

MITIGATION
==========

There is no known mitigation.

RESOLUTION
==========

Applying the appropriate set of attached patches resolves this issue.

Note that patches for released versions are generally prepared to
apply to the stable branches, and may not apply cleanly to the most
recent release tarball.  Downstreams are encouraged to update to the
tip of the stable branch before applying these patches.

Note further that the patches for this XSA depend on the patches for
XSA-410.

xsa409/*.patch           xen-unstable
xsa409-4.16/*.patch      Xen 4.16.x
xsa409-4.15/*.patch      Xen 4.15.x
xsa409-4.14/*.patch      Xen 4.14.x
xsa409-4.13/*.patch      Xen 4.13.x
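
For example, to patch the 4.16 stable branch (illustrative commands;
the branch name and the location of the XSA-410 patches are
assumptions):

    $ git checkout stable-4.16
    $ git am xsa410-4.16/*.patch    # XSA-409 depends on XSA-410
    $ git am xsa409-4.16/*.patch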

$ sha256sum xsa409* xsa409*/*
a211afb31199a8edf189928f5285b6a58ce35aac991ae3f708b07274ad5f1082  xsa409.meta
96cc260fbf3c2bedd17d61080ba536791f1116cd7dcc6a172dbcccc452e66974  xsa409-4.13/0001-libxl-docs-Use-arch-specific-default-paging-memory.patch
f94376d12757312175e19b6c51c56bcb3e21055f729440eb9112bee9fc44cd65  xsa409-4.13/0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
b52ca6538a0525dc1638391ee032a7aedced31cc3bcdc8efea02d975813fa251  xsa409-4.13/0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
5a59740c398804950ce99102ae2741d5d539313e4a24d0727926d2b4965f148e  xsa409-4.13/0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
b7c3438a4c6a4957b0e9b911419638c8719550c91db4587660a6d498a73747ae  xsa409-4.14/0001-libxl-docs-Use-arch-specific-default-paging-memory.patch
5a01d80c7157feeeb3374c221d306bd98a134a99597ebfdeee5d62df47e60f27  xsa409-4.14/0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
d9b4385c1d55f9c758a108368ef5fbfc86ab2ff532314f88245cc1fce4f95ea2  xsa409-4.14/0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
96456aea63d6471888b5364330e69c15ffd2ed055200cd286fb59cab379c3905  xsa409-4.14/0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
4c31fd8b3f346e6e9834c33e61037d122b802a83dceec168ed5e699566ca01e2  xsa409-4.15/0001-libxl-docs-Use-arch-specific-default-paging-memory.patch
4b9b1ba9c5c7a644268500906b628664ea0630777653f86e62faf85d9e004b8c  xsa409-4.15/0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
04a097e055e7faf9163e1e7105bfb3a78782fa6e9c3025597725a198d85d9887  xsa409-4.15/0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
9b59622a9c00d75fe3f57b20d286e91df3589855d55e0bad83c64145002c3bc7  xsa409-4.15/0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
2ce57902cff4ad61432b61bf8a10dcc699b88b6b9a02c6e7c51c720b276ec39d  xsa409-4.16/0001-libxl-docs-Use-arch-specific-default-paging-memory.patch
18ad838d9c4a6da8890d5d6b3165000e21d8db022bc743989dfda6cc43a7686c  xsa409-4.16/0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
201bf6c15d0380f4588a12f33bff90f05fe3c8da75dcb0801063216bedcc00c7  xsa409-4.16/0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
f8cea9b75636e73ffffb88b18d80f60ab9ca47856232f1cff787d5d0a1742106  xsa409-4.16/0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
62be1c9896e1a0563abbe515bd50e117147a274b3bae0ce062d1e86cdd535b61  xsa409/0001-libxl-docs-Add-per-arch-extra-default-paging-memory.patch
6bcd3cdd9eb998f5714b1c44d3cf1aaa3b1f3615ef8ccb530cf804638b18c9e3  xsa409/0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch
b4740035de11fc0b4b7bcb281b288b1972ef3b97649ff3e61072384aeddf864b  xsa409/0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch
ac7af4fea2fa84384fd65308ee8cb50470515a96d2160e467867c8bb766b580a  xsa409/0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
$

DEPLOYMENT DURING EMBARGO
=========================

Deployment of the patches and/or mitigations described above (or
others which are substantially similar) is permitted during the
embargo, even on public-facing systems with untrusted guest users and
administrators.

But: Distribution of updated software is prohibited (except to other
members of the predisclosure list).

Predisclosure list members who wish to deploy significantly different
patches and/or mitigations, please contact the Xen Project Security
Team.

(Note: this during-embargo deployment notice is retained in
post-embargo publicly released Xen Project advisories, even though it
is then no longer applicable.  This is to enable the community to have
oversight of the Xen Project Security Team's decisionmaking.)

For more information about permissible uses of embargoed information,
consult the Xen Project community's agreed Security Policy:
  http://www.xenproject.org/security-policy.html
-----BEGIN PGP SIGNATURE-----

iQFABAEBCAAqFiEEI+MiLBRfRHX6gGCng/4UyVfoK9kFAmNFS/cMHHBncEB4ZW4u
b3JnAAoJEIP+FMlX6CvZKqsIAMobhnQXNUKRUiS1TFrV5NhbUdx0r0PHX3alf3r0
ZUk3mQyq3lKK6MkXB0bpkgq95fv6dw9SIriPRZdivVBK7Yb2VBImdZ/YyXoU5JWN
3EPO8Svxzm8WCntk9smjwNix2SByWSVjQfROjrrgihWLbX4n0IQkOLFlvVgllJmK
ETc0q3bMKEODH7+kkmrTmT+nomlHbuq7HHAZk0jyw/hVs1JdRMN9TXBBdLjLOYFe
/hsDiLWwK51L7ehPZB4d/+rLQYo27chGwNGQwDDXXiWWhMmXJJCO3MhrB4NEt0JE
P4DAkmh2OXh6QyuZPTH48ADbAdL7ecq2atrM6HD2oulwFCI=
=/zM/
-----END PGP SIGNATURE-----
{
  "XSA": 409,
  "SupportedVersions": [
    "master",
    "4.16",
    "4.15",
    "4.14",
    "4.13"
  ],
  "Trees": [
    "xen"
  ],
  "Recipes": {
    "4.13": {
      "Recipes": {
        "xen": {
          "StableRef": "d8a693019845caa4e216bcac10f9501a814c99ae",
          "Prereqs": [
            410
          ],
          "Patches": [
            "xsa409-4.13/*.patch"
          ]
        }
      }
    },
    "4.14": {
      "Recipes": {
        "xen": {
          "StableRef": "261b882f7704515a01f74589f57f0c1303e3b701",
          "Prereqs": [
            410
          ],
          "Patches": [
            "xsa409-4.14/*.patch"
          ]
        }
      }
    },
    "4.15": {
      "Recipes": {
        "xen": {
          "StableRef": "df3395f6b2d759aba39fb67a7bc0fe49147c8b39",
          "Prereqs": [
            410
          ],
          "Patches": [
            "xsa409-4.15/*.patch"
          ]
        }
      }
    },
    "4.16": {
      "Recipes": {
        "xen": {
          "StableRef": "89fe6d0edea841d1d2690cf3f5173e334c687823",
          "Prereqs": [
            410
          ],
          "Patches": [
            "xsa409-4.16/*.patch"
          ]
        }
      }
    },
    "master": {
      "Recipes": {
        "xen": {
          "StableRef": "01ca29f0b17a50a94b0e232ba276c32e95d80ae3",
          "Prereqs": [
            410
          ],
          "Patches": [
            "xsa409/*.patch"
          ]
        }
      }
    }
  }
}

From 2d55acec49b6714fd7ce42bd825524541613b98f Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Tue, 9 Aug 2022 06:39:23 +0000
Subject: [PATCH 1/4] libxl, docs: Use arch-specific default paging memory

The default paging memory (described by the `shadow_memory` entry in
the xl config) in libxl is used to determine the memory pool size for
xl guests. Currently this size is only used for x86, and contains a
part of RAM to shadow the resident processes. Since there are no
shadow mode guests on Arm, the part of RAM to shadow the resident
processes is not necessary. Therefore, this commit splits the function
`libxl_get_required_shadow_memory()` into arch-specific helpers and
renames the helper to `libxl__arch_get_required_paging_memory()`.

On x86, this helper returns the original value from
`libxl_get_required_shadow_memory()`, so no functional change is
intended.

On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
for the P2M map.

Also update the xl.cfg documentation to add Arm documentation matching
the code changes, and correct the comment style to follow the Xen
coding style.

This is part of CVE-2022-33747 / XSA-409.

Suggested-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
---
 docs/man/xl.cfg.5.pod.in  |  5 +++++
 tools/libxl/libxl_arch.h  |  4 ++++
 tools/libxl/libxl_arm.c   | 12 ++++++++++++
 tools/libxl/libxl_utils.c |  9 ++-------
 tools/libxl/libxl_x86.c   | 12 ++++++++++++
 5 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
index 245d3f94728a..3b297c6a97f0 100644
--- a/docs/man/xl.cfg.5.pod.in
+++ b/docs/man/xl.cfg.5.pod.in
@@ -1790,6 +1790,11 @@ are not using hardware assisted paging (i.e. you are using shadow
 mode) and your guest workload consists of a very large number of
 similar processes then increasing this value may improve performance.
 
+On Arm, this field is used to determine the size of the guest P2M pages
+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for
+the P2M map. Users should adjust this value if bigger P2M pool size is
+needed.
+
 =back
 
 =head3 Processor and Platform Features
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index 6a91775b9e20..b09f868490ac 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -83,6 +83,10 @@ int libxl__arch_extra_memory(libxl__gc *gc,
                              const libxl_domain_build_info *info,
                              uint64_t *out);
 
+_hidden
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus);
+
 #if defined(__i386__) || defined(__x86_64__)
 
 #define LAPIC_BASE_ADDRESS  0xfee00000
diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
index 34f8a29056db..f4b3dc8e7139 100644
--- a/tools/libxl/libxl_arm.c
+++ b/tools/libxl/libxl_arm.c
@@ -153,6 +153,18 @@ out:
     return rc;
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + maxmem_kb / 1024);
+}
+
 static struct arch_info {
     const char *guest_type;
     const char *timer_compat;
diff --git a/tools/libxl/libxl_utils.c b/tools/libxl/libxl_utils.c
index b039143b8aef..e18b1524ef78 100644
--- a/tools/libxl/libxl_utils.c
+++ b/tools/libxl/libxl_utils.c
@@ -18,6 +18,7 @@
 #include <ctype.h>
 
 #include "libxl_internal.h"
+#include "libxl_arch.h"
 #include "_paths.h"
 
 #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE
@@ -39,13 +40,7 @@ char *libxl_basename(const char *name)
 
 unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus)
 {
-    /* 256 pages (1MB) per vcpu,
-       plus 1 page per MiB of RAM for the P2M map,
-       plus 1 page per MiB of RAM to shadow the resident processes.
-       This is higher than the minimum that Xen would allocate if no value
-       were given (but the Xen minimum is for safety, not performance).
-     */
-    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+    return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus);
 }
 
 char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index f34c0edc1029..348876e5c057 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -681,6 +681,18 @@ int libxl__arch_passthrough_mode_setdefault(libxl__gc *gc,
     return rc;
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * plus 1 page per MiB of RAM to shadow the resident processes.
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+}
 
 /*
  * Local variables:
-- 
2.37.1
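
As a worked example of the Arm formula in the patch above (not part of
the patch): for a guest with 4 vCPUs and 2 GiB of maxmem
(maxmem_kb = 2097152), libxl__arch_get_required_paging_memory() returns

    4 * (256 * 4 + 2097152 / 1024) = 4 * (1024 + 2048) = 12288 KB

i.e. 12MB: 1MB per vCPU (4MB) plus 4KB per MiB of RAM (8MB).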

From 827fd84345ac7c82fdf9d16ebd3413339a91b678 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:28 +0000
Subject: [PATCH 2/4] xen/arm: Construct the P2M pages pool for guests

This commit introduces the data structures and helpers that make up
the p2m pages pool for guests.

This is implemented by:

- Adding a `struct paging_domain`, which contains a freelist, a
counter variable and a spinlock, to `struct arch_domain` to
track the free p2m pages and the total number of p2m pages in
the p2m pages pool.

- Adding a helper `p2m_get_allocation` to get the p2m pool size.

- Adding a helper `p2m_set_allocation` to set the p2m pages pool
size. This helper should be called before allocating memory for
a guest.

- Adding a helper `p2m_teardown_allocation` to free the p2m pages
pool. This helper should be called during xl domain destruction.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c           | 88 ++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/domain.h | 10 ++++
 xen/include/asm-arm/p2m.h    |  4 ++
 3 files changed, 102 insertions(+)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 42638787a295..7d6fec788795 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -53,6 +53,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+/* Return the size of the pool, rounded up to the nearest MB */
+unsigned int p2m_get_allocation(struct domain *d)
+{
+    unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
+
+    return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
+}
+
+/*
+ * Set the pool of pages to the required number of pages.
+ * Returns 0 for success, non-zero for failure.
+ * Call with d->arch.paging.lock held.
+ */
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
+{
+    struct page_info *pg;
+
+    ASSERT(spin_is_locked(&d->arch.paging.lock));
+
+    for ( ; ; )
+    {
+        if ( d->arch.paging.p2m_total_pages < pages )
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_page(NULL, 0);
+            if ( pg == NULL )
+            {
+                printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
+                return -ENOMEM;
+            }
+            ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                d->arch.paging.p2m_total_pages + 1;
+            page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+        }
+        else if ( d->arch.paging.p2m_total_pages > pages )
+        {
+            /* Need to return memory to domheap */
+            pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+            if( pg )
+            {
+                ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                    d->arch.paging.p2m_total_pages - 1;
+                free_domheap_page(pg);
+            }
+            else
+            {
+                printk(XENLOG_ERR
+                       "Failed to free P2M pages, P2M freelist is empty.\n");
+                return -ENOMEM;
+            }
+        }
+        else
+            break;
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && general_preempt_check() )
+        {
+            *preempted = true;
+            return -ERESTART;
+        }
+    }
+
+    return 0;
+}
+
+int p2m_teardown_allocation(struct domain *d)
+{
+    int ret = 0;
+    bool preempted = false;
+
+    spin_lock(&d->arch.paging.lock);
+    if ( d->arch.paging.p2m_total_pages != 0 )
+    {
+        ret = p2m_set_allocation(d, 0, &preempted);
+        if ( preempted )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return -ERESTART;
+        }
+        ASSERT(d->arch.paging.p2m_total_pages == 0);
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return ret;
+}
+
 /* Unlock the flush and do a P2M TLB flush if necessary */
 void p2m_write_unlock(struct p2m_domain *p2m)
 {
@@ -1567,7 +1653,9 @@ int p2m_init(struct domain *d)
     unsigned int cpu;
 
     rwlock_init(&p2m->lock);
+    spin_lock_init(&d->arch.paging.lock);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
 
     p2m->vmid = INVALID_VMID;
 
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 9b44a9648c50..7bc14c2e9e47 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -42,6 +42,14 @@ struct vtimer {
         uint64_t cval;
 };
 
+struct paging_domain {
+    spinlock_t lock;
+    /* Free P2M pages from the pre-allocated P2M pool */
+    struct page_list_head p2m_freelist;
+    /* Number of pages from the pre-allocated P2M pool */
+    unsigned long p2m_total_pages;
+};
+
 struct arch_domain
 {
 #ifdef CONFIG_ARM_64
@@ -53,6 +61,8 @@ struct arch_domain
 
     struct hvm_domain hvm;
 
+    struct paging_domain paging;
+
     struct vmmio vmmio;
 
     /* Continuable domain_relinquish_resources(). */
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 20df6212712e..b1c9b947bb8f 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -197,6 +197,10 @@ void p2m_restore_state(struct vcpu *n);
 /* Print debugging/statistial info about a domain's p2m */
 void p2m_dump_info(struct domain *d);
 
+unsigned int p2m_get_allocation(struct domain *d);
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
+int p2m_teardown_allocation(struct domain *d);
+
 static inline void p2m_write_lock(struct p2m_domain *p2m)
 {
     write_lock(&p2m->lock);
-- 
2.37.1
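
As a worked example of p2m_get_allocation() in the patch above (not
part of the patch): with 4KB pages, 1 << (20 - PAGE_SHIFT) is 256
pages per MB, so a pool holding 300 pages is rounded up to 512 pages
and reported as 2MB, while a pool of exactly 256 pages is reported
as 1MB.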

From 3f3cec12763cd612fcc00b809da4a7f357e413e1 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:29 +0000
Subject: [PATCH 3/4] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm

This commit implements the `XEN_DOMCTL_shadow_op` support in Xen
for Arm. The p2m pages pool size for xl guests is supposed to be
determined by `XEN_DOMCTL_shadow_op`. Hence, this commit:

- Introduces a function `p2m_domctl` and implements the subops
`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.

- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.

This enables setting the shadow memory pool size when creating a
guest from xl, and getting the shadow memory pool size from Xen.

Note that the `XEN_DOMCTL_shadow_op` added in this commit is only a
dummy op; the functionality of setting/getting the p2m memory pool
size for xl guests will be added in the following commits.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 tools/libxl/libxl_arm.c | 12 ++++++++++++
 xen/arch/arm/domctl.c   | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
index f4b3dc8e7139..025df1bfd004 100644
--- a/tools/libxl/libxl_arm.c
+++ b/tools/libxl/libxl_arm.c
@@ -130,6 +130,18 @@ int libxl__arch_domain_save_config(libxl__gc *gc,
 int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                               uint32_t domid)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
+
+    int r = xc_shadow_control(ctx->xch, domid,
+                              XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
+                              &shadow_mb, 0);
+    if (r) {
+        LOGED(ERROR, domid,
+              "Failed to set %u MiB shadow allocation", shadow_mb);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }
 
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9da88b8c64e2..ef1299ae1c64 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -45,11 +45,43 @@ static int handle_vuart_init(struct domain *d,
     return rc;
 }
 
+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+                       XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    if ( unlikely(d == current->domain) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
+        return -EINVAL;
+    }
+
+    if ( unlikely(d->is_dying) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
+               d->domain_id);
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+        return 0;
+    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+        return 0;
+    default:
+    {
+        printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
+        return -EINVAL;
+    }
+    }
+}
+
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+        return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
     case XEN_DOMCTL_cacheflush:
     {
         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
-- 
2.37.1
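
For completeness, a toolstack could query the pool size back with the
matching GET subop; a minimal sketch mirroring the call style used in
the patch above (error handling elided, not part of the patch):

    unsigned int mb = 0;

    /* On success, 'mb' holds the current P2M pool size in MiB. */
    int r = xc_shadow_control(ctx->xch, domid,
                              XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION,
                              &mb, 0);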

From 8ee0aa8f61ef6f1135d05decdecb45e7a940e784 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:30 +0000
Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool

This commit sets up and tears down the p2m pages pool for
non-privileged Arm guests by calling `p2m_set_allocation` and
`p2m_teardown_allocation`.

- For dom0, P2M pages should come directly from the heap instead of
the p2m pool, so that the kernel may take advantage of the extended
regions.

- For xl guests, the setting of the p2m pool is called in
`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in
`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is
updated with the new size when setting the p2m pool.

- For dom0less domUs, the setting of the p2m pool is called before
allocating memory during domain creation. Users can specify the p2m
pool size by `xen,domain-p2m-mem-mb` dts property.

To actually allocate/free pages from the p2m pool, this commit adds
two helper functions, namely `p2m_alloc_page` and `p2m_free_page`, for
`struct p2m_domain`. By replacing `alloc_domheap_page` and
`free_domheap_page` with these two helper functions, p2m pages can
be added to/removed from the p2m pool list rather than the heap.

Since the page from `p2m_alloc_page` is cleaned, take the opportunity
to remove the redundant `clean_page` call in `p2m_create_table`.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 docs/misc/arm/device-tree/booting.txt |  8 ++++
 xen/arch/arm/domain.c                 |  8 ++++
 xen/arch/arm/domain_build.c           | 29 ++++++++++++++
 xen/arch/arm/domctl.c                 | 23 ++++++++++-
 xen/arch/arm/p2m.c                    | 57 +++++++++++++++++++++++++--
 xen/include/asm-arm/domain.h          |  1 +
 6 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 5243bc7fd344..470c9491a781 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -164,6 +164,14 @@ with the following properties:
     Both #address-cells and #size-cells need to be specified because
     both sub-nodes (described shortly) have reg properties.
 
+- xen,domain-p2m-mem-mb
+
+    Optional. A 32-bit integer specifying the amount of megabytes of RAM
+    used for the domain P2M pool. This is in-sync with the shadow_memory
+    option in xl.cfg. Leaving this field empty in device tree will lead to
+    the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB
+    per MB of guest RAM plus 512KB for guest extended regions.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 1e24a7dbb4a1..31abe7d6f9dc 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -1022,6 +1022,14 @@ int domain_relinquish_resources(struct domain *d)
         if ( ret )
             return ret;
 
+        d->arch.relmem = RELMEM_p2m_pool;
+        /* Fallthrough */
+
+    case RELMEM_p2m_pool:
+        ret = p2m_teardown_allocation(d);
+        if( ret )
+            return ret;
+
         d->arch.relmem = RELMEM_done;
         /* Fallthrough */
 
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index ce7f61e8259f..eb859600e57d 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -2327,6 +2327,21 @@ static void __init find_gnttab_region(struct domain *d,
            kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
 }
 
+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
+                                             unsigned int smp_cpus)
+{
+    /*
+     * Keep in sync with libxl__get_required_paging_memory().
+     * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
+     * plus 128 pages to cover extended regions.
+     */
+    unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
+
+    BUILD_BUG_ON(PAGE_SIZE != SZ_4K);
+
+    return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
+}
+
 static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
 {
     unsigned int i;
@@ -2418,6 +2433,8 @@ static int __init construct_domU(struct domain *d,
     struct kernel_info kinfo = {};
     int rc;
     u64 mem;
+    u32 p2m_mem_mb;
+    unsigned long p2m_pages;
 
     rc = dt_property_read_u64(node, "memory", &mem);
     if ( !rc )
@@ -2427,6 +2444,18 @@ static int __init construct_domU(struct domain *d,
     }
     kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
 
+    rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
+    /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
+    p2m_pages = rc ?
+                p2m_mem_mb << (20 - PAGE_SHIFT) :
+                domain_p2m_pages(mem, d->max_vcpus);
+
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, p2m_pages, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc != 0 )
+        return rc;
+
     printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index ef1299ae1c64..dab3da3a23bc 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -48,6 +48,9 @@ static int handle_vuart_init(struct domain *d,
 static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
                        XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long rc;
+    bool preempted = false;
+
     if ( unlikely(d == current->domain) )
     {
         printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
@@ -64,9 +67,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
-        return 0;
+    {
+        /* Allow and handle preemption */
+        spin_lock(&d->arch.paging.lock);
+        rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
+        spin_unlock(&d->arch.paging.lock);
+
+        if ( preempted )
+            /* Not finished. Set up to re-run the call. */
+            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+                                               u_domctl);
+        else
+            /* Finished. Return the new allocation. */
+            sc->mb = p2m_get_allocation(d);
+
+        return rc;
+    }
     case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+    {
+        sc->mb = p2m_get_allocation(d);
         return 0;
+    }
     default:
     {
         printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 7d6fec788795..31966905446e 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -53,6 +53,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For hardware domain, there should be no limit in the number of pages that
+     * can be allocated, so that the kernel may take advantage of the extended
+     * regions. Hence, allocate p2m pages for hardware domains from heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--;
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock);
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        d->arch.paging.p2m_total_pages++;
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 unsigned int p2m_get_allocation(struct domain *d)
 {
@@ -754,7 +802,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
 
     ASSERT(!p2m_is_valid(*entry));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( page == NULL )
         return -ENOMEM;
 
@@ -874,7 +922,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
     pg = mfn_to_page(mfn);
 
     page_list_del(pg, &p2m->pages);
-    free_domheap_page(pg);
+    p2m_free_page(p2m->domain, pg);
 }
 
 static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
@@ -898,7 +946,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
     ASSERT(level < target);
     ASSERT(p2m_is_superpage(*entry, level));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( !page )
         return false;
 
@@ -1609,7 +1657,7 @@ int p2m_teardown(struct domain *d)
 
     while ( (pg = page_list_remove_head(&p2m->pages)) )
     {
-        free_domheap_page(pg);
+        p2m_free_page(p2m->domain, pg);
         count++;
         /* Arbitrarily preempt every 512 iterations */
         if ( !(count % 512) && hypercall_preempt_check() )
@@ -1633,6 +1681,7 @@ void p2m_final_teardown(struct domain *d)
         return;
 
     ASSERT(page_list_empty(&p2m->pages));
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
 
     if ( p2m->root )
         free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 7bc14c2e9e47..dc5b26d15e1d 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -73,6 +73,7 @@ struct arch_domain
         RELMEM_page,
         RELMEM_mapping,
         RELMEM_p2m,
+        RELMEM_p2m_pool,
         RELMEM_done,
     } relmem;
 
-- 
2.37.1
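
As an illustration of the xen,domain-p2m-mem-mb property documented in
the patch above, a dom0less domU node might look like this (values and
surrounding properties are an assumed example following
docs/misc/arm/device-tree/booting.txt):

    domU1 {
        compatible = "xen,domain";
        #address-cells = <1>;
        #size-cells = <1>;
        memory = <0 524288>;          /* 512MB of RAM, in KB */
        cpus = <2>;
        xen,domain-p2m-mem-mb = <8>;  /* 8MB P2M pool for this domU */

        /* kernel and ramdisk sub-nodes omitted */
    };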

From 2609b6e6d52783f1a58e4c50832a9c6ebd671336 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Tue, 9 Aug 2022 06:39:23 +0000
Subject: [PATCH 1/4] libxl, docs: Use arch-specific default paging memory

The default paging memory (described by the `shadow_memory` entry in
the xl config) in libxl is used to determine the memory pool size for
xl guests. Currently this size is only used for x86, and contains a
part of RAM to shadow the resident processes. Since there are no
shadow mode guests on Arm, the part of RAM to shadow the resident
processes is not necessary. Therefore, this commit splits the function
`libxl_get_required_shadow_memory()` into arch-specific helpers and
renames the helper to `libxl__arch_get_required_paging_memory()`.

On x86, this helper returns the original value from
`libxl_get_required_shadow_memory()`, so no functional change is
intended.

On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
for the P2M map.

Also update the xl.cfg documentation to add Arm documentation matching
the code changes, and correct the comment style to follow the Xen
coding style.

This is part of CVE-2022-33747 / XSA-409.

Suggested-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
---
 docs/man/xl.cfg.5.pod.in  |  5 +++++
 tools/libxl/libxl_arch.h  |  4 ++++
 tools/libxl/libxl_arm.c   | 12 ++++++++++++
 tools/libxl/libxl_utils.c |  9 ++-------
 tools/libxl/libxl_x86.c   | 12 ++++++++++++
 5 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
index 0532739c1fff..2224080b30ce 100644
--- a/docs/man/xl.cfg.5.pod.in
+++ b/docs/man/xl.cfg.5.pod.in
@@ -1803,6 +1803,11 @@ are not using hardware assisted paging (i.e. you are using shadow
 mode) and your guest workload consists of a very large number of
 similar processes then increasing this value may improve performance.
 
+On Arm, this field is used to determine the size of the guest P2M pages
+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for
+the P2M map. Users should adjust this value if bigger P2M pool size is
+needed.
+
 =back
 
 =head3 Processor and Platform Features
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index 6a91775b9e20..b09f868490ac 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -83,6 +83,10 @@ int libxl__arch_extra_memory(libxl__gc *gc,
                              const libxl_domain_build_info *info,
                              uint64_t *out);
 
+_hidden
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus);
+
 #if defined(__i386__) || defined(__x86_64__)
 
 #define LAPIC_BASE_ADDRESS  0xfee00000
diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
index 34f8a29056db..f4b3dc8e7139 100644
--- a/tools/libxl/libxl_arm.c
+++ b/tools/libxl/libxl_arm.c
@@ -153,6 +153,18 @@ out:
     return rc;
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + maxmem_kb / 1024);
+}
+
 static struct arch_info {
     const char *guest_type;
     const char *timer_compat;
diff --git a/tools/libxl/libxl_utils.c b/tools/libxl/libxl_utils.c
index b039143b8aef..e18b1524ef78 100644
--- a/tools/libxl/libxl_utils.c
+++ b/tools/libxl/libxl_utils.c
@@ -18,6 +18,7 @@
 #include <ctype.h>
 
 #include "libxl_internal.h"
+#include "libxl_arch.h"
 #include "_paths.h"
 
 #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE
@@ -39,13 +40,7 @@ char *libxl_basename(const char *name)
 
 unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus)
 {
-    /* 256 pages (1MB) per vcpu,
-       plus 1 page per MiB of RAM for the P2M map,
-       plus 1 page per MiB of RAM to shadow the resident processes.
-       This is higher than the minimum that Xen would allocate if no value
-       were given (but the Xen minimum is for safety, not performance).
-     */
-    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+    return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus);
 }
 
 char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 07c7b05e0d36..0ad455301d88 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -852,6 +852,18 @@ int libxl__arch_passthrough_mode_setdefault(libxl__gc *gc,
     return rc;
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * plus 1 page per MiB of RAM to shadow the resident processes.
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+}
 
 /*
  * Local variables:
-- 
2.37.1

From a6d56368473a283cbfd93da8ec5c70b3bc86aab3 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:28 +0000
Subject: [PATCH 2/4] xen/arm: Construct the P2M pages pool for guests

This commit introduces the data structures and helpers that make up
the p2m pages pool for guests.

This is implemented by:

- Adding a `struct paging_domain`, which contains a freelist, a
counter variable and a spinlock, to `struct arch_domain` to
track the free p2m pages and the total number of p2m pages in
the p2m pages pool.

- Adding a helper `p2m_get_allocation` to get the p2m pool size.

- Adding a helper `p2m_set_allocation` to set the p2m pages pool
size. This helper should be called before allocating memory for
a guest.

- Adding a helper `p2m_teardown_allocation` to free the p2m pages
pool. This helper should be called during xl domain destruction.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c           | 88 ++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/domain.h | 10 ++++
 xen/include/asm-arm/p2m.h    |  4 ++
 3 files changed, 102 insertions(+)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 62f4d31dc1cc..0c331a36a536 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -49,6 +49,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+/* Return the size of the pool, rounded up to the nearest MB */
+unsigned int p2m_get_allocation(struct domain *d)
+{
+    unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
+
+    return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
+}
+
+/*
+ * Set the pool of pages to the required number of pages.
+ * Returns 0 for success, non-zero for failure.
+ * Call with d->arch.paging.lock held.
+ */
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
+{
+    struct page_info *pg;
+
+    ASSERT(spin_is_locked(&d->arch.paging.lock));
+
+    for ( ; ; )
+    {
+        if ( d->arch.paging.p2m_total_pages < pages )
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_page(NULL, 0);
+            if ( pg == NULL )
+            {
+                printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
+                return -ENOMEM;
+            }
+            ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                d->arch.paging.p2m_total_pages + 1;
+            page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+        }
+        else if ( d->arch.paging.p2m_total_pages > pages )
+        {
+            /* Need to return memory to domheap */
+            pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+            if( pg )
+            {
+                ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                    d->arch.paging.p2m_total_pages - 1;
+                free_domheap_page(pg);
+            }
+            else
+            {
+                printk(XENLOG_ERR
+                       "Failed to free P2M pages, P2M freelist is empty.\n");
+                return -ENOMEM;
+            }
+        }
+        else
+            break;
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && general_preempt_check() )
+        {
+            *preempted = true;
+            return -ERESTART;
+        }
+    }
+
+    return 0;
+}
+
+int p2m_teardown_allocation(struct domain *d)
+{
+    int ret = 0;
+    bool preempted = false;
+
+    spin_lock(&d->arch.paging.lock);
+    if ( d->arch.paging.p2m_total_pages != 0 )
+    {
+        ret = p2m_set_allocation(d, 0, &preempted);
+        if ( preempted )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return -ERESTART;
+        }
+        ASSERT(d->arch.paging.p2m_total_pages == 0);
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return ret;
+}
+
 /* Unlock the flush and do a P2M TLB flush if necessary */
 void p2m_write_unlock(struct p2m_domain *p2m)
 {
@@ -1568,7 +1654,9 @@ int p2m_init(struct domain *d)
     unsigned int cpu;
 
     rwlock_init(&p2m->lock);
+    spin_lock_init(&d->arch.paging.lock);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
 
     p2m->vmid = INVALID_VMID;
 
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 9c4db75f087d..96a878d33448 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -42,6 +42,14 @@ struct vtimer {
     uint64_t cval;
 };
 
+struct paging_domain {
+    spinlock_t lock;
+    /* Free P2M pages from the pre-allocated P2M pool */
+    struct page_list_head p2m_freelist;
+    /* Number of pages from the pre-allocated P2M pool */
+    unsigned long p2m_total_pages;
+};
+
 struct arch_domain
 {
 #ifdef CONFIG_ARM_64
@@ -53,6 +61,8 @@ struct arch_domain
 
     struct hvm_domain hvm;
 
+    struct paging_domain paging;
+
     struct vmmio vmmio;
 
     /* Continuable domain_relinquish_resources(). */
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index f40f82794da9..b733f55d481c 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -209,6 +209,10 @@ void p2m_restore_state(struct vcpu *n);
 /* Print debugging/statistial info about a domain's p2m */
 void p2m_dump_info(struct domain *d);
 
+unsigned int p2m_get_allocation(struct domain *d);
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
+int p2m_teardown_allocation(struct domain *d);
+
 static inline void p2m_write_lock(struct p2m_domain *p2m)
 {
     write_lock(&p2m->lock);
-- 
2.37.1

From 74365d8ab4543963bbac83cff77e742a3cc8adb0 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:29 +0000
Subject: [PATCH 3/4] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm

This commit implements the `XEN_DOMCTL_shadow_op` support in Xen
for Arm. The p2m pages pool size for xl guests is supposed to be
determined by `XEN_DOMCTL_shadow_op`. Hence, this commit:

- Introduces a function `p2m_domctl` and implements the subops
`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.

- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.

This enables setting the shadow memory pool size when creating a
guest from xl, and getting the shadow memory pool size from Xen.

Note that the `XEN_DOMCTL_shadow_op` added in this commit is only a
dummy op; the functionality of setting/getting the p2m memory pool
size for xl guests will be added in the following commits.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 tools/libxl/libxl_arm.c | 12 ++++++++++++
 xen/arch/arm/domctl.c   | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/tools/libxl/libxl_arm.c b/tools/libxl/libxl_arm.c
index f4b3dc8e7139..025df1bfd004 100644
--- a/tools/libxl/libxl_arm.c
+++ b/tools/libxl/libxl_arm.c
@@ -130,6 +130,18 @@ int libxl__arch_domain_save_config(libxl__gc *gc,
 int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                               uint32_t domid)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
+
+    int r = xc_shadow_control(ctx->xch, domid,
+                              XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
+                              &shadow_mb, 0);
+    if (r) {
+        LOGED(ERROR, domid,
+              "Failed to set %u MiB shadow allocation", shadow_mb);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }
 
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9da88b8c64e2..ef1299ae1c64 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -45,11 +45,43 @@ static int handle_vuart_init(struct domain *d,
     return rc;
 }
 
+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+                       XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    if ( unlikely(d == current->domain) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
+        return -EINVAL;
+    }
+
+    if ( unlikely(d->is_dying) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
+               d->domain_id);
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+        return 0;
+    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+        return 0;
+    default:
+    {
+        printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
+        return -EINVAL;
+    }
+    }
+}
+
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+        return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
     case XEN_DOMCTL_cacheflush:
     {
         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
-- 
2.37.1

From b4a6a93c8e06403c1bedfa14583aad8b8401e301 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:30 +0000
Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool

This commit sets up and tears down the p2m pages pool for
non-privileged Arm guests by calling `p2m_set_allocation` and
`p2m_teardown_allocation`.

- For dom0, P2M pages should come directly from the heap instead of
the p2m pool, so that the kernel may take advantage of the extended
regions.

- For xl guests, the setting of the p2m pool is called in
`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in
`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is
updated with the new size when setting the p2m pool.

- For dom0less domUs, the setting of the p2m pool is called before
allocating memory during domain creation. Users can specify the p2m
pool size by `xen,domain-p2m-mem-mb` dts property.

To actually allocate/free pages from the p2m pool, this commit adds
two helper functions, namely `p2m_alloc_page` and `p2m_free_page`, for
`struct p2m_domain`. By replacing `alloc_domheap_page` and
`free_domheap_page` with these two helper functions, p2m pages can
be added to/removed from the p2m pool list rather than the heap.

Since the page from `p2m_alloc_page` is cleaned, take the opportunity
to remove the redundant `clean_page` call in `p2m_create_table`.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 docs/misc/arm/device-tree/booting.txt |  8 ++++
 xen/arch/arm/domain.c                 |  6 +++
 xen/arch/arm/domain_build.c           | 29 ++++++++++++++
 xen/arch/arm/domctl.c                 | 23 ++++++++++-
 xen/arch/arm/p2m.c                    | 57 +++++++++++++++++++++++++--
 5 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 5243bc7fd344..470c9491a781 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -164,6 +164,14 @@ with the following properties:
     Both #address-cells and #size-cells need to be specified because
     both sub-nodes (described shortly) have reg properties.
 
+- xen,domain-p2m-mem-mb
+
+    Optional. A 32-bit integer specifying the amount of megabytes of RAM
+    used for the domain P2M pool. This is in-sync with the shadow_memory
+    option in xl.cfg. Leaving this field empty in device tree will lead to
+    the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB
+    per MB of guest RAM plus 512KB for guest extended regions.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index caa625bd16f9..aae615f7d6ad 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -980,6 +980,7 @@ enum {
     PROG_page,
     PROG_mapping,
     PROG_p2m,
+    PROG_p2m_pool,
     PROG_done,
 };
 
@@ -1035,6 +1036,11 @@ int domain_relinquish_resources(struct domain *d)
         if ( ret )
             return ret;
 
+    PROGRESS(p2m_pool):
+        ret = p2m_teardown_allocation(d);
+        if( ret )
+            return ret;
+
     PROGRESS(done):
         break;
 
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index f49dbf1ca12e..3c05fa5ac7d0 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -2333,6 +2333,21 @@ static void __init find_gnttab_region(struct domain *d,
            kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
 }
 
+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
+                                             unsigned int smp_cpus)
+{
+    /*
+     * Keep in sync with libxl__get_required_paging_memory().
+     * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
+     * plus 128 pages to cover extended regions.
+     */
+    unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
+
+    BUILD_BUG_ON(PAGE_SIZE != SZ_4K);
+
+    return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
+}
+
 static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
 {
     unsigned int i;
@@ -2424,6 +2439,8 @@ static int __init construct_domU(struct domain *d,
     struct kernel_info kinfo = {};
     int rc;
     u64 mem;
+    u32 p2m_mem_mb;
+    unsigned long p2m_pages;
 
     rc = dt_property_read_u64(node, "memory", &mem);
     if ( !rc )
@@ -2433,6 +2450,18 @@ static int __init construct_domU(struct domain *d,
     }
     kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
 
+    rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
+    /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
+    p2m_pages = rc ?
+                p2m_mem_mb << (20 - PAGE_SHIFT) :
+                domain_p2m_pages(mem, d->max_vcpus);
+
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, p2m_pages, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc != 0 )
+        return rc;
+
     printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index ef1299ae1c64..dab3da3a23bc 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -48,6 +48,9 @@ static int handle_vuart_init(struct domain *d,
 static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
                        XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long rc;
+    bool preempted = false;
+
     if ( unlikely(d == current->domain) )
     {
         printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
@@ -64,9 +67,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
-        return 0;
+    {
+        /* Allow and handle preemption */
+        spin_lock(&d->arch.paging.lock);
+        rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
+        spin_unlock(&d->arch.paging.lock);
+
+        if ( preempted )
+            /* Not finished. Set up to re-run the call. */
+            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+                                               u_domctl);
+        else
+            /* Finished. Return the new allocation. */
+            sc->mb = p2m_get_allocation(d);
+
+        return rc;
+    }
     case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+    {
+        sc->mb = p2m_get_allocation(d);
         return 0;
+    }
     default:
     {
         printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 0c331a36a536..13b06c0fe41d 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -49,6 +49,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For hardware domain, there should be no limit in the number of pages that
+     * can be allocated, so that the kernel may take advantage of the extended
+     * regions. Hence, allocate p2m pages for hardware domains from heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--;
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock);
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        d->arch.paging.p2m_total_pages++;
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 unsigned int p2m_get_allocation(struct domain *d)
 {
@@ -750,7 +798,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
 
     ASSERT(!p2m_is_valid(*entry));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( page == NULL )
         return -ENOMEM;
 
@@ -870,7 +918,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
     pg = mfn_to_page(mfn);
 
     page_list_del(pg, &p2m->pages);
-    free_domheap_page(pg);
+    p2m_free_page(p2m->domain, pg);
 }
 
 static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
@@ -894,7 +942,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
     ASSERT(level < target);
     ASSERT(p2m_is_superpage(*entry, level));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( !page )
         return false;
 
@@ -1610,7 +1658,7 @@ int p2m_teardown(struct domain *d)
 
     while ( (pg = page_list_remove_head(&p2m->pages)) )
     {
-        free_domheap_page(pg);
+        p2m_free_page(p2m->domain, pg);
         count++;
         /* Arbitrarily preempt every 512 iterations */
         if ( !(count % 512) && hypercall_preempt_check() )
@@ -1634,6 +1682,7 @@ void p2m_final_teardown(struct domain *d)
         return;
 
     ASSERT(page_list_empty(&p2m->pages));
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
 
     if ( p2m->root )
         free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
-- 
2.37.1
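
Editorial note on the SET_ALLOCATION handler above: sc->mb is expressed
in MiB while p2m_set_allocation() takes a page count, hence the
`sc->mb << (20 - PAGE_SHIFT)` conversion. A minimal stand-alone sketch
of that conversion, assuming the 4KB page size (PAGE_SHIFT == 12) used
throughout this series; this is not part of the signed patches:

    /* Sketch only -- not part of the signed patches. */
    #include <stdio.h>

    #define PAGE_SHIFT 12 /* 4KB pages, as asserted by the series */

    int main(void)
    {
        unsigned long mb = 4; /* hypothetical sc->mb value */
        /* One MiB is 1 << (20 - PAGE_SHIFT) = 256 pages of 4KB. */
        unsigned long pages = mb << (20 - PAGE_SHIFT);

        printf("%lu MiB -> %lu pages\n", mb, pages); /* 4 MiB -> 1024 pages */
        return 0;
    }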

From 41b51342e789621d2bdd825b8e73e946fdc6ba93 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Tue, 9 Aug 2022 06:39:23 +0000
Subject: [PATCH 1/4] libxl, docs: Use arch-specific default paging memory

The default paging memory (described in the `shadow_memory` entry in xl
config) in libxl is used to determine the memory pool size for xl
guests. Currently this size is only used for x86, and contains a part
of RAM to shadow the resident processes. Since there are no shadow-mode
guests on Arm, the part of RAM to shadow the resident processes is not
necessary. Therefore, this commit splits the function
`libxl_get_required_shadow_memory()` into arch-specific helpers and
renames the helper to `libxl__arch_get_required_paging_memory()`.

On x86, this helper returns the same value as the original
`libxl_get_required_shadow_memory()`, so no functional change is
intended.

On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
for the P2M map.

Also update the xl.cfg documentation to add the Arm documentation
matching the code changes, and correct the comment style to follow
the Xen coding style.

This is part of CVE-2022-33747 / XSA-409.

Suggested-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
---
 docs/man/xl.cfg.5.pod.in       |  5 +++++
 tools/libs/light/libxl_arch.h  |  4 ++++
 tools/libs/light/libxl_arm.c   | 12 ++++++++++++
 tools/libs/light/libxl_utils.c |  9 ++-------
 tools/libs/light/libxl_x86.c   | 13 +++++++++++++
 5 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
index 56370a37dbb1..af7fae7c52f9 100644
--- a/docs/man/xl.cfg.5.pod.in
+++ b/docs/man/xl.cfg.5.pod.in
@@ -1746,6 +1746,11 @@ are not using hardware assisted paging (i.e. you are using shadow
 mode) and your guest workload consists of a very large number of
 similar processes then increasing this value may improve performance.
 
+On Arm, this field is used to determine the size of the guest P2M pages
+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for
+the P2M map. Users should adjust this value if bigger P2M pool size is
+needed.
+
 =back
 
 =head3 Processor and Platform Features
diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h
index 8527fc5c6c23..6741b7f6f457 100644
--- a/tools/libs/light/libxl_arch.h
+++ b/tools/libs/light/libxl_arch.h
@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
                                       libxl_domain_config *dst,
                                       const libxl_domain_config *src);
 
+_hidden
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus);
+
 #if defined(__i386__) || defined(__x86_64__)
 
 #define LAPIC_BASE_ADDRESS  0xfee00000
diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
index e2901f13b724..d59b464192c2 100644
--- a/tools/libs/light/libxl_arm.c
+++ b/tools/libs/light/libxl_arm.c
@@ -154,6 +154,18 @@ out:
     return rc;
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + maxmem_kb / 1024);
+}
+
 static struct arch_info {
     const char *guest_type;
     const char *timer_compat;
diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c
index 4699c4a0a36f..e276c0ee9cc3 100644
--- a/tools/libs/light/libxl_utils.c
+++ b/tools/libs/light/libxl_utils.c
@@ -18,6 +18,7 @@
 #include <ctype.h>
 
 #include "libxl_internal.h"
+#include "libxl_arch.h"
 #include "_paths.h"
 
 #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE
@@ -39,13 +40,7 @@ char *libxl_basename(const char *name)
 
 unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus)
 {
-    /* 256 pages (1MB) per vcpu,
-       plus 1 page per MiB of RAM for the P2M map,
-       plus 1 page per MiB of RAM to shadow the resident processes.
-       This is higher than the minimum that Xen would allocate if no value
-       were given (but the Xen minimum is for safety, not performance).
-     */
-    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+    return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus);
 }
 
 char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c
index 18c3c77ccde3..4d66478fe9dd 100644
--- a/tools/libs/light/libxl_x86.c
+++ b/tools/libs/light/libxl_x86.c
@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
                     libxl_defbool_val(src->b_info.arch_x86.msr_relaxed));
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * plus 1 page per MiB of RAM to shadow the resident processes.
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+}
+
 /*
  * Local variables:
  * mode: C
-- 
2.37.1
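
As a worked example of the Arm helper introduced above (an editorial
sketch with illustrative numbers, not part of the signed patches): a
guest with 4 vCPUs and 2 GiB of RAM gets 4 * (256 * 4 + 2048) =
12288 KB, i.e. a 12 MiB default pool.

    /* Sketch mirroring the Arm variant of
     * libxl__arch_get_required_paging_memory() above; the vCPU and RAM
     * figures are illustrative assumptions. */
    #include <stdio.h>

    static unsigned long arm_paging_kb(unsigned long maxmem_kb,
                                       unsigned int smp_cpus)
    {
        /* 256 pages (1MB) per vcpu plus 1 page (4KB) per MiB of RAM. */
        return 4 * (256 * smp_cpus + maxmem_kb / 1024);
    }

    int main(void)
    {
        /* 4 vCPUs, 2 GiB RAM -> 4 * (1024 + 2048) = 12288 KB (12 MiB). */
        printf("%lu KB\n", arm_paging_kb(2UL * 1024 * 1024, 4));
        return 0;
    }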

From 5d7dc6237e7b5e73b2f58a8b7545ce4b89543665 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:28 +0000
Subject: [PATCH 2/4] xen/arm: Construct the P2M pages pool for guests

This commit constructs the p2m pages pool for guests, introducing the
necessary data structures and helpers.

This is implemented by:

- Adding a `struct paging_domain`, which contains a freelist, a
counter variable and a spinlock, to `struct arch_domain`, to track
the free p2m pages and the total number of p2m pages in the p2m
pages pool.

- Adding a helper `p2m_get_allocation` to get the p2m pool size.

- Adding a helper `p2m_set_allocation` to set the p2m pages pool
size. This helper should be called before allocating memory for
a guest.

- Adding a helper `p2m_teardown_allocation` to free the p2m pages
pool. This helper should be called during xl domain destruction.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c           | 88 ++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/domain.h | 10 ++++
 xen/include/asm-arm/p2m.h    |  4 ++
 3 files changed, 102 insertions(+)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 4ad3e0606e9c..6883d8627702 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+/* Return the size of the pool, rounded up to the nearest MB */
+unsigned int p2m_get_allocation(struct domain *d)
+{
+    unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
+
+    return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
+}
+
+/*
+ * Set the pool of pages to the required number of pages.
+ * Returns 0 for success, non-zero for failure.
+ * Call with d->arch.paging.lock held.
+ */
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
+{
+    struct page_info *pg;
+
+    ASSERT(spin_is_locked(&d->arch.paging.lock));
+
+    for ( ; ; )
+    {
+        if ( d->arch.paging.p2m_total_pages < pages )
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_page(NULL, 0);
+            if ( pg == NULL )
+            {
+                printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
+                return -ENOMEM;
+            }
+            ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                d->arch.paging.p2m_total_pages + 1;
+            page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+        }
+        else if ( d->arch.paging.p2m_total_pages > pages )
+        {
+            /* Need to return memory to domheap */
+            pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+            if( pg )
+            {
+                ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                    d->arch.paging.p2m_total_pages - 1;
+                free_domheap_page(pg);
+            }
+            else
+            {
+                printk(XENLOG_ERR
+                       "Failed to free P2M pages, P2M freelist is empty.\n");
+                return -ENOMEM;
+            }
+        }
+        else
+            break;
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && general_preempt_check() )
+        {
+            *preempted = true;
+            return -ERESTART;
+        }
+    }
+
+    return 0;
+}
+
+int p2m_teardown_allocation(struct domain *d)
+{
+    int ret = 0;
+    bool preempted = false;
+
+    spin_lock(&d->arch.paging.lock);
+    if ( d->arch.paging.p2m_total_pages != 0 )
+    {
+        ret = p2m_set_allocation(d, 0, &preempted);
+        if ( preempted )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return -ERESTART;
+        }
+        ASSERT(d->arch.paging.p2m_total_pages == 0);
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return ret;
+}
+
 /* Unlock the flush and do a P2M TLB flush if necessary */
 void p2m_write_unlock(struct p2m_domain *p2m)
 {
@@ -1602,7 +1688,9 @@ int p2m_init(struct domain *d)
     unsigned int cpu;
 
     rwlock_init(&p2m->lock);
+    spin_lock_init(&d->arch.paging.lock);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
 
     p2m->vmid = INVALID_VMID;
 
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index bb0a6adbe00b..1d8935778f3b 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -40,6 +40,14 @@ struct vtimer {
     uint64_t cval;
 };
 
+struct paging_domain {
+    spinlock_t lock;
+    /* Free P2M pages from the pre-allocated P2M pool */
+    struct page_list_head p2m_freelist;
+    /* Number of pages from the pre-allocated P2M pool */
+    unsigned long p2m_total_pages;
+};
+
 struct arch_domain
 {
 #ifdef CONFIG_ARM_64
@@ -51,6 +59,8 @@ struct arch_domain
 
     struct hvm_domain hvm;
 
+    struct paging_domain paging;
+
     struct vmmio vmmio;
 
     /* Continuable domain_relinquish_resources(). */
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index 3a2d51b35d71..18675b234570 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n);
 /* Print debugging/statistial info about a domain's p2m */
 void p2m_dump_info(struct domain *d);
 
+unsigned int p2m_get_allocation(struct domain *d);
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
+int p2m_teardown_allocation(struct domain *d);
+
 static inline void p2m_write_lock(struct p2m_domain *p2m)
 {
     write_lock(&p2m->lock);
-- 
2.37.1
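
The rounding done by p2m_get_allocation() above can be checked in
isolation; the following is an editorial sketch (not part of the
signed patches) assuming 4KB pages, where 1 << (20 - PAGE_SHIFT) = 256
pages make up one MiB:

    /* Sketch of the p2m_get_allocation() rounding with 4KB pages. */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define ROUNDUP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
        unsigned long nr_pages = 300; /* hypothetical pool size */
        unsigned long mb = ROUNDUP(nr_pages, 1UL << (20 - PAGE_SHIFT))
                           >> (20 - PAGE_SHIFT);

        /* 300 pages round up to 512, i.e. a 2 MB pool is reported. */
        printf("%lu pages -> %lu MB\n", nr_pages, mb);
        return 0;
    }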

From 6bb69d78b12d5b5f308be40b497f56c2d45fa7ac Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:29 +0000
Subject: [PATCH 3/4] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm

This commit implements the `XEN_DOMCTL_shadow_op` support in Xen
for Arm. The p2m pages pool size for xl guests is supposed to be
determined by `XEN_DOMCTL_shadow_op`. Hence, this commit:

- Introduces a function `p2m_domctl` and implements the subops
`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.

- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.

This enables setting the shadow memory pool size when creating a
guest from xl and getting the shadow memory pool size from Xen.

Note that the `XEN_DOMCTL_shadow_op` added in this commit is only
a dummy op; the functionality of setting/getting the p2m memory pool
size for xl guests will be added in the following commits.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 tools/libs/light/libxl_arm.c | 12 ++++++++++++
 xen/arch/arm/domctl.c        | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
index d59b464192c2..d21f614ed788 100644
--- a/tools/libs/light/libxl_arm.c
+++ b/tools/libs/light/libxl_arm.c
@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc,
                               libxl__domain_build_state *state,
                               uint32_t domid)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
+
+    int r = xc_shadow_control(ctx->xch, domid,
+                              XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
+                              &shadow_mb, 0);
+    if (r) {
+        LOGED(ERROR, domid,
+              "Failed to set %u MiB shadow allocation", shadow_mb);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }
 
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index a8c48b0beaab..a049bc7f3e52 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -45,11 +45,43 @@ static int handle_vuart_init(struct domain *d,
     return rc;
 }
 
+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+                       XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    if ( unlikely(d == current->domain) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
+        return -EINVAL;
+    }
+
+    if ( unlikely(d->is_dying) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
+               d->domain_id);
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+        return 0;
+    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+        return 0;
+    default:
+    {
+        printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
+        return -EINVAL;
+    }
+    }
+}
+
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+        return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
     case XEN_DOMCTL_cacheflush:
     {
         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
-- 
2.37.1
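
For reference, a toolstack can reach the GET_ALLOCATION subop through
the same five-argument xc_shadow_control() call the patch above uses
for SET_ALLOCATION. The sketch below is editorial (not part of the
signed patches) and abbreviates error handling; get_p2m_pool_mb() is a
hypothetical helper name:

    /* Editorial sketch: query a domain's P2M pool size. */
    #include <stdio.h>
    #include <xenctrl.h>

    int get_p2m_pool_mb(xc_interface *xch, uint32_t domid)
    {
        unsigned int mb = 0;

        if ( xc_shadow_control(xch, domid,
                               XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION,
                               &mb, 0) )
            return -1;

        printf("domain %u: P2M pool is %u MiB\n", (unsigned int)domid, mb);
        return 0;
    }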

From ba3da27eac6326476b9f1430e204bae510863b97 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:30 +0000
Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool

This commit sets up/tears down the p2m pages pool for non-privileged
Arm guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`.

- For dom0, P2M pages should come directly from the heap instead of the
p2m pool, so that the kernel may take advantage of the extended regions.

- For xl guests, the p2m pool is set up via `XEN_DOMCTL_shadow_op` and
destroyed in `domain_relinquish_resources`. Note that
domctl->u.shadow_op.mb is updated with the new size when setting the
p2m pool.

- For dom0less domUs, the p2m pool is set up before allocating memory
during domain creation. Users can specify the p2m pool size via the
`xen,domain-p2m-mem-mb` dts property.

To actually allocate/free pages from the p2m pool, this commit adds
two helper functions, namely `p2m_alloc_page` and `p2m_free_page`, for
the P2M pages of `struct p2m_domain`. By replacing `alloc_domheap_page`
and `free_domheap_page` with these two helpers, p2m pages are added
to/removed from the p2m pool's free list rather than the heap.

Since the page returned by `p2m_alloc_page` is cleaned, take the
opportunity to remove the redundant `clean_page` in `p2m_create_table`.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 docs/misc/arm/device-tree/booting.txt |  8 ++++
 xen/arch/arm/domain.c                 |  6 +++
 xen/arch/arm/domain_build.c           | 29 ++++++++++++++
 xen/arch/arm/domctl.c                 | 23 ++++++++++-
 xen/arch/arm/p2m.c                    | 57 +++++++++++++++++++++++++--
 5 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 5243bc7fd344..470c9491a781 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -164,6 +164,14 @@ with the following properties:
     Both #address-cells and #size-cells need to be specified because
     both sub-nodes (described shortly) have reg properties.
 
+- xen,domain-p2m-mem-mb
+
+    Optional. A 32-bit integer specifying the amount of megabytes of RAM
+    used for the domain P2M pool. This is in-sync with the shadow_memory
+    option in xl.cfg. Leaving this field empty in device tree will lead to
+    the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB
+    per MB of guest RAM plus 512KB for guest extended regions.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 223ec9694df1..a5ffd952ecd0 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -985,6 +985,7 @@ enum {
     PROG_page,
     PROG_mapping,
     PROG_p2m,
+    PROG_p2m_pool,
     PROG_done,
 };
 
@@ -1044,6 +1045,11 @@ int domain_relinquish_resources(struct domain *d)
         if ( ret )
             return ret;
 
+    PROGRESS(p2m_pool):
+        ret = p2m_teardown_allocation(d);
+        if( ret )
+            return ret;
+
     PROGRESS(done):
         break;
 
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 26c13429488d..df0ec84f034c 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -2333,6 +2333,21 @@ static void __init find_gnttab_region(struct domain *d,
            kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
 }
 
+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
+                                             unsigned int smp_cpus)
+{
+    /*
+     * Keep in sync with libxl__get_required_paging_memory().
+     * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
+     * plus 128 pages to cover extended regions.
+     */
+    unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
+
+    BUILD_BUG_ON(PAGE_SIZE != SZ_4K);
+
+    return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
+}
+
 static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
 {
     unsigned int i;
@@ -2424,6 +2439,8 @@ static int __init construct_domU(struct domain *d,
     struct kernel_info kinfo = {};
     int rc;
     u64 mem;
+    u32 p2m_mem_mb;
+    unsigned long p2m_pages;
 
     rc = dt_property_read_u64(node, "memory", &mem);
     if ( !rc )
@@ -2433,6 +2450,18 @@ static int __init construct_domU(struct domain *d,
     }
     kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
 
+    rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
+    /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
+    p2m_pages = rc ?
+                p2m_mem_mb << (20 - PAGE_SHIFT) :
+                domain_p2m_pages(mem, d->max_vcpus);
+
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, p2m_pages, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc != 0 )
+        return rc;
+
     printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index a049bc7f3e52..4ab5ed4ab24d 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -48,6 +48,9 @@ static int handle_vuart_init(struct domain *d,
 static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
                        XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long rc;
+    bool preempted = false;
+
     if ( unlikely(d == current->domain) )
     {
         printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
@@ -64,9 +67,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
-        return 0;
+    {
+        /* Allow and handle preemption */
+        spin_lock(&d->arch.paging.lock);
+        rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
+        spin_unlock(&d->arch.paging.lock);
+
+        if ( preempted )
+            /* Not finished. Set up to re-run the call. */
+            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+                                               u_domctl);
+        else
+            /* Finished. Return the new allocation. */
+            sc->mb = p2m_get_allocation(d);
+
+        return rc;
+    }
     case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+    {
+        sc->mb = p2m_get_allocation(d);
         return 0;
+    }
     default:
     {
         printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 6883d8627702..c1055ff2a745 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For hardware domain, there should be no limit in the number of pages that
+     * can be allocated, so that the kernel may take advantage of the extended
+     * regions. Hence, allocate p2m pages for hardware domains from heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--;
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock);
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        d->arch.paging.p2m_total_pages++;
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 unsigned int p2m_get_allocation(struct domain *d)
 {
@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
 
     ASSERT(!p2m_is_valid(*entry));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( page == NULL )
         return -ENOMEM;
 
@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
     pg = mfn_to_page(mfn);
 
     page_list_del(pg, &p2m->pages);
-    free_domheap_page(pg);
+    p2m_free_page(p2m->domain, pg);
 }
 
 static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
     ASSERT(level < target);
     ASSERT(p2m_is_superpage(*entry, level));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( !page )
         return false;
 
@@ -1644,7 +1692,7 @@ int p2m_teardown(struct domain *d)
 
     while ( (pg = page_list_remove_head(&p2m->pages)) )
     {
-        free_domheap_page(pg);
+        p2m_free_page(p2m->domain, pg);
         count++;
         /* Arbitrarily preempt every 512 iterations */
         if ( !(count % 512) && hypercall_preempt_check() )
@@ -1668,6 +1716,7 @@ void p2m_final_teardown(struct domain *d)
         return;
 
     ASSERT(page_list_empty(&p2m->pages));
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
 
     if ( p2m->root )
         free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
-- 
2.37.1
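
As a worked example of domain_p2m_pages() above (editorial sketch,
illustrative numbers, not part of the signed patches): a dom0less domU
with 2 vCPUs and 1 GiB of RAM and no xen,domain-p2m-mem-mb property
gets 4 * (256 * 2 + 1024 + 128) = 6656 KB, which rounds up to 7 MiB,
i.e. 1792 pages of 4KB:

    /* Sketch mirroring domain_p2m_pages() from the patch above;
     * assumes 4KB pages (PAGE_SHIFT == 12). */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    static unsigned long domain_p2m_pages(unsigned long maxmem_kb,
                                          unsigned int smp_cpus)
    {
        /* 256 pages per vcpu, 1 page per MiB of RAM, 128 pages extra. */
        unsigned long memkb = 4 * (256 * smp_cpus + maxmem_kb / 1024 + 128);

        return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
    }

    int main(void)
    {
        /* 2 vCPUs, 1 GiB RAM -> 6656 KB -> 7 MiB -> 1792 pages. */
        printf("%lu pages\n", domain_p2m_pages(1024UL * 1024, 2));
        return 0;
    }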

From bd4a7db4001364fd03a80a2e73b81c46aaa44e9c Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 22 Aug 2022 01:35:09 +0000
Subject: [PATCH 1/4] libxl, docs: Use arch-specific default paging memory

The default paging memory (described in the `shadow_memory` entry in xl
config) in libxl is used to determine the memory pool size for xl
guests. Currently this size is only used for x86, and contains a part
of RAM to shadow the resident processes. Since there are no shadow-mode
guests on Arm, the part of RAM to shadow the resident processes is not
necessary. Therefore, this commit splits the function
`libxl_get_required_shadow_memory()` into arch-specific helpers and
renames the helper to `libxl__arch_get_required_paging_memory()`.

On x86, this helper returns the same value as the original
`libxl_get_required_shadow_memory()`, so no functional change is
intended.

On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
for the P2M map, plus an additional 512KB.

Also update the xl.cfg documentation to add the Arm documentation
matching the code changes, and correct the comment style to follow
the Xen coding style.

This is part of CVE-2022-33747 / XSA-409.

Suggested-by: Julien Grall <jgrall@amazon.com>
Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
---
 docs/man/xl.cfg.5.pod.in       |  5 +++++
 tools/libs/light/libxl_arch.h  |  4 ++++
 tools/libs/light/libxl_arm.c   | 14 ++++++++++++++
 tools/libs/light/libxl_utils.c |  9 ++-------
 tools/libs/light/libxl_x86.c   | 13 +++++++++++++
 5 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
index b98d1613987e..eda1e77ebd06 100644
--- a/docs/man/xl.cfg.5.pod.in
+++ b/docs/man/xl.cfg.5.pod.in
@@ -1768,6 +1768,11 @@ are not using hardware assisted paging (i.e. you are using shadow
 mode) and your guest workload consists of a very large number of
 similar processes then increasing this value may improve performance.
 
+On Arm, this field is used to determine the size of the guest P2M pages
+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for
+the P2M map and additional 512KB for extended regions. Users should
+adjust this value if bigger P2M pool size is needed.
+
 =back
 
 =head3 Processor and Platform Features
diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h
index 1522ecb97f72..5a060c2c3033 100644
--- a/tools/libs/light/libxl_arch.h
+++ b/tools/libs/light/libxl_arch.h
@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
                                       libxl_domain_config *dst,
                                       const libxl_domain_config *src);
 
+_hidden
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus);
+
 #if defined(__i386__) || defined(__x86_64__)
 
 #define LAPIC_BASE_ADDRESS  0xfee00000
diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
index eef1de093914..73a95e83af24 100644
--- a/tools/libs/light/libxl_arm.c
+++ b/tools/libs/light/libxl_arm.c
@@ -154,6 +154,20 @@ out:
     return rc;
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * plus 1 page per MiB of extended region. This default value is 128 MiB
+     * which should be enough for domains that are not running backend.
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + maxmem_kb / 1024 + 128);
+}
+
 static struct arch_info {
     const char *guest_type;
     const char *timer_compat;
diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c
index 4699c4a0a36f..e276c0ee9cc3 100644
--- a/tools/libs/light/libxl_utils.c
+++ b/tools/libs/light/libxl_utils.c
@@ -18,6 +18,7 @@
 #include <ctype.h>
 
 #include "libxl_internal.h"
+#include "libxl_arch.h"
 #include "_paths.h"
 
 #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE
@@ -39,13 +40,7 @@ char *libxl_basename(const char *name)
 
 unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus)
 {
-    /* 256 pages (1MB) per vcpu,
-       plus 1 page per MiB of RAM for the P2M map,
-       plus 1 page per MiB of RAM to shadow the resident processes.
-       This is higher than the minimum that Xen would allocate if no value
-       were given (but the Xen minimum is for safety, not performance).
-     */
-    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+    return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus);
 }
 
 char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid)
diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c
index 1feadebb1852..51362893cf98 100644
--- a/tools/libs/light/libxl_x86.c
+++ b/tools/libs/light/libxl_x86.c
@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
                     libxl_defbool_val(src->b_info.arch_x86.msr_relaxed));
 }
 
+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb,
+                                                     unsigned int smp_cpus)
+{
+    /*
+     * 256 pages (1MB) per vcpu,
+     * plus 1 page per MiB of RAM for the P2M map,
+     * plus 1 page per MiB of RAM to shadow the resident processes.
+     * This is higher than the minimum that Xen would allocate if no value
+     * were given (but the Xen minimum is for safety, not performance).
+     */
+    return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024));
+}
+
 /*
  * Local variables:
  * mode: C
-- 
2.37.1
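
Editorial note: relative to the variant of this patch in the preceding
series, the Arm helper above adds 128 pages (512KB) for extended
regions, so the same illustrative 4 vCPU / 2 GiB guest now gets
4 * (256 * 4 + 2048 + 128) = 12800 KB instead of 12288 KB.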

From 419a4bbc20cf7c5d7d9dedae59fb8049922e6a2c Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:28 +0000
Subject: [PATCH 2/4] xen/arm: Construct the P2M pages pool for guests

This commit constructs the p2m pages pool for guests, introducing the
necessary data structures and helpers.

This is implemented by:

- Adding a `struct paging_domain`, which contains a freelist, a
counter variable and a spinlock, to `struct arch_domain`, to track
the free p2m pages and the total number of p2m pages in the p2m
pages pool.

- Adding a helper `p2m_get_allocation` to get the p2m pool size.

- Adding a helper `p2m_set_allocation` to set the p2m pages pool
size. This helper should be called before allocating memory for
a guest.

- Adding a helper `p2m_teardown_allocation` to free the p2m pages
pool. This helper should be called during xl domain destruction.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/p2m.c           | 88 ++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/domain.h | 10 ++++
 xen/include/asm-arm/p2m.h    |  4 ++
 3 files changed, 102 insertions(+)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 3bcd1e897e88..79f3d37f5230 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+/* Return the size of the pool, rounded up to the nearest MB */
+unsigned int p2m_get_allocation(struct domain *d)
+{
+    unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
+
+    return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
+}
+
+/*
+ * Set the pool of pages to the required number of pages.
+ * Returns 0 for success, non-zero for failure.
+ * Call with d->arch.paging.lock held.
+ */
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
+{
+    struct page_info *pg;
+
+    ASSERT(spin_is_locked(&d->arch.paging.lock));
+
+    for ( ; ; )
+    {
+        if ( d->arch.paging.p2m_total_pages < pages )
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_page(NULL, 0);
+            if ( pg == NULL )
+            {
+                printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
+                return -ENOMEM;
+            }
+            ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                d->arch.paging.p2m_total_pages + 1;
+            page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+        }
+        else if ( d->arch.paging.p2m_total_pages > pages )
+        {
+            /* Need to return memory to domheap */
+            pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+            if( pg )
+            {
+                ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                    d->arch.paging.p2m_total_pages - 1;
+                free_domheap_page(pg);
+            }
+            else
+            {
+                printk(XENLOG_ERR
+                       "Failed to free P2M pages, P2M freelist is empty.\n");
+                return -ENOMEM;
+            }
+        }
+        else
+            break;
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && general_preempt_check() )
+        {
+            *preempted = true;
+            return -ERESTART;
+        }
+    }
+
+    return 0;
+}
+
+int p2m_teardown_allocation(struct domain *d)
+{
+    int ret = 0;
+    bool preempted = false;
+
+    spin_lock(&d->arch.paging.lock);
+    if ( d->arch.paging.p2m_total_pages != 0 )
+    {
+        ret = p2m_set_allocation(d, 0, &preempted);
+        if ( preempted )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return -ERESTART;
+        }
+        ASSERT(d->arch.paging.p2m_total_pages == 0);
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return ret;
+}
+
 /* Unlock the flush and do a P2M TLB flush if necessary */
 void p2m_write_unlock(struct p2m_domain *p2m)
 {
@@ -1599,7 +1685,9 @@ int p2m_init(struct domain *d)
     unsigned int cpu;
 
     rwlock_init(&p2m->lock);
+    spin_lock_init(&d->arch.paging.lock);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
 
     p2m->vmid = INVALID_VMID;
 
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 9b3647587a04..c90daa65afa7 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -40,6 +40,14 @@ struct vtimer {
     uint64_t cval;
 };
 
+struct paging_domain {
+    spinlock_t lock;
+    /* Free P2M pages from the pre-allocated P2M pool */
+    struct page_list_head p2m_freelist;
+    /* Number of pages from the pre-allocated P2M pool */
+    unsigned long p2m_total_pages;
+};
+
 struct arch_domain
 {
 #ifdef CONFIG_ARM_64
@@ -51,6 +59,8 @@ struct arch_domain
 
     struct hvm_domain hvm;
 
+    struct paging_domain paging;
+
     struct vmmio vmmio;
 
     /* Continuable domain_relinquish_resources(). */
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index b3ba83283e11..c9598740bd02 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n);
 /* Print debugging/statistial info about a domain's p2m */
 void p2m_dump_info(struct domain *d);
 
+unsigned int p2m_get_allocation(struct domain *d);
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
+int p2m_teardown_allocation(struct domain *d);
+
 static inline void p2m_write_lock(struct p2m_domain *p2m)
 {
     write_lock(&p2m->lock);
-- 
2.37.1

From 332a9979d4dd0b047aa16db201c50fcedbd56743 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:29 +0000
Subject: [PATCH 3/4] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm

This commit implements the `XEN_DOMCTL_shadow_op` support in Xen
for Arm. The p2m pages pool size for xl guests is supposed to be
determined by `XEN_DOMCTL_shadow_op`. Hence, this commit:

- Introduces a function `p2m_domctl` and implements the subops
`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.

- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.

This enables setting the shadow memory pool size when creating a
guest from xl and getting the shadow memory pool size from Xen.

Note that the `XEN_DOMCTL_shadow_op` added in this commit is only
a dummy op; the functionality of setting/getting the p2m memory pool
size for xl guests will be added in the following commits.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 tools/libs/light/libxl_arm.c | 12 ++++++++++++
 xen/arch/arm/domctl.c        | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
index 73a95e83af24..22a0c561bbc6 100644
--- a/tools/libs/light/libxl_arm.c
+++ b/tools/libs/light/libxl_arm.c
@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc,
                               libxl__domain_build_state *state,
                               uint32_t domid)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
+
+    int r = xc_shadow_control(ctx->xch, domid,
+                              XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
+                              &shadow_mb, 0);
+    if (r) {
+        LOGED(ERROR, domid,
+              "Failed to set %u MiB shadow allocation", shadow_mb);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }
 
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 1baf25c3d98b..9bf72e693019 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -47,11 +47,43 @@ static int handle_vuart_init(struct domain *d,
     return rc;
 }
 
+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+                       XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    if ( unlikely(d == current->domain) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
+        return -EINVAL;
+    }
+
+    if ( unlikely(d->is_dying) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
+               d->domain_id);
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+        return 0;
+    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+        return 0;
+    default:
+    {
+        printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
+        return -EINVAL;
+    }
+    }
+}
+
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+        return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
     case XEN_DOMCTL_cacheflush:
     {
         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
-- 
2.37.1

From 39664d9ee041f96e9c7ee131ed8ef72a4d19c9f8 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:30 +0000
Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool

This commit sets up/tears down the p2m pages pool for non-privileged
Arm guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`.

- For dom0, P2M pages should come directly from the heap instead of the
p2m pool, so that the kernel may take advantage of the extended regions.

- For xl guests, the p2m pool is set up via `XEN_DOMCTL_shadow_op` and
destroyed in `domain_relinquish_resources`. Note that
domctl->u.shadow_op.mb is updated with the new size when setting the
p2m pool.

- For dom0less domUs, the p2m pool is set up before allocating memory
during domain creation. Users can specify the p2m pool size via the
`xen,domain-p2m-mem-mb` dts property.

To actually allocate/free pages from the p2m pool, this commit adds
two helper functions, namely `p2m_alloc_page` and `p2m_free_page`, for
the P2M pages of `struct p2m_domain`. By replacing `alloc_domheap_page`
and `free_domheap_page` with these two helpers, p2m pages are added
to/removed from the p2m pool's free list rather than the heap.

Since the page returned by `p2m_alloc_page` is cleaned, take the
opportunity to remove the redundant `clean_page` in `p2m_create_table`.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 docs/misc/arm/device-tree/booting.txt |  8 ++++
 xen/arch/arm/domain.c                 |  6 +++
 xen/arch/arm/domain_build.c           | 29 ++++++++++++++
 xen/arch/arm/domctl.c                 | 23 ++++++++++-
 xen/arch/arm/p2m.c                    | 57 +++++++++++++++++++++++++--
 5 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 71895663a4de..d92ccc56ffe0 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -182,6 +182,14 @@ with the following properties:
     Both #address-cells and #size-cells need to be specified because
     both sub-nodes (described shortly) have reg properties.
 
+- xen,domain-p2m-mem-mb
+
+    Optional. A 32-bit integer specifying the amount of megabytes of RAM
+    used for the domain P2M pool. This is in-sync with the shadow_memory
+    option in xl.cfg. Leaving this field empty in device tree will lead to
+    the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB
+    per MB of guest RAM plus 512KB for guest extended regions.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 2694c39127c5..a818f33a1afa 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -997,6 +997,7 @@ enum {
     PROG_page,
     PROG_mapping,
     PROG_p2m,
+    PROG_p2m_pool,
     PROG_done,
 };
 
@@ -1062,6 +1063,11 @@ int domain_relinquish_resources(struct domain *d)
         if ( ret )
             return ret;
 
+    PROGRESS(p2m_pool):
+        ret = p2m_teardown_allocation(d);
+        if( ret )
+            return ret;
+
     PROGRESS(done):
         break;
 
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index d02bacbcd1ed..8aec3755ca5d 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -2833,6 +2833,21 @@ static void __init find_gnttab_region(struct domain *d,
            kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
 }
 
+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
+                                             unsigned int smp_cpus)
+{
+    /*
+     * Keep in sync with libxl__get_required_paging_memory().
+     * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
+     * plus 128 pages to cover extended regions.
+     */
+    unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
+
+    BUILD_BUG_ON(PAGE_SIZE != SZ_4K);
+
+    return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
+}
+
 static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
 {
     unsigned int i;
@@ -2924,6 +2939,8 @@ static int __init construct_domU(struct domain *d,
     struct kernel_info kinfo = {};
     int rc;
     u64 mem;
+    u32 p2m_mem_mb;
+    unsigned long p2m_pages;
 
     rc = dt_property_read_u64(node, "memory", &mem);
     if ( !rc )
@@ -2933,6 +2950,18 @@ static int __init construct_domU(struct domain *d,
     }
     kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
 
+    rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
+    /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
+    p2m_pages = rc ?
+                p2m_mem_mb << (20 - PAGE_SHIFT) :
+                domain_p2m_pages(mem, d->max_vcpus);
+
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, p2m_pages, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc != 0 )
+        return rc;
+
     printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9bf72e693019..c8fdeb124084 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d,
 static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
                        XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long rc;
+    bool preempted = false;
+
     if ( unlikely(d == current->domain) )
     {
         printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
-        return 0;
+    {
+        /* Allow and handle preemption */
+        spin_lock(&d->arch.paging.lock);
+        rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
+        spin_unlock(&d->arch.paging.lock);
+
+        if ( preempted )
+            /* Not finished. Set up to re-run the call. */
+            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+                                               u_domctl);
+        else
+            /* Finished. Return the new allocation. */
+            sc->mb = p2m_get_allocation(d);
+
+        return rc;
+    }
     case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+    {
+        sc->mb = p2m_get_allocation(d);
         return 0;
+    }
     default:
     {
         printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 79f3d37f5230..1bf9cbeb53cf 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For hardware domain, there should be no limit in the number of pages that
+     * can be allocated, so that the kernel may take advantage of the extended
+     * regions. Hence, allocate p2m pages for hardware domains from heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--;
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock);
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        d->arch.paging.p2m_total_pages++;
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 unsigned int p2m_get_allocation(struct domain *d)
 {
@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
 
     ASSERT(!p2m_is_valid(*entry));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( page == NULL )
         return -ENOMEM;
 
@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
     pg = mfn_to_page(mfn);
 
     page_list_del(pg, &p2m->pages);
-    free_domheap_page(pg);
+    p2m_free_page(p2m->domain, pg);
 }
 
 static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
     ASSERT(level < target);
     ASSERT(p2m_is_superpage(*entry, level));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( !page )
         return false;
 
@@ -1641,7 +1689,7 @@ int p2m_teardown(struct domain *d)
 
     while ( (pg = page_list_remove_head(&p2m->pages)) )
     {
-        free_domheap_page(pg);
+        p2m_free_page(p2m->domain, pg);
         count++;
         /* Arbitrarily preempt every 512 iterations */
         if ( !(count % 512) && hypercall_preempt_check() )
@@ -1665,6 +1713,7 @@ void p2m_final_teardown(struct domain *d)
         return;
 
     ASSERT(page_list_empty(&p2m->pages));
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
 
     if ( p2m->root )
         free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
-- 
2.37.1

From d0f3f329eb3b652acc354265ca4d4042807724b5 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:27 +0000
Subject: [PATCH 1/4] libxl, docs: Add per-arch extra default paging memory

This commit adds a per-arch macro, `EXTRA_DEFAULT_PAGING_MEM_MB`, which
is added to the default paging memory size in order to cover the p2m
pool for the extended regions of an xl-based guest on Arm.

For Arm, the extra default paging memory is 128MB.
For x86, the extra default paging memory is zero, since there
are no extended regions on x86.

Also update the xl.cfg documentation to add the Arm documentation
matching the code changes.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Julien Grall <jgrall@amazon.com>
---
 docs/man/xl.cfg.5.pod.in        |  5 +++++
 tools/libs/light/libxl_arch.h   | 11 +++++++++++
 tools/libs/light/libxl_create.c |  7 ++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
index 6d98d73d76c0..70de1033550f 100644
--- a/docs/man/xl.cfg.5.pod.in
+++ b/docs/man/xl.cfg.5.pod.in
@@ -2725,6 +2725,11 @@ are not using hardware assisted paging (i.e. you are using shadow
 mode) and your guest workload consists of a very large number of
 similar processes then increasing this value may improve performance.
 
+On Arm, this field is used to determine the size of the guest P2M pages
+pool, and the default value is the same as x86 HAP mode, plus 512KB to
+cover the extended regions. Users should adjust this value if bigger
+P2M pool size is needed.
+
 =back
 
 =head2 Device-Model Options
diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h
index 03b89929e68c..247cca130fc3 100644
--- a/tools/libs/light/libxl_arch.h
+++ b/tools/libs/light/libxl_arch.h
@@ -99,10 +99,21 @@ void libxl__arch_update_domain_config(libxl__gc *gc,
 
 #define LAPIC_BASE_ADDRESS  0xfee00000
 #define ACPI_INFO_PHYSICAL_ADDRESS 0xfc000000
+#define EXTRA_DEFAULT_PAGING_MEM_MB 0
 
 int libxl__dom_load_acpi(libxl__gc *gc,
                          const libxl_domain_build_info *b_info,
                          struct xc_dom_image *dom);
+
+#else
+
+/*
+ * 128MB extra default paging memory on Arm for extended regions. This
+ * value is normally enough for domains that are not running backend.
+ * See the `shadow_memory` in xl.cfg documentation for more information.
+ */
+#define EXTRA_DEFAULT_PAGING_MEM_MB 128
+
 #endif
 
 #endif
diff --git a/tools/libs/light/libxl_create.c b/tools/libs/light/libxl_create.c
index b9dd2deedf13..612eacfc7fac 100644
--- a/tools/libs/light/libxl_create.c
+++ b/tools/libs/light/libxl_create.c
@@ -1035,12 +1035,17 @@ unsigned long libxl__get_required_paging_memory(unsigned long maxmem_kb,
      * plus 1 page per MiB of RAM for the P2M map (for non-PV guests),
      * plus 1 page per MiB of RAM to shadow the resident processes (for shadow
      * mode guests).
+     * plus 1 page per MiB of RAM for the architecture specific
+     * EXTRA_DEFAULT_PAGING_MEM_MB. On x86, this value is zero. On Arm, this
+     * value is 128 MiB to cover domain extended regions (enough for domains
+     * that are not running backend).
      * This is higher than the minimum that Xen would allocate if no value
      * were given (but the Xen minimum is for safety, not performance).
      */
     return 4 * (256 * smp_cpus +
                 ((type != LIBXL_DOMAIN_TYPE_PV) + !hap) *
-                (maxmem_kb / 1024));
+                (maxmem_kb / 1024) +
+                EXTRA_DEFAULT_PAGING_MEM_MB);
 }
 
 static unsigned long libxl__get_required_iommu_memory(unsigned long maxmem_kb)
-- 
2.37.1
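
To see how the macro feeds into the common helper, the following
editorial sketch (not part of the signed patches) specialises
libxl__get_required_paging_memory() for a non-PV guest; the
4 vCPU / 2 GiB figures are illustrative. With HAP enabled, the `!hap`
shadow term vanishes, so Arm (extra = 128) yields 12800 KB while x86
(extra = 0) yields 12288 KB:

    /* Sketch of libxl__get_required_paging_memory() for a non-PV guest. */
    #include <stdbool.h>
    #include <stdio.h>

    static unsigned long required_paging_kb(unsigned long maxmem_kb,
                                            unsigned int smp_cpus,
                                            bool hap,
                                            unsigned long extra_mb)
    {
        /* Non-PV: the P2M term is always present, the shadow term only
         * without HAP; extra_mb is EXTRA_DEFAULT_PAGING_MEM_MB. */
        return 4 * (256 * smp_cpus +
                    (1 + !hap) * (maxmem_kb / 1024) +
                    extra_mb);
    }

    int main(void)
    {
        unsigned long maxmem_kb = 2UL * 1024 * 1024; /* 2 GiB */

        printf("Arm: %lu KB\n", required_paging_kb(maxmem_kb, 4, true, 128));
        printf("x86/HAP: %lu KB\n", required_paging_kb(maxmem_kb, 4, true, 0));
        return 0;
    }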

From 043148a877fda3dc8d2cbdc02917b02c83eea727 Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:28 +0000
Subject: [PATCH 2/4] xen/arm: Construct the P2M pages pool for guests

This commit constructs the p2m pages pool for guests, introducing the
necessary data structures and helpers.

This is implemented by:

- Adding a `struct paging_domain`, which contains a freelist, a
counter variable and a spinlock, to `struct arch_domain`, to track
the free p2m pages and the total number of p2m pages in the p2m
pages pool.

- Adding a helper `p2m_get_allocation` to get the p2m pool size.

- Adding a helper `p2m_set_allocation` to set the p2m pages pool
size. This helper should be called before allocating memory for
a guest.

- Adding a helper `p2m_teardown_allocation` to free the p2m pages
pool. This helper should be called during xl domain destruction.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 xen/arch/arm/include/asm/domain.h | 10 ++++
 xen/arch/arm/include/asm/p2m.h    |  4 ++
 xen/arch/arm/p2m.c                | 88 +++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/xen/arch/arm/include/asm/domain.h b/xen/arch/arm/include/asm/domain.h
index cd9ce19b4b41..df56c1ddd429 100644
--- a/xen/arch/arm/include/asm/domain.h
+++ b/xen/arch/arm/include/asm/domain.h
@@ -53,6 +53,14 @@ struct vtimer {
     uint64_t cval;
 };
 
+struct paging_domain {
+    spinlock_t lock;
+    /* Free P2M pages from the pre-allocated P2M pool */
+    struct page_list_head p2m_freelist;
+    /* Number of pages from the pre-allocated P2M pool */
+    unsigned long p2m_total_pages;
+};
+
 struct arch_domain
 {
 #ifdef CONFIG_ARM_64
@@ -64,6 +72,8 @@ struct arch_domain
 
     struct hvm_domain hvm;
 
+    struct paging_domain paging;
+
     struct vmmio vmmio;
 
     /* Continuable domain_relinquish_resources(). */
diff --git a/xen/arch/arm/include/asm/p2m.h b/xen/arch/arm/include/asm/p2m.h
index a15ea67f9b48..42bfd548c48b 100644
--- a/xen/arch/arm/include/asm/p2m.h
+++ b/xen/arch/arm/include/asm/p2m.h
@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n);
 /* Print debugging/statistial info about a domain's p2m */
 void p2m_dump_info(struct domain *d);
 
+unsigned int p2m_get_allocation(struct domain *d);
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted);
+int p2m_teardown_allocation(struct domain *d);
+
 static inline void p2m_write_lock(struct p2m_domain *p2m)
 {
     write_lock(&p2m->lock);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index b445f4d7541e..db385fe41023 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -44,6 +44,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+/* Return the size of the pool, rounded up to the nearest MB */
+unsigned int p2m_get_allocation(struct domain *d)
+{
+    unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
+
+    return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
+}
+
+/*
+ * Set the pool of pages to the required number of pages.
+ * Returns 0 for success, non-zero for failure.
+ * Call with d->arch.paging.lock held.
+ */
+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
+{
+    struct page_info *pg;
+
+    ASSERT(spin_is_locked(&d->arch.paging.lock));
+
+    for ( ; ; )
+    {
+        if ( d->arch.paging.p2m_total_pages < pages )
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_page(NULL, 0);
+            if ( pg == NULL )
+            {
+                printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
+                return -ENOMEM;
+            }
+            ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                d->arch.paging.p2m_total_pages + 1;
+            page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+        }
+        else if ( d->arch.paging.p2m_total_pages > pages )
+        {
+            /* Need to return memory to domheap */
+            pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+            if( pg )
+            {
+                ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
+                    d->arch.paging.p2m_total_pages - 1;
+                free_domheap_page(pg);
+            }
+            else
+            {
+                printk(XENLOG_ERR
+                       "Failed to free P2M pages, P2M freelist is empty.\n");
+                return -ENOMEM;
+            }
+        }
+        else
+            break;
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && general_preempt_check() )
+        {
+            *preempted = true;
+            return -ERESTART;
+        }
+    }
+
+    return 0;
+}
+
+int p2m_teardown_allocation(struct domain *d)
+{
+    int ret = 0;
+    bool preempted = false;
+
+    spin_lock(&d->arch.paging.lock);
+    if ( d->arch.paging.p2m_total_pages != 0 )
+    {
+        ret = p2m_set_allocation(d, 0, &preempted);
+        if ( preempted )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return -ERESTART;
+        }
+        ASSERT(d->arch.paging.p2m_total_pages == 0);
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return ret;
+}
+
 /* Unlock the flush and do a P2M TLB flush if necessary */
 void p2m_write_unlock(struct p2m_domain *p2m)
 {
@@ -1623,7 +1709,9 @@ int p2m_init(struct domain *d)
     unsigned int cpu;
 
     rwlock_init(&p2m->lock);
+    spin_lock_init(&d->arch.paging.lock);
     INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
 
     p2m->vmid = INVALID_VMID;
 
-- 
2.37.1
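
For reference (illustration only, not part of the patches), the calling
convention these helpers expect, namely take d->arch.paging.lock, request
the target pool size, and re-issue the operation when preemption is
signalled, can be sketched as below. The wrapper name set_p2m_pool_size
is hypothetical; the locking and preemption handling mirror what the
domctl handler added in the next patch does:

    /*
     * Hypothetical wrapper showing the intended calling pattern for
     * p2m_set_allocation().  The domctl handler in the next patch
     * follows this pattern, re-running the operation via
     * hypercall_create_continuation() when preempted.
     */
    static int set_p2m_pool_size(struct domain *d, unsigned long pages)
    {
        bool preempted = false;
        int rc;

        spin_lock(&d->arch.paging.lock);
        rc = p2m_set_allocation(d, pages, &preempted);
        spin_unlock(&d->arch.paging.lock);

        /* rc is -ERESTART when preempted; the caller re-runs until done. */
        return rc;
    }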

From a563e2c02f9e89c6ece4f6525c10b7be3b515b9b Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:29 +0000
Subject: [PATCH 3/4] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm

This commit implements `XEN_DOMCTL_shadow_op` support in Xen for Arm.
The p2m pages pool size for xl guests is meant to be determined by
`XEN_DOMCTL_shadow_op`. Hence, this commit:

- Introduces a function `p2m_domctl` and implements the subops
`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and
`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`.

- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl.

This enables setting the shadow memory pool size when creating a
guest from xl, and retrieving the shadow memory pool size from Xen.

Note that the `XEN_DOMCTL_shadow_op` added in this commit is only a
dummy op; the functionality of setting/getting the p2m memory pool
size for xl guests will be added in subsequent commits.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 tools/libs/light/libxl_arm.c | 12 ++++++++++++
 xen/arch/arm/domctl.c        | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c
index 1a3ac1646e94..2a5e93c28403 100644
--- a/tools/libs/light/libxl_arm.c
+++ b/tools/libs/light/libxl_arm.c
@@ -209,6 +209,18 @@ int libxl__arch_domain_create(libxl__gc *gc,
                               libxl__domain_build_state *state,
                               uint32_t domid)
 {
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024);
+
+    int r = xc_shadow_control(ctx->xch, domid,
+                              XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
+                              &shadow_mb, 0);
+    if (r) {
+        LOGED(ERROR, domid,
+              "Failed to set %u MiB shadow allocation", shadow_mb);
+        return ERROR_FAIL;
+    }
+
     return 0;
 }
 
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 1baf25c3d98b..9bf72e693019 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -47,11 +47,43 @@ static int handle_vuart_init(struct domain *d,
     return rc;
 }
 
+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
+                       XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+{
+    if ( unlikely(d == current->domain) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
+        return -EINVAL;
+    }
+
+    if ( unlikely(d->is_dying) )
+    {
+        printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n",
+               d->domain_id);
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
+        return 0;
+    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+        return 0;
+    default:
+    {
+        printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
+        return -EINVAL;
+    }
+    }
+}
+
 long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
     switch ( domctl->cmd )
     {
+    case XEN_DOMCTL_shadow_op:
+        return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
     case XEN_DOMCTL_cacheflush:
     {
         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
-- 
2.37.1
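
To illustrate (not part of the patches) how a toolstack consumer would
drive the two subops end-to-end, here is a hypothetical round trip
modelled on the libxl hunk above; set_and_readback_p2m_pool is an
illustrative name:

    #include <xenctrl.h>

    /*
     * Hypothetical toolstack-side round trip: request a pool size in
     * MiB, then read back the value Xen actually recorded (rounded up
     * to the nearest MiB).
     */
    static int set_and_readback_p2m_pool(xc_interface *xch, uint32_t domid,
                                         unsigned int *mb)
    {
        int r = xc_shadow_control(xch, domid,
                                  XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
                                  mb, 0);

        if (r)
            return r;

        return xc_shadow_control(xch, domid,
                                 XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION,
                                 mb, 0);
    }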

From 64a71f1538c657779f74fa3b63226ac3eb95379a Mon Sep 17 00:00:00 2001
From: Henry Wang <Henry.Wang@arm.com>
Date: Mon, 6 Jun 2022 06:17:30 +0000
Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool

This commit sets up and tears down the p2m pages pool for
non-privileged Arm guests by calling `p2m_set_allocation` and
`p2m_teardown_allocation`.

- For dom0, P2M pages should come directly from the heap instead of
the p2m pool, so that the kernel may take advantage of the extended
regions.

- For xl guests, the setting of the p2m pool is called in
`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in
`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is
updated with the new size when setting the p2m pool.

- For dom0less domUs, the p2m pool is set up before allocating memory
during domain creation. Users can specify the p2m pool size via the
`xen,domain-p2m-mem-mb` dts property.

To actually allocate/free pages from the p2m pool, this commit adds
two helper functions, `p2m_alloc_page` and `p2m_free_page`. By
replacing the calls to `alloc_domheap_page` and `free_domheap_page`
with these two helpers, p2m pages are taken from and returned to the
p2m pool rather than the heap.

Since the page returned by `p2m_alloc_page` is already cleaned, take
the opportunity to remove the redundant `clean_page` in
`p2m_create_table`.

This is part of CVE-2022-33747 / XSA-409.

Signed-off-by: Henry Wang <Henry.Wang@arm.com>
Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
---
 docs/misc/arm/device-tree/booting.txt |  8 ++++
 xen/arch/arm/domain.c                 |  6 +++
 xen/arch/arm/domain_build.c           | 29 ++++++++++++++
 xen/arch/arm/domctl.c                 | 23 ++++++++++-
 xen/arch/arm/p2m.c                    | 57 +++++++++++++++++++++++++--
 5 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt
index 98253414b8d1..bf401d3c91c1 100644
--- a/docs/misc/arm/device-tree/booting.txt
+++ b/docs/misc/arm/device-tree/booting.txt
@@ -211,6 +211,14 @@ with the following properties:
     In the future other possible property values might be added to
     enable only selected interfaces.
 
+- xen,domain-p2m-mem-mb
+
+    Optional. A 32-bit integer specifying the number of megabytes of RAM
+    used for the domain P2M pool. This is in sync with the shadow_memory
+    option in xl.cfg. Omitting this property in the device tree leads to
+    the default domain P2M pool size, i.e. 1MB per guest vCPU plus 4KB
+    per MiB of guest RAM plus 512KB for guest extended regions.
+
 Under the "xen,domain" compatible node, one or more sub-nodes are present
 for the DomU kernel and ramdisk.
 
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index e5ae3e71eb20..33ee4f4032a1 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -1005,6 +1005,7 @@ enum {
     PROG_page,
     PROG_mapping,
     PROG_p2m,
+    PROG_p2m_pool,
     PROG_done,
 };
 
@@ -1070,6 +1071,11 @@ int domain_relinquish_resources(struct domain *d)
         if ( ret )
             return ret;
 
+    PROGRESS(p2m_pool):
+        ret = p2m_teardown_allocation(d);
+        if( ret )
+            return ret;
+
     PROGRESS(done):
         break;
 
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 3fd1186b53a8..93200bff5711 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -3079,6 +3079,21 @@ static void __init find_gnttab_region(struct domain *d,
            kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size);
 }
 
+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb,
+                                             unsigned int smp_cpus)
+{
+    /*
+     * Keep in sync with libxl__get_required_paging_memory().
+     * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map,
+     * plus 128 pages to cover extended regions.
+     */
+    unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);
+
+    BUILD_BUG_ON(PAGE_SIZE != SZ_4K);
+
+    return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT);
+}
+
 static int __init construct_domain(struct domain *d, struct kernel_info *kinfo)
 {
     unsigned int i;
@@ -3190,6 +3205,8 @@ static int __init construct_domU(struct domain *d,
     const char *dom0less_enhanced;
     int rc;
     u64 mem;
+    u32 p2m_mem_mb;
+    unsigned long p2m_pages;
 
     rc = dt_property_read_u64(node, "memory", &mem);
     if ( !rc )
@@ -3199,6 +3216,18 @@ static int __init construct_domU(struct domain *d,
     }
     kinfo.unassigned_mem = (paddr_t)mem * SZ_1K;
 
+    rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb);
+    /* If xen,domain-p2m-mem-mb is not specified, use the default value. */
+    p2m_pages = rc ?
+                p2m_mem_mb << (20 - PAGE_SHIFT) :
+                domain_p2m_pages(mem, d->max_vcpus);
+
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, p2m_pages, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc != 0 )
+        return rc;
+
     printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem);
 
     kinfo.vpl011 = dt_property_read_bool(node, "vpl011");
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9bf72e693019..c8fdeb124084 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d,
 static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
                        XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
+    long rc;
+    bool preempted = false;
+
     if ( unlikely(d == current->domain) )
     {
         printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n");
@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
     switch ( sc->op )
     {
     case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
-        return 0;
+    {
+        /* Allow and handle preemption */
+        spin_lock(&d->arch.paging.lock);
+        rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
+        spin_unlock(&d->arch.paging.lock);
+
+        if ( preempted )
+            /* Not finished. Set up to re-run the call. */
+            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
+                                               u_domctl);
+        else
+            /* Finished. Return the new allocation. */
+            sc->mb = p2m_get_allocation(d);
+
+        return rc;
+    }
     case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
+    {
+        sc->mb = p2m_get_allocation(d);
         return 0;
+    }
     default:
     {
         printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op);
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index db385fe41023..f17500ddf3a3 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -44,6 +44,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
 }
 
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For hardware domain, there should be no limit in the number of pages that
+     * can be allocated, so that the kernel may take advantage of the extended
+     * regions. Hence, allocate p2m pages for hardware domains from heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--;
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock);
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        d->arch.paging.p2m_total_pages++;
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 unsigned int p2m_get_allocation(struct domain *d)
 {
@@ -747,7 +795,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
 
     ASSERT(!p2m_is_valid(*entry));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( page == NULL )
         return -ENOMEM;
 
@@ -877,7 +925,7 @@ static void p2m_free_entry(struct p2m_domain *p2m,
     pg = mfn_to_page(mfn);
 
     page_list_del(pg, &p2m->pages);
-    free_domheap_page(pg);
+    p2m_free_page(p2m->domain, pg);
 }
 
 static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
@@ -901,7 +949,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
     ASSERT(level < target);
     ASSERT(p2m_is_superpage(*entry, level));
 
-    page = alloc_domheap_page(NULL, 0);
+    page = p2m_alloc_page(p2m->domain);
     if ( !page )
         return false;
 
@@ -1665,7 +1713,7 @@ int p2m_teardown(struct domain *d)
 
     while ( (pg = page_list_remove_head(&p2m->pages)) )
     {
-        free_domheap_page(pg);
+        p2m_free_page(p2m->domain, pg);
         count++;
         /* Arbitrarily preempt every 512 iterations */
         if ( !(count % 512) && hypercall_preempt_check() )
@@ -1689,6 +1737,7 @@ void p2m_final_teardown(struct domain *d)
         return;
 
     ASSERT(page_list_empty(&p2m->pages));
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
 
     if ( p2m->root )
         free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
-- 
2.37.1
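
As a final worked example (illustration only, not part of the patches),
the dom0less default from domain_p2m_pages() can be reproduced
stand-alone; the guest below (2 vCPUs, 512 MiB of RAM) is hypothetical:

    #include <stdio.h>

    #define PAGE_SHIFT 12 /* 4K pages, as the BUILD_BUG_ON above enforces */

    /* Stand-alone sketch of domain_p2m_pages() from the hunk above. */
    static unsigned long domain_p2m_pages(unsigned long maxmem_kb,
                                          unsigned int smp_cpus)
    {
        /* 1MB per vcpu, 1 page per MiB of RAM, 128 pages for extended
         * regions, same formula as the patch. */
        unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128);

        /* DIV_ROUND_UP(memkb, 1024) MiB, converted to pages. */
        return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
    }

    int main(void)
    {
        /* Hypothetical 2-vCPU, 512 MiB domU: 4608 KiB rounds up to
         * 5 MiB, i.e. 1280 pages. */
        printf("%lu pages\n", domain_p2m_pages(512 * 1024, 2));
        return 0;
    }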