From: Yu-cheng Yu <yu-cheng.yu@intel.com>
There is essentially no room left in the x86 hardware PTEs on some OSes
(not Linux). That left the hardware architects looking for a way to
represent a new memory type (shadow stack) within the existing bits.
They chose to repurpose a lightly-used state: Write=0, Dirty=1.
The reason it's lightly used is that Dirty=1 is normally set by hardware
and cannot normally be set by hardware on a Write=0 PTE. Software must
normally be involved to create one of these PTEs, so software can simply
opt to not create them.
In places where Linux normally creates Write=0, Dirty=1, it can use the
software-defined _PAGE_COW in place of the hardware _PAGE_DIRTY. In other
words, whenever Linux needs to create Write=0, Dirty=1, it instead creates
Write=0, Cow=1, except for shadow stack, which is Write=0, Dirty=1. This
clearly separates shadow stack from other data, and results in the
following:
(a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
(b) A R/O page that has been COW'ed: (Write=0, Cow=1)
The user page is in a R/O VMA, and get_user_pages() needs a writable
copy. The page fault handler creates a copy of the page and sets
the new copy's PTE as Write=0 and Cow=1.
(c) A shadow stack PTE: (Write=0, Dirty=1)
(d) A shared shadow stack PTE: (Write=0, Cow=1)
When a shadow stack page is being shared among processes (this happens
at fork()), its PTE is made Dirty=0, so the next shadow stack access
causes a fault, and the page is duplicated and Dirty=1 is set again.
This is the COW equivalent for shadow stack pages, even though it's
copy-on-access rather than copy-on-write.
(e) A page where the processor observed a Write=1 PTE, started a write, set
Dirty=1, but then observed a Write=0 PTE. That's possible today, but
will not happen on processors that support shadow stack.
Define _PAGE_COW and update pte_*() helpers and apply the same changes to
pmd and pud.
After this, there are six free bits left in the 64-bit PTE, and no more
free bits in the 32-bit PTE (except for PAE) and Shadow Stack is not
implemented for the 32-bit kernel.
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
arch/x86/include/asm/pgtable.h | 196 ++++++++++++++++++++++++---
arch/x86/include/asm/pgtable_types.h | 42 +++++-
2 files changed, 217 insertions(+), 21 deletions(-)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index aff5e489ff17..a4a75e78a934 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -123,9 +123,20 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ /*
+ * A dirty PTE has Dirty=1 or Cow=1.
+ */
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pte_shstk(pte_t pte)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return false;
+
+ return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
@@ -133,9 +144,20 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ /*
+ * A dirty PMD has Dirty=1 or Cow=1.
+ */
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return false;
+
+ return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}
static inline int pmd_young(pmd_t pmd)
@@ -143,9 +165,12 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ /*
+ * A dirty PUD has Dirty=1 or Cow=1.
+ */
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -155,13 +180,23 @@ static inline int pud_young(pud_t pud)
static inline int pte_write(pte_t pte)
{
- return pte_flags(pte) & _PAGE_RW;
+ /*
+ * Shadow stack pages are always writable - but not by normal
+ * instructions, and only by shadow stack operations. Therefore,
+ * the W=0,D=1 test with pte_shstk().
+ */
+ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
}
#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_RW;
+ /*
+ * Shadow stack pages are always writable - but not by normal
+ * instructions, and only by shadow stack operations. Therefore,
+ * the W=0,D=1 test with pmd_shstk().
+ */
+ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
}
#define pud_write pud_write
@@ -299,6 +334,24 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+static inline pte_t pte_mkcow(pte_t pte)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pte;
+
+ pte = pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_set_flags(pte, _PAGE_COW);
+}
+
+static inline pte_t pte_clear_cow(pte_t pte)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pte;
+
+ pte = pte_set_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_COW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
@@ -318,7 +371,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -328,7 +381,16 @@ static inline pte_t pte_mkold(pte_t pte)
static inline pte_t pte_wrprotect(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_RW);
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PTE (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ if (pte_dirty(pte))
+ pte = pte_mkcow(pte);
+ return pte;
}
static inline pte_t pte_mkexec(pte_t pte)
@@ -338,7 +400,18 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pteval_t dirty = _PAGE_DIRTY;
+
+ /* Avoid creating (HW)Dirty=1, Write=0 PTEs */
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !pte_write(pte))
+ dirty = _PAGE_COW;
+
+ return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+ return pte_clear_cow(pte);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -348,7 +421,12 @@ static inline pte_t pte_mkyoung(pte_t pte)
static inline pte_t pte_mkwrite(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_RW);
+ pte = pte_set_flags(pte, _PAGE_RW);
+
+ if (pte_dirty(pte))
+ pte = pte_clear_cow(pte);
+
+ return pte;
}
static inline pte_t pte_mkhuge(pte_t pte)
@@ -395,6 +473,24 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+static inline pmd_t pmd_mkcow(pmd_t pmd)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pmd;
+
+ pmd = pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_set_flags(pmd, _PAGE_COW);
+}
+
+static inline pmd_t pmd_clear_cow(pmd_t pmd)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pmd;
+
+ pmd = pmd_set_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_COW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pmd_uffd_wp(pmd_t pmd)
{
@@ -419,17 +515,36 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_RW);
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PMD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ if (pmd_dirty(pmd))
+ pmd = pmd_mkcow(pmd);
+ return pmd;
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pmdval_t dirty = _PAGE_DIRTY;
+
+ /* Avoid creating (HW)Dirty=1, Write=0 PMDs */
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !pmd_write(pmd))
+ dirty = _PAGE_COW;
+
+ return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+ return pmd_clear_cow(pmd);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -449,7 +564,11 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_RW);
+ pmd = pmd_set_flags(pmd, _PAGE_RW);
+
+ if (pmd_dirty(pmd))
+ pmd = pmd_clear_cow(pmd);
+ return pmd;
}
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
@@ -466,6 +585,24 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+static inline pud_t pud_mkcow(pud_t pud)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pud;
+
+ pud = pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_set_flags(pud, _PAGE_COW);
+}
+
+static inline pud_t pud_clear_cow(pud_t pud)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pud;
+
+ pud = pud_set_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_COW);
+}
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -473,17 +610,32 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_RW);
+ pud = pud_clear_flags(pud, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PUD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ if (pud_dirty(pud))
+ pud = pud_mkcow(pud);
+ return pud;
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pudval_t dirty = _PAGE_DIRTY;
+
+ /* Avoid creating (HW)Dirty=1, Write=0 PUDs */
+ if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !pud_write(pud))
+ dirty = _PAGE_COW;
+
+ return pud_set_flags(pud, dirty | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -503,7 +655,11 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_RW);
+ pud = pud_set_flags(pud, _PAGE_RW);
+
+ if (pud_dirty(pud))
+ pud = pud_clear_cow(pud);
+ return pud;
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3781a79b6388..1bfab70ff9ac 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -21,7 +21,8 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_SOFTW4 57 /* available for programmer */
+#define _PAGE_BIT_SOFTW5 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
@@ -34,6 +35,15 @@
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+/*
+ * Indicates a copy-on-write page.
+ */
+#ifdef CONFIG_X86_SHADOW_STACK
+#define _PAGE_BIT_COW _PAGE_BIT_SOFTW5 /* copy-on-write */
+#else
+#define _PAGE_BIT_COW 0
+#endif
+
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -115,6 +125,36 @@
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
+/*
+ * The hardware requires shadow stack to be read-only and Dirty.
+ * _PAGE_COW is a software-only bit used to separate copy-on-write PTEs
+ * from shadow stack PTEs:
+ * (a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
+ * (b) A R/O page that has been COW'ed: (Write=0, Cow=1)
+ * The user page is in a R/O VMA, and get_user_pages() needs a
+ * writable copy. The page fault handler creates a copy of the page
+ * and sets the new copy's PTE as Write=0, Cow=1.
+ * (c) A shadow stack PTE: (Write=0, Dirty=1)
+ * (d) A shared (copy-on-access) shadow stack PTE: (Write=0, Cow=1)
+ * When a shadow stack page is being shared among processes (this
+ * happens at fork()), its PTE is cleared of _PAGE_DIRTY, so the next
+ * shadow stack access causes a fault, and the page is duplicated and
+ * _PAGE_DIRTY is set again. This is the COW equivalent for shadow
+ * stack pages, even though it's copy-on-access rather than
+ * copy-on-write.
+ * (e) A page where the processor observed a Write=1 PTE, started a write,
+ * set Dirty=1, but then observed a Write=0 PTE (changed by another
+ * thread). That's possible today, but will not happen on processors
+ * that support shadow stack.
+ */
+#ifdef CONFIG_X86_SHADOW_STACK
+#define _PAGE_COW (_AT(pteval_t, 1) << _PAGE_BIT_COW)
+#else
+#define _PAGE_COW (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_COW)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
/*
--
2.17.1
On 1/30/22 13:18, Rick Edgecombe wrote:
> From: Yu-cheng Yu <yu-cheng.yu@intel.com>
>
> There is essentially no room left in the x86 hardware PTEs on some OSes
> (not Linux). That left the hardware architects looking for a way to
> represent a new memory type (shadow stack) within the existing bits.
> They chose to repurpose a lightly-used state: Write=0, Dirty=1.
>
> The reason it's lightly used is that Dirty=1 is normally set by hardware
> and cannot normally be set by hardware on a Write=0 PTE. Software must
> normally be involved to create one of these PTEs, so software can simply
> opt to not create them.
This is kinda skipping over something important:
The reason it's lightly used is that Dirty=1 is normally set
_before_ a write. A write with a Write=0 PTE would typically
only generate a fault, not set Dirty=1. Hardware can (rarely)
both set Write=1 *and* generate the fault, resulting in a
Dirty=0,Write=1 PTE. Hardware which supports shadow stacks
will no longer exhibit this oddity.
> In places where Linux normally creates Write=0, Dirty=1, it can use the
> software-defined _PAGE_COW in place of the hardware _PAGE_DIRTY. In other
> words, whenever Linux needs to create Write=0, Dirty=1, it instead creates
> Write=0, Cow=1, except for shadow stack, which is Write=0, Dirty=1. This
> clearly separates shadow stack from other data, and results in the
> following:
Following _what_... What are these? I think they're PTE states. Best
to say that.
> (a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
Could you give an example of this? Would this be a typical anonymous
page which was Write=1,Dirty=1, then historically made Write=0,Dirty=1
at fork()?
> (b) A R/O page that has been COW'ed: (Write=0, Cow=1)
> The user page is in a R/O VMA, and get_user_pages() needs a writable
> copy. The page fault handler creates a copy of the page and sets
> the new copy's PTE as Write=0 and Cow=1.
> (c) A shadow stack PTE: (Write=0, Dirty=1)
> (d) A shared shadow stack PTE: (Write=0, Cow=1)
> When a shadow stack page is being shared among processes (this happens
> at fork()), its PTE is made Dirty=0, so the next shadow stack access
> causes a fault, and the page is duplicated and Dirty=1 is set again.
> This is the COW equivalent for shadow stack pages, even though it's
> copy-on-access rather than copy-on-write.
Just like code, it's also nice to format these in a way which allows
them to be visually compared, trivially. So, let's expand all the bits
and vertically align everything. To break this down a bit, we have two
old states:
[a] (Write=0, Dirty=0, Cow=1)
[b] (Write=0, Dirty=0, Cow=1)
And two new ones:
[c] (Write=0, Dirty=1, Cow=0)
[d] (Write=0, Dirty=0, Cow=1)
That makes me wonder what the difference is between [a] and [b] and why
they are separate. Is their handling different? How are those two
states differentiated?
> (e) A page where the processor observed a Write=1 PTE, started a write, set
> Dirty=1, but then observed a Write=0 PTE. That's possible today, but
> will not happen on processors that support shadow stack.
This left me wondering how you are going to detangle the mess where PTEs
look like shadow-stack PTEs on non-shadow-stack hardware. Could you
cover that here?
You can shorten that above bullet to this to help make the space:
(e) (Write=0, Dirty=1, Cow=0) PTE created when a processor
without shadow stack support set Dirty=1.
> Define _PAGE_COW and update pte_*() helpers and apply the same changes to
> pmd and pud.
>
> After this, there are six free bits left in the 64-bit PTE, and no more
> free bits in the 32-bit PTE (except for PAE) and Shadow Stack is not
> implemented for the 32-bit kernel.
Just say:
There are six bits left available to software in the 64-bit PTE
after consuming a bit for _PAGE_COW. No space is consumed in
32-bit kernels because shadow stacks are not enabled there.
There's no need to rub it in that 32-bit is out of space.
> -static inline int pte_dirty(pte_t pte)
> +static inline bool pte_dirty(pte_t pte)
> {
> - return pte_flags(pte) & _PAGE_DIRTY;
> + /*
> + * A dirty PTE has Dirty=1 or Cow=1.
> + */
I don't really like that comment because "Cow" isn't anywhere to be found.
> + return pte_flags(pte) & _PAGE_DIRTY_BITS;
> +}
> +
> +static inline bool pte_shstk(pte_t pte)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return false;
> +
> + return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
> }
>
> static inline int pte_young(pte_t pte)
> @@ -133,9 +144,20 @@ static inline int pte_young(pte_t pte)
> return pte_flags(pte) & _PAGE_ACCESSED;
> }
>
> -static inline int pmd_dirty(pmd_t pmd)
> +static inline bool pmd_dirty(pmd_t pmd)
> {
> - return pmd_flags(pmd) & _PAGE_DIRTY;
> + /*
> + * A dirty PMD has Dirty=1 or Cow=1.
> + */
> + return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
> +}
> +
> +static inline bool pmd_shstk(pmd_t pmd)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return false;
> +
> + return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
> }
>
> static inline int pmd_young(pmd_t pmd)
> @@ -143,9 +165,12 @@ static inline int pmd_young(pmd_t pmd)
> return pmd_flags(pmd) & _PAGE_ACCESSED;
> }
>
> -static inline int pud_dirty(pud_t pud)
> +static inline bool pud_dirty(pud_t pud)
> {
> - return pud_flags(pud) & _PAGE_DIRTY;
> + /*
> + * A dirty PUD has Dirty=1 or Cow=1.
> + */
> + return pud_flags(pud) & _PAGE_DIRTY_BITS;
> }
>
> static inline int pud_young(pud_t pud)
> @@ -155,13 +180,23 @@ static inline int pud_young(pud_t pud)
>
> static inline int pte_write(pte_t pte)
> {
> - return pte_flags(pte) & _PAGE_RW;
> + /*
> + * Shadow stack pages are always writable - but not by normal
> + * instructions, and only by shadow stack operations. Therefore,
> + * the W=0,D=1 test with pte_shstk().
> + */
I think that comment is off a bit. It's not really connected to the
code. We don't, for instance need to know what the bit combination is
inside pte_shstk(). Further, it's a bit mean to talk about "W" in the
comment and _PAGE_RW in the code. How about:
/*
* Shadow stack pages are logically writable, but do not have
* _PAGE_RW. Check for them separately from _PAGE_RW itself.
*/
> + return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
> }
>
> #define pmd_write pmd_write
> static inline int pmd_write(pmd_t pmd)
> {
> - return pmd_flags(pmd) & _PAGE_RW;
> + /*
> + * Shadow stack pages are always writable - but not by normal
> + * instructions, and only by shadow stack operations. Therefore,
> + * the W=0,D=1 test with pmd_shstk().
> + */
> + return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
> }
Ditto on the comment. Please copy the pte_write() one here too.
>
> #define pud_write pud_write
> @@ -299,6 +334,24 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
> return native_make_pte(v & ~clear);
> }
>
> +static inline pte_t pte_mkcow(pte_t pte)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return pte;
> +
> + pte = pte_clear_flags(pte, _PAGE_DIRTY);
> + return pte_set_flags(pte, _PAGE_COW);
> +}
> +
> +static inline pte_t pte_clear_cow(pte_t pte)
> +{
> + if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
> + return pte;
> +
> + pte = pte_set_flags(pte, _PAGE_DIRTY);
> + return pte_clear_flags(pte, _PAGE_COW);
> +}
I think we need to say *SOMETHING* about the X86_FEATURE_SHSTK and
_PAGE_COW connection here. Otherwise they look like two random features
that are interacting in an unknown way.
Maybe even something this simple:
/*
* _PAGE_COW is unnecessary on !X86_FEATURE_SHSTK kernels.
* See the _PAGE_COW definition for more details.
*/
Also, the manipulation of _PAGE_DIRTY is not clear here. It's obvious
why we have to:
pte_clear_flags(pte, _PAGE_COW);
in a function called pte_clear_cow() but, again, how does _PAGE_DIRTY fit?
> #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
> static inline int pte_uffd_wp(pte_t pte)
> {
> @@ -318,7 +371,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
>
> static inline pte_t pte_mkclean(pte_t pte)
> {
> - return pte_clear_flags(pte, _PAGE_DIRTY);
> + return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
> }
>
> static inline pte_t pte_mkold(pte_t pte)
> @@ -328,7 +381,16 @@ static inline pte_t pte_mkold(pte_t pte)
>
> static inline pte_t pte_wrprotect(pte_t pte)
> {
> - return pte_clear_flags(pte, _PAGE_RW);
> + pte = pte_clear_flags(pte, _PAGE_RW);
> +
> + /*
> + * Blindly clearing _PAGE_RW might accidentally create
> + * a shadow stack PTE (RW=0, Dirty=1). Move the hardware
Could you grep this series and try to be consistent about the formatting
here? (Not that I've been perfect in this regard either). I think we
have at least:
Write=X,Dirty=Y
W=X,D=Y
RW=X,Dirty=Y
> + * dirty value to the software bit.
> + */
> + if (pte_dirty(pte))
> + pte = pte_mkcow(pte);
> + return pte;
> }
One of my logical checks for this is "does it all go away when this is
compiled out". Because of this:
+static inline pte_t pte_mkcow(pte_t pte)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+ return pte;
...
the answer is yes! So, this looks good to me. Just thought I'd share a
bit of my thought process.
> static inline pte_t pte_mkexec(pte_t pte)
> @@ -338,7 +400,18 @@ static inline pte_t pte_mkexec(pte_t pte)
>
> static inline pte_t pte_mkdirty(pte_t pte)
> {
> - return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
> + pteval_t dirty = _PAGE_DIRTY;
> +
> + /* Avoid creating (HW)Dirty=1, Write=0 PTEs */
The "(HW)" thing doesn't make a lot of sense any longer. I think we had
a set of HWDirty and SWDirty bits, but SWDirty ended up being morphed
over to _PAGE_COW.
> + if (cpu_feature_enabled(X86_FEATURE_SHSTK) && !pte_write(pte))
> + dirty = _PAGE_COW;
> +
> + return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
> +}
> +
> +static inline pte_t pte_mkwrite_shstk(pte_t pte)
> +{
> + return pte_clear_cow(pte);
> }
This one is a bit of black magic. This is taking a PTE from
(presumably) states [c]->[d] from earlier in the changelog.
Write=0,Dirty=0,Cow=1
to
Write=0,Dirty=1,Cow=0
It's hard to wrap my head around how clearing a software bit (from the
naming) will make this PTE writable.
There's either something wrong with the naming, or something wrong with
my mental model of what "COW clearing" is.
> static inline pte_t pte_mkyoung(pte_t pte)
> @@ -348,7 +421,12 @@ static inline pte_t pte_mkyoung(pte_t pte)
>
> static inline pte_t pte_mkwrite(pte_t pte)
> {
> - return pte_set_flags(pte, _PAGE_RW);
> + pte = pte_set_flags(pte, _PAGE_RW);
> +
> + if (pte_dirty(pte))
> + pte = pte_clear_cow(pte);
> +
> + return pte;
> }
Along the same lines as the last few comments, this leaves me wondering
why a pte_dirty() can't also be a "COW PTE".
... <snipping the pmd/pud copies> ...
> #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 3781a79b6388..1bfab70ff9ac 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -21,7 +21,8 @@
> #define _PAGE_BIT_SOFTW2 10 /* " */
> #define _PAGE_BIT_SOFTW3 11 /* " */
> #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
> -#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
> +#define _PAGE_BIT_SOFTW4 57 /* available for programmer */
> +#define _PAGE_BIT_SOFTW5 58 /* available for programmer */
> #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
> #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
> #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
> @@ -34,6 +35,15 @@
> #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
> #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
>
> +/*
> + * Indicates a copy-on-write page.
> + */
> +#ifdef CONFIG_X86_SHADOW_STACK
> +#define _PAGE_BIT_COW _PAGE_BIT_SOFTW5 /* copy-on-write */
> +#else
> +#define _PAGE_BIT_COW 0
> +#endif
> +
> /* If _PAGE_BIT_PRESENT is clear, we use these: */
> /* - if the user mapped it with PROT_NONE; pte_present gives true */
> #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
> @@ -115,6 +125,36 @@
> #define _PAGE_DEVMAP (_AT(pteval_t, 0))
> #endif
>
> +/*
> + * The hardware requires shadow stack to be read-only and Dirty.
> + * _PAGE_COW is a software-only bit used to separate copy-on-write PTEs
> + * from shadow stack PTEs:
> + * (a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
> + * (b) A R/O page that has been COW'ed: (Write=0, Cow=1)
> + * The user page is in a R/O VMA, and get_user_pages() needs a
> + * writable copy. The page fault handler creates a copy of the page
> + * and sets the new copy's PTE as Write=0, Cow=1.
> + * (c) A shadow stack PTE: (Write=0, Dirty=1)
> + * (d) A shared (copy-on-access) shadow stack PTE: (Write=0, Cow=1)
> + * When a shadow stack page is being shared among processes (this
> + * happens at fork()), its PTE is cleared of _PAGE_DIRTY, so the next
> + * shadow stack access causes a fault, and the page is duplicated and
> + * _PAGE_DIRTY is set again. This is the COW equivalent for shadow
> + * stack pages, even though it's copy-on-access rather than
> + * copy-on-write.
> + * (e) A page where the processor observed a Write=1 PTE, started a write,
> + * set Dirty=1, but then observed a Write=0 PTE (changed by another
> + * thread). That's possible today, but will not happen on processors
> + * that support shadow stack.
This info, again, is great. Let's keep it, but please do reformat it
like the changelog version to make the bit states easier to grok.
© 2016 - 2025 Red Hat, Inc.