The pv_map_ldt_shadow_page() and pv_destroy_ldt() functions rely on the L1
table(s) that contain such mappings being stashed in the domain structure, and
thus such mappings being modified by merely updating the required L1 entries.
Switch pv_map_ldt_shadow_page() to unconditionally use the linear recursive mappings, as
that logic is always called while the vCPU is running on the current pCPU.
For pv_destroy_ldt() use the linear mappings if the vCPU is the one currently
running on the pCPU, otherwise use destroy_perdomain_mapping().
Note this requires keeping an array with the pages currently mapped at the LDT
area, since the MFNs (and thus the page references to drop) can no longer be
read back from the L1 page table entries when removing the mappings.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
xen/arch/x86/include/asm/domain.h | 2 ++
xen/arch/x86/pv/descriptor-tables.c | 19 ++++++++++---------
xen/arch/x86/pv/domain.c | 4 ++++
xen/arch/x86/pv/mm.c | 3 ++-
4 files changed, 18 insertions(+), 10 deletions(-)
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index b79d6badd71c..b659cffc7f81 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -523,6 +523,8 @@ struct pv_vcpu
struct trap_info *trap_ctxt;
unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE];
+ /* Max LDT entries is 8192, so 8192 * 8 = 64KiB (16 pages). */
+ mfn_t ldt_frames[16];
unsigned long ldt_base;
unsigned int gdt_ents, ldt_ents;
diff --git a/xen/arch/x86/pv/descriptor-tables.c b/xen/arch/x86/pv/descriptor-tables.c
index 5a79f022ce13..95b598a4c0cf 100644
--- a/xen/arch/x86/pv/descriptor-tables.c
+++ b/xen/arch/x86/pv/descriptor-tables.c
@@ -20,28 +20,29 @@
*/
bool pv_destroy_ldt(struct vcpu *v)
{
- l1_pgentry_t *pl1e;
+ const unsigned int nr_frames = ARRAY_SIZE(v->arch.pv.ldt_frames);
unsigned int i, mappings_dropped = 0;
- struct page_info *page;
ASSERT(!in_irq());
ASSERT(v == current || !vcpu_cpu_dirty(v));
- pl1e = pv_ldt_ptes(v);
+ destroy_perdomain_mapping(v, LDT_VIRT_START(v), nr_frames);
- for ( i = 0; i < 16; i++ )
+ for ( i = 0; i < nr_frames; i++ )
{
- if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
- continue;
+ mfn_t mfn = v->arch.pv.ldt_frames[i];
+ struct page_info *page;
- page = l1e_get_page(pl1e[i]);
- l1e_write(&pl1e[i], l1e_empty());
- mappings_dropped++;
+ if ( mfn_eq(mfn, INVALID_MFN) )
+ continue;
+ v->arch.pv.ldt_frames[i] = INVALID_MFN;
+ page = mfn_to_page(mfn);
ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
ASSERT_PAGE_IS_DOMAIN(page, v->domain);
put_page_and_type(page);
+ mappings_dropped++;
}
return mappings_dropped;
diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
index 7e8bffaae9a0..32d7488cc186 100644
--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -303,6 +303,7 @@ void pv_vcpu_destroy(struct vcpu *v)
int pv_vcpu_initialise(struct vcpu *v)
{
struct domain *d = v->domain;
+ unsigned int i;
int rc;
ASSERT(!is_idle_domain(d));
@@ -311,6 +312,9 @@ int pv_vcpu_initialise(struct vcpu *v)
if ( rc )
return rc;
+ for ( i = 0; i < ARRAY_SIZE(v->arch.pv.ldt_frames); i++ )
+ v->arch.pv.ldt_frames[i] = INVALID_MFN;
+
BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) >
PAGE_SIZE);
v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS);
diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 187f5f6a3e8c..4853e619f2a7 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -86,7 +86,8 @@ bool pv_map_ldt_shadow_page(unsigned int offset)
return false;
}
- pl1e = &pv_ldt_ptes(curr)[offset >> PAGE_SHIFT];
+ curr->arch.pv.ldt_frames[offset >> PAGE_SHIFT] = page_to_mfn(page);
+ pl1e = &__linear_l1_table[l1_linear_offset(LDT_VIRT_START(curr) + offset)];
l1e_add_flags(gl1e, _PAGE_RW);
l1e_write(pl1e, gl1e);
--
2.46.0
On 08.01.2025 15:26, Roger Pau Monne wrote: > The pv_map_ldt_shadow_page() and pv_destroy_ldt() functions rely on the L1 > table(s) that contain such mappings being stashed in the domain structure, and > thus such mappings being modified by merely updating the require L1 entries. > > Switch pv_map_ldt_shadow_page() to unconditionally use the linear recursive, as > that logic is always called while the vCPU is running on the current pCPU. > > For pv_destroy_ldt() use the linear mappings if the vCPU is the one currently > running on the pCPU, otherwise use destroy_mappings(). > > Note this requires keeping an array with the pages currently mapped at the LDT > area, as that allows dropping the extra taken page reference when removing the > mappings. I'm confused by the wording of this paragraph: It reads as if you were changing reference obtaining / dropping, yet it all looks to stay the same. If I'm not mistaken you use the array to replace the acquiring of the MFNs in question from the L1 page table entries. If so, I think it would be nice if this could be described in a more direct way. Perhaps first and foremost by replacing "allows" and getting rid of "extra". Jan
On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote: > The pv_map_ldt_shadow_page() and pv_destroy_ldt() functions rely on the L1 > table(s) that contain such mappings being stashed in the domain structure, and > thus such mappings being modified by merely updating the require L1 entries. > > Switch pv_map_ldt_shadow_page() to unconditionally use the linear recursive, as > that logic is always called while the vCPU is running on the current pCPU. > > For pv_destroy_ldt() use the linear mappings if the vCPU is the one currently > running on the pCPU, otherwise use destroy_mappings(). > > Note this requires keeping an array with the pages currently mapped at the LDT > area, as that allows dropping the extra taken page reference when removing the > mappings. > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > --- > xen/arch/x86/include/asm/domain.h | 2 ++ > xen/arch/x86/pv/descriptor-tables.c | 19 ++++++++++--------- > xen/arch/x86/pv/domain.c | 4 ++++ > xen/arch/x86/pv/mm.c | 3 ++- > 4 files changed, 18 insertions(+), 10 deletions(-) > > diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h > index b79d6badd71c..b659cffc7f81 100644 > --- a/xen/arch/x86/include/asm/domain.h > +++ b/xen/arch/x86/include/asm/domain.h > @@ -523,6 +523,8 @@ struct pv_vcpu > struct trap_info *trap_ctxt; > > unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE]; > + /* Max LDT entries is 8192, so 8192 * 8 = 64KiB (16 pages). 
*/ > + mfn_t ldt_frames[16]; > unsigned long ldt_base; > unsigned int gdt_ents, ldt_ents; > > diff --git a/xen/arch/x86/pv/descriptor-tables.c b/xen/arch/x86/pv/descriptor-tables.c > index 5a79f022ce13..95b598a4c0cf 100644 > --- a/xen/arch/x86/pv/descriptor-tables.c > +++ b/xen/arch/x86/pv/descriptor-tables.c > @@ -20,28 +20,29 @@ > */ > bool pv_destroy_ldt(struct vcpu *v) > { > - l1_pgentry_t *pl1e; > + const unsigned int nr_frames = ARRAY_SIZE(v->arch.pv.ldt_frames); > unsigned int i, mappings_dropped = 0; > - struct page_info *page; > > ASSERT(!in_irq()); > > ASSERT(v == current || !vcpu_cpu_dirty(v)); > > - pl1e = pv_ldt_ptes(v); > + destroy_perdomain_mapping(v, LDT_VIRT_START(v), nr_frames); > > - for ( i = 0; i < 16; i++ ) > + for ( i = 0; i < nr_frames; i++ ) nit: While at this, can the "unsigned int" be moved here too? > { > - if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) > - continue; > + mfn_t mfn = v->arch.pv.ldt_frames[i]; > + struct page_info *page; > > - page = l1e_get_page(pl1e[i]); > - l1e_write(&pl1e[i], l1e_empty()); > - mappings_dropped++; > + if ( mfn_eq(mfn, INVALID_MFN) ) > + continue; Can it really be disjoint? As in, why "continue" and not "break"?. Not that it matters in the slightest, and I prefer this form; but I'm curious. 
> > + v->arch.pv.ldt_frames[i] = INVALID_MFN; > + page = mfn_to_page(mfn); > ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page); > ASSERT_PAGE_IS_DOMAIN(page, v->domain); > put_page_and_type(page); > + mappings_dropped++; > } > > return mappings_dropped; > diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c > index 7e8bffaae9a0..32d7488cc186 100644 > --- a/xen/arch/x86/pv/domain.c > +++ b/xen/arch/x86/pv/domain.c > @@ -303,6 +303,7 @@ void pv_vcpu_destroy(struct vcpu *v) > int pv_vcpu_initialise(struct vcpu *v) > { > struct domain *d = v->domain; > + unsigned int i; > int rc; > > ASSERT(!is_idle_domain(d)); > @@ -311,6 +312,9 @@ int pv_vcpu_initialise(struct vcpu *v) > if ( rc ) > return rc; > > + for ( i = 0; i < ARRAY_SIZE(v->arch.pv.ldt_frames); i++ ) > + v->arch.pv.ldt_frames[i] = INVALID_MFN; > + I think it makes more sense to move this earlier so ldt_frames[] is initialised even if pv_vcpu_initialise() fails. It may be benign, but it looks like an accident abount to happen. Also, nit: "unsigned int i"'s scope can be restricted to the loop itself. As in, "for ( unsigned int i =..." > BUILD_BUG_ON(X86_NR_VECTORS * sizeof(*v->arch.pv.trap_ctxt) > > PAGE_SIZE); > v->arch.pv.trap_ctxt = xzalloc_array(struct trap_info, X86_NR_VECTORS); > diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c > index 187f5f6a3e8c..4853e619f2a7 100644 > --- a/xen/arch/x86/pv/mm.c > +++ b/xen/arch/x86/pv/mm.c > @@ -86,7 +86,8 @@ bool pv_map_ldt_shadow_page(unsigned int offset) > return false; > } > > - pl1e = &pv_ldt_ptes(curr)[offset >> PAGE_SHIFT]; > + curr->arch.pv.ldt_frames[offset >> PAGE_SHIFT] = page_to_mfn(page); > + pl1e = &__linear_l1_table[l1_linear_offset(LDT_VIRT_START(curr) + offset)]; > l1e_add_flags(gl1e, _PAGE_RW); > > l1e_write(pl1e, gl1e); Cheers, Alejandro
On Thu, Jan 09, 2025 at 02:34:05PM +0000, Alejandro Vallejo wrote: > On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote: > > The pv_map_ldt_shadow_page() and pv_destroy_ldt() functions rely on the L1 > > table(s) that contain such mappings being stashed in the domain structure, and > > thus such mappings being modified by merely updating the require L1 entries. > > > > Switch pv_map_ldt_shadow_page() to unconditionally use the linear recursive, as > > that logic is always called while the vCPU is running on the current pCPU. > > > > For pv_destroy_ldt() use the linear mappings if the vCPU is the one currently > > running on the pCPU, otherwise use destroy_mappings(). > > > > Note this requires keeping an array with the pages currently mapped at the LDT > > area, as that allows dropping the extra taken page reference when removing the > > mappings. > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > > --- > > xen/arch/x86/include/asm/domain.h | 2 ++ > > xen/arch/x86/pv/descriptor-tables.c | 19 ++++++++++--------- > > xen/arch/x86/pv/domain.c | 4 ++++ > > xen/arch/x86/pv/mm.c | 3 ++- > > 4 files changed, 18 insertions(+), 10 deletions(-) > > > > diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h > > index b79d6badd71c..b659cffc7f81 100644 > > --- a/xen/arch/x86/include/asm/domain.h > > +++ b/xen/arch/x86/include/asm/domain.h > > @@ -523,6 +523,8 @@ struct pv_vcpu > > struct trap_info *trap_ctxt; > > > > unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE]; > > + /* Max LDT entries is 8192, so 8192 * 8 = 64KiB (16 pages). 
*/ > > + mfn_t ldt_frames[16]; > > unsigned long ldt_base; > > unsigned int gdt_ents, ldt_ents; > > > > diff --git a/xen/arch/x86/pv/descriptor-tables.c b/xen/arch/x86/pv/descriptor-tables.c > > index 5a79f022ce13..95b598a4c0cf 100644 > > --- a/xen/arch/x86/pv/descriptor-tables.c > > +++ b/xen/arch/x86/pv/descriptor-tables.c > > @@ -20,28 +20,29 @@ > > */ > > bool pv_destroy_ldt(struct vcpu *v) > > { > > - l1_pgentry_t *pl1e; > > + const unsigned int nr_frames = ARRAY_SIZE(v->arch.pv.ldt_frames); > > unsigned int i, mappings_dropped = 0; > > - struct page_info *page; > > > > ASSERT(!in_irq()); > > > > ASSERT(v == current || !vcpu_cpu_dirty(v)); > > > > - pl1e = pv_ldt_ptes(v); > > + destroy_perdomain_mapping(v, LDT_VIRT_START(v), nr_frames); > > > > - for ( i = 0; i < 16; i++ ) > > + for ( i = 0; i < nr_frames; i++ ) > > nit: While at this, can the "unsigned int" be moved here too? I don't mind much, but I also don't usually do such changes as I think it adds more noise. > > { > > - if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) > > - continue; > > + mfn_t mfn = v->arch.pv.ldt_frames[i]; > > + struct page_info *page; > > > > - page = l1e_get_page(pl1e[i]); > > - l1e_write(&pl1e[i], l1e_empty()); > > - mappings_dropped++; > > + if ( mfn_eq(mfn, INVALID_MFN) ) > > + continue; > > Can it really be disjoint? As in, why "continue" and not "break"?. Not that it > matters in the slightest, and I prefer this form; but I'm curious. I think so? The PV guest LDT is populated as a result of page-faults, so if the guest only happens to use segment descriptors that are on the third page, the second page might not be mapped? The continue was there already, and I really didn't dare to change this, neither asked myself much. Assumed due to how the guest LDT is mapped on a page-fault basis it could indeed be disjointly mapped. 
> > > > + v->arch.pv.ldt_frames[i] = INVALID_MFN; > > + page = mfn_to_page(mfn); > > ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page); > > ASSERT_PAGE_IS_DOMAIN(page, v->domain); > > put_page_and_type(page); > > + mappings_dropped++; > > } > > > > return mappings_dropped; > > diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c > > index 7e8bffaae9a0..32d7488cc186 100644 > > --- a/xen/arch/x86/pv/domain.c > > +++ b/xen/arch/x86/pv/domain.c > > @@ -303,6 +303,7 @@ void pv_vcpu_destroy(struct vcpu *v) > > int pv_vcpu_initialise(struct vcpu *v) > > { > > struct domain *d = v->domain; > > + unsigned int i; > > int rc; > > > > ASSERT(!is_idle_domain(d)); > > @@ -311,6 +312,9 @@ int pv_vcpu_initialise(struct vcpu *v) > > if ( rc ) > > return rc; > > > > + for ( i = 0; i < ARRAY_SIZE(v->arch.pv.ldt_frames); i++ ) > > + v->arch.pv.ldt_frames[i] = INVALID_MFN; > > + > > I think it makes more sense to move this earlier so ldt_frames[] is initialised > even if pv_vcpu_initialise() fails. It may be benign, but it looks like an > accident abount to happen. Right, pv_destroy_gdt_ldt_l1tab() doesn't care at all about the contents of ldt_frames[], but it will be safe to do change the ordering in pv_vcpu_initialise(). Thanks, Roger.
On Fri Jan 10, 2025 at 2:44 PM GMT, Roger Pau Monné wrote: > On Thu, Jan 09, 2025 at 02:34:05PM +0000, Alejandro Vallejo wrote: > > On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote: > > > The pv_map_ldt_shadow_page() and pv_destroy_ldt() functions rely on the L1 > > > table(s) that contain such mappings being stashed in the domain structure, and > > > thus such mappings being modified by merely updating the require L1 entries. > > > > > > Switch pv_map_ldt_shadow_page() to unconditionally use the linear recursive, as > > > that logic is always called while the vCPU is running on the current pCPU. > > > > > > For pv_destroy_ldt() use the linear mappings if the vCPU is the one currently > > > running on the pCPU, otherwise use destroy_mappings(). > > > > > > Note this requires keeping an array with the pages currently mapped at the LDT > > > area, as that allows dropping the extra taken page reference when removing the > > > mappings. > > > > > > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> > > > --- > > > xen/arch/x86/include/asm/domain.h | 2 ++ > > > xen/arch/x86/pv/descriptor-tables.c | 19 ++++++++++--------- > > > xen/arch/x86/pv/domain.c | 4 ++++ > > > xen/arch/x86/pv/mm.c | 3 ++- > > > 4 files changed, 18 insertions(+), 10 deletions(-) > > > > > > diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h > > > index b79d6badd71c..b659cffc7f81 100644 > > > --- a/xen/arch/x86/include/asm/domain.h > > > +++ b/xen/arch/x86/include/asm/domain.h > > > @@ -523,6 +523,8 @@ struct pv_vcpu > > > struct trap_info *trap_ctxt; > > > > > > unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE]; > > > + /* Max LDT entries is 8192, so 8192 * 8 = 64KiB (16 pages). 
*/ > > > + mfn_t ldt_frames[16]; > > > unsigned long ldt_base; > > > unsigned int gdt_ents, ldt_ents; > > > > > > diff --git a/xen/arch/x86/pv/descriptor-tables.c b/xen/arch/x86/pv/descriptor-tables.c > > > index 5a79f022ce13..95b598a4c0cf 100644 > > > --- a/xen/arch/x86/pv/descriptor-tables.c > > > +++ b/xen/arch/x86/pv/descriptor-tables.c > > > @@ -20,28 +20,29 @@ > > > */ > > > bool pv_destroy_ldt(struct vcpu *v) > > > { > > > - l1_pgentry_t *pl1e; > > > + const unsigned int nr_frames = ARRAY_SIZE(v->arch.pv.ldt_frames); > > > unsigned int i, mappings_dropped = 0; > > > - struct page_info *page; > > > > > > ASSERT(!in_irq()); > > > > > > ASSERT(v == current || !vcpu_cpu_dirty(v)); > > > > > > - pl1e = pv_ldt_ptes(v); > > > + destroy_perdomain_mapping(v, LDT_VIRT_START(v), nr_frames); > > > > > > - for ( i = 0; i < 16; i++ ) > > > + for ( i = 0; i < nr_frames; i++ ) > > > > nit: While at this, can the "unsigned int" be moved here too? > > I don't mind much, but I also don't usually do such changes as I think > it adds more noise. Fair enough, nvm then. > > > > { > > > - if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) ) > > > - continue; > > > + mfn_t mfn = v->arch.pv.ldt_frames[i]; > > > + struct page_info *page; > > > > > > - page = l1e_get_page(pl1e[i]); > > > - l1e_write(&pl1e[i], l1e_empty()); > > > - mappings_dropped++; > > > + if ( mfn_eq(mfn, INVALID_MFN) ) > > > + continue; > > > > Can it really be disjoint? As in, why "continue" and not "break"?. Not that it > > matters in the slightest, and I prefer this form; but I'm curious. > > I think so? The PV guest LDT is populated as a result of page-faults, > so if the guest only happens to use segment descriptors that are on > the third page, the second page might not be mapped? > > The continue was there already, and I really didn't dare to change > this, neither asked myself much. Assumed due to how the guest LDT is > mapped on a page-fault basis it could indeed be disjointly mapped. Ah, I see. 
That makes sense then. I wouldn't suggest changing it either, I was just curious :) > > > > > > > + v->arch.pv.ldt_frames[i] = INVALID_MFN; > > > + page = mfn_to_page(mfn); > > > ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page); > > > ASSERT_PAGE_IS_DOMAIN(page, v->domain); > > > put_page_and_type(page); > > > + mappings_dropped++; > > > } > > > > > > return mappings_dropped; > > > diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c > > > index 7e8bffaae9a0..32d7488cc186 100644 > > > --- a/xen/arch/x86/pv/domain.c > > > +++ b/xen/arch/x86/pv/domain.c > > > @@ -303,6 +303,7 @@ void pv_vcpu_destroy(struct vcpu *v) > > > int pv_vcpu_initialise(struct vcpu *v) > > > { > > > struct domain *d = v->domain; > > > + unsigned int i; > > > int rc; > > > > > > ASSERT(!is_idle_domain(d)); > > > @@ -311,6 +312,9 @@ int pv_vcpu_initialise(struct vcpu *v) > > > if ( rc ) > > > return rc; > > > > > > + for ( i = 0; i < ARRAY_SIZE(v->arch.pv.ldt_frames); i++ ) > > > + v->arch.pv.ldt_frames[i] = INVALID_MFN; > > > + > > > > I think it makes more sense to move this earlier so ldt_frames[] is initialised > > even if pv_vcpu_initialise() fails. It may be benign, but it looks like an > > accident abount to happen. > > Right, pv_destroy_gdt_ldt_l1tab() doesn't care at all about the > contents of ldt_frames[], but it will be safe to do change the > ordering in pv_vcpu_initialise(). > > Thanks, Roger. Cheers, Alejandro
© 2016 - 2025 Red Hat, Inc.