For vendor-specific code to support superpages we need to be able to
deal with a superpage mapping replacing an intermediate page table (or
hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
needed to free individual page tables while a domain is still alive.
Since the freeing needs to be deferred until after a suitable IOTLB
flush has been performed, released page tables get queued for processing
by a tasklet.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
I was considering whether to use a softirq-tasklet instead. This would
have the benefit of avoiding extra scheduling operations, but comes with
the risk of the freeing happening prematurely because of a
process_pending_softirqs() somewhere.
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -12,6 +12,7 @@
  * this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/cpu.h>
 #include <xen/sched.h>
 #include <xen/iommu.h>
 #include <xen/paging.h>
@@ -463,6 +464,85 @@ struct page_info *iommu_alloc_pgtable(st
     return pg;
 }
 
+/*
+ * Intermediate page tables which get replaced by large pages may only be
+ * freed after a suitable IOTLB flush. Hence such pages get queued on a
+ * per-CPU list, with a per-CPU tasklet processing the list on the assumption
+ * that the necessary IOTLB flush will have occurred by the time tasklets get
+ * to run. (List and tasklet being per-CPU has the benefit of accesses not
+ * requiring any locking.)
+ */
+static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
+static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
+
+static void free_queued_pgtables(void *arg)
+{
+    struct page_list_head *list = arg;
+    struct page_info *pg;
+
+    while ( (pg = page_list_remove_head(list)) )
+        free_domheap_page(pg);
+}
+
+void iommu_queue_free_pgtable(struct domain *d, struct page_info *pg)
+{
+    struct domain_iommu *hd = dom_iommu(d);
+    unsigned int cpu = smp_processor_id();
+
+    spin_lock(&hd->arch.pgtables.lock);
+    page_list_del(pg, &hd->arch.pgtables.list);
+    spin_unlock(&hd->arch.pgtables.lock);
+
+    page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
+
+    tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
+}
+
+static int cpu_callback(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+    struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
+    struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
+
+    switch ( action )
+    {
+    case CPU_DOWN_PREPARE:
+        tasklet_kill(tasklet);
+        break;
+
+    case CPU_DEAD:
+        page_list_splice(list, &this_cpu(free_pgt_list));
+        INIT_PAGE_LIST_HEAD(list);
+        tasklet_schedule(&this_cpu(free_pgt_tasklet));
+        break;
+
+    case CPU_UP_PREPARE:
+    case CPU_DOWN_FAILED:
+        tasklet_init(tasklet, free_queued_pgtables, list);
+        break;
+    }
+
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_nfb = {
+    .notifier_call = cpu_callback,
+};
+
+static int __init bsp_init(void)
+{
+    if ( iommu_enabled )
+    {
+        cpu_callback(&cpu_nfb, CPU_UP_PREPARE,
+                     (void *)(unsigned long)smp_processor_id());
+        register_cpu_notifier(&cpu_nfb);
+    }
+
+    return 0;
+}
+presmp_initcall(bsp_init);
+
 bool arch_iommu_use_permitted(const struct domain *d)
 {
     /*
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -143,6 +143,7 @@ int pi_update_irte(const struct pi_desc
 int __must_check iommu_free_pgtables(struct domain *d);
 struct page_info *__must_check iommu_alloc_pgtable(struct domain *d);
+void iommu_queue_free_pgtable(struct domain *d, struct page_info *pg);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
 
 /*
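
For illustration, a minimal sketch of how vendor-side superpage code might
use the new helper. The function, its arguments and the write_atomic()
update shown are hypothetical stand-ins, not part of this patch; only
iommu_queue_free_pgtable() is introduced here:

    /* Hypothetical caller: replace an intermediate table by a superpage. */
    static void replace_with_superpage(struct domain *d, uint64_t *pte,
                                       uint64_t superpage_entry,
                                       struct page_info *old_pt)
    {
        /* Hook the superpage entry into the live hierarchy. */
        write_atomic(pte, superpage_entry);

        /*
         * old_pt may still be reachable via IOTLB / page-walk caches, so
         * queue it: it gets freed only once the tasklet runs, i.e. after
         * the necessary IOTLB flush is assumed to have happened.
         */
        iommu_queue_free_pgtable(d, old_pt);
    }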
On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
> For vendor-specific code to support superpages we need to be able to
> deal with a superpage mapping replacing an intermediate page table (or
> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> needed to free individual page tables while a domain is still alive.
> Since the freeing needs to be deferred until after a suitable IOTLB
> flush has been performed, released page tables get queued for processing
> by a tasklet.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I was considering whether to use a softirq-tasklet instead. This would
> have the benefit of avoiding extra scheduling operations, but comes with
> the risk of the freeing happening prematurely because of a
> process_pending_softirqs() somewhere.

Another approach that comes to mind (maybe you already thought of it
and discarded) would be to perform the freeing after the flush in
iommu_iotlb_flush{_all} while keeping the per-pCPU lists. That would
IMO seem better from a safety PoV, as we know that the flush has been
performed when the pages are freed, and would avoid the switch to the
idle domain in order to do the freeing.

Thanks, Roger.
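
A rough sketch of the alternative Roger describes, freeing the queued
tables right after the flush while keeping the per-pCPU lists. This is
illustrative only (the iommu_call() invocation mirrors the common flush
wrapper's shape, which may differ in detail), and the follow-up replies
below explain why flush scope makes this approach subtle:

    int iommu_iotlb_flush(struct domain *d, dfn_t dfn,
                          unsigned long page_count, unsigned int flush_flags)
    {
        const struct domain_iommu *hd = dom_iommu(d);
        struct page_info *pg;
        int rc;

        /* Perform the actual flush as today. */
        rc = iommu_call(hd->platform_ops, iotlb_flush, d, dfn, page_count,
                        flush_flags);

        /*
         * Drain this pCPU's list; only safe if the flush above is known
         * to cover the mappings the queued tables were reachable through.
         */
        while ( (pg = page_list_remove_head(&this_cpu(free_pgt_list))) )
            free_domheap_page(pg);

        return rc;
    }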
On 02.12.2021 17:03, Roger Pau Monné wrote:
> On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
>> For vendor-specific code to support superpages we need to be able to
>> deal with a superpage mapping replacing an intermediate page table (or
>> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
>> needed to free individual page tables while a domain is still alive.
>> Since the freeing needs to be deferred until after a suitable IOTLB
>> flush has been performed, released page tables get queued for processing
>> by a tasklet.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> I was considering whether to use a softirq-tasklet instead. This would
>> have the benefit of avoiding extra scheduling operations, but comes with
>> the risk of the freeing happening prematurely because of a
>> process_pending_softirqs() somewhere.
>
> Another approach that comes to mind (maybe you already thought of it
> and discarded) would be to perform the freeing after the flush in
> iommu_iotlb_flush{_all} while keeping the per-pCPU lists.

This was my initial plan, but I couldn't convince myself that the first
flush to happen would be _the_ one associated with the to-be-freed
page tables. ISTR (vaguely though) actually having found an example to
the contrary.

Jan
On Thu, Dec 02, 2021 at 05:10:38PM +0100, Jan Beulich wrote:
> On 02.12.2021 17:03, Roger Pau Monné wrote:
> > On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
> >> For vendor-specific code to support superpages we need to be able to
> >> deal with a superpage mapping replacing an intermediate page table (or
> >> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> >> needed to free individual page tables while a domain is still alive.
> >> Since the freeing needs to be deferred until after a suitable IOTLB
> >> flush has been performed, released page tables get queued for processing
> >> by a tasklet.
> >>
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> ---
> >> I was considering whether to use a softirq-tasklet instead. This would
> >> have the benefit of avoiding extra scheduling operations, but comes with
> >> the risk of the freeing happening prematurely because of a
> >> process_pending_softirqs() somewhere.
> >
> > Another approach that comes to mind (maybe you already thought of it
> > and discarded) would be to perform the freeing after the flush in
> > iommu_iotlb_flush{_all} while keeping the per-pCPU lists.
>
> This was my initial plan, but I couldn't convince myself that the first
> flush to happen would be _the_ one associated with the to-be-freed
> page tables. ISTR (vaguely though) actually having found an example to
> the contrary.

I see. If we keep the list per pCPU I'm not sure we could have an
IOMMU flush not related to the to-be-freed pages, as we cannot have
interleaved IOMMU operations on the same pCPU.

Also, if we strictly add the pages to the freeing list once unhooked
from the IOMMU page tables it should be safe to flush and then free
them, as there would be no references remaining anymore.

Thanks, Roger.
On 03.12.2021 09:30, Roger Pau Monné wrote:
> On Thu, Dec 02, 2021 at 05:10:38PM +0100, Jan Beulich wrote:
>> On 02.12.2021 17:03, Roger Pau Monné wrote:
>>> On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
>>>> For vendor-specific code to support superpages we need to be able to
>>>> deal with a superpage mapping replacing an intermediate page table (or
>>>> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
>>>> needed to free individual page tables while a domain is still alive.
>>>> Since the freeing needs to be deferred until after a suitable IOTLB
>>>> flush has been performed, released page tables get queued for processing
>>>> by a tasklet.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> I was considering whether to use a softirq-tasklet instead. This would
>>>> have the benefit of avoiding extra scheduling operations, but comes with
>>>> the risk of the freeing happening prematurely because of a
>>>> process_pending_softirqs() somewhere.
>>>
>>> Another approach that comes to mind (maybe you already thought of it
>>> and discarded) would be to perform the freeing after the flush in
>>> iommu_iotlb_flush{_all} while keeping the per-pCPU lists.
>>
>> This was my initial plan, but I couldn't convince myself that the first
>> flush to happen would be _the_ one associated with the to-be-freed
>> page tables. ISTR (vaguely though) actually having found an example to
>> the contrary.
>
> I see. If we keep the list per pCPU I'm not sure we could have an
> IOMMU flush not related to the to-be-freed pages, as we cannot have
> interleaved IOMMU operations on the same pCPU.

"interleaved" is perhaps the wrong word. But can you easily exclude
e.g. a put_page() in the middle of some other operation? That could in
turn invoke one of the legacy map/unmap functions (see
cleanup_page_mappings() for an example), where the flushing happens
immediately after the map/unmap.

> Also, if we strictly add the pages to the freeing list once unhooked
> from the IOMMU page tables it should be safe to flush and then free
> them, as there would be no references remaining anymore.

Only if the flush is a full-address-space one.

Jan
On Fri, Dec 03, 2021 at 09:30:00AM +0100, Roger Pau Monné wrote:
> On Thu, Dec 02, 2021 at 05:10:38PM +0100, Jan Beulich wrote:
> > On 02.12.2021 17:03, Roger Pau Monné wrote:
> > > On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
> > >> For vendor-specific code to support superpages we need to be able to
> > >> deal with a superpage mapping replacing an intermediate page table (or
> > >> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> > >> needed to free individual page tables while a domain is still alive.
> > >> Since the freeing needs to be deferred until after a suitable IOTLB
> > >> flush has been performed, released page tables get queued for processing
> > >> by a tasklet.
> > >>
> > >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> > >> ---
> > >> I was considering whether to use a softirq-tasklet instead. This would
> > >> have the benefit of avoiding extra scheduling operations, but comes with
> > >> the risk of the freeing happening prematurely because of a
> > >> process_pending_softirqs() somewhere.
> > >
> > > Another approach that comes to mind (maybe you already thought of it
> > > and discarded) would be to perform the freeing after the flush in
> > > iommu_iotlb_flush{_all} while keeping the per-pCPU lists.
> >
> > This was my initial plan, but I couldn't convince myself that the first
> > flush to happen would be _the_ one associated with the to-be-freed
> > page tables. ISTR (vaguely though) actually having found an example to
> > the contrary.
>
> I see. If we keep the list per pCPU I'm not sure we could have an
> IOMMU flush not related to the to-be-freed pages, as we cannot have
> interleaved IOMMU operations on the same pCPU.
>
> Also, if we strictly add the pages to the freeing list once unhooked
> from the IOMMU page tables it should be safe to flush and then free
> them, as there would be no references remaining anymore.

Replying to my last paragraph: there are different types of flushes,
and they have different scopes, so just adding the pages to be freed
to a random list and expecting any flush to remove them from the IOMMU
cache is not correct.

I still think the first paragraph is accurate, as we shouldn't have
interleaved IOMMU operations on the same pCPU, so a flush on a pCPU
should always clear the entries that have been freed as a result of
the ongoing operation on that pCPU, and those operations should be
sequential.

Thanks, Roger.
On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
> For vendor-specific code to support superpages we need to be able to
> deal with a superpage mapping replacing an intermediate page table (or
> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
> needed to free individual page tables while a domain is still alive.
> Since the freeing needs to be deferred until after a suitable IOTLB
> flush has been performed, released page tables get queued for processing
> by a tasklet.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> I was considering whether to use a softirq-tasklet instead. This would
> have the benefit of avoiding extra scheduling operations, but comes with
> the risk of the freeing happening prematurely because of a
> process_pending_softirqs() somewhere.

The main one that comes to mind would be the debug keys and their usage
of process_pending_softirqs(), which could interfere with IOMMU unmaps,
so I guess if only for that reason it's best to run in idle vCPU
context.

> --- a/xen/drivers/passthrough/x86/iommu.c
> +++ b/xen/drivers/passthrough/x86/iommu.c
> @@ -12,6 +12,7 @@
>   * this program; If not, see <http://www.gnu.org/licenses/>.
>   */
>
> +#include <xen/cpu.h>
>  #include <xen/sched.h>
>  #include <xen/iommu.h>
>  #include <xen/paging.h>
> @@ -463,6 +464,85 @@ struct page_info *iommu_alloc_pgtable(st
>      return pg;
>  }
>
> +/*
> + * Intermediate page tables which get replaced by large pages may only be
> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
> + * that the necessary IOTLB flush will have occurred by the time tasklets get
> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
> + * requiring any locking.)
> + */
> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
> +
> +static void free_queued_pgtables(void *arg)
> +{
> +    struct page_list_head *list = arg;
> +    struct page_info *pg;
> +
> +    while ( (pg = page_list_remove_head(list)) )
> +        free_domheap_page(pg);

Should you add a preempt check here to yield and schedule the tasklet
again, in order to be able to process any pending work? Maybe just
calling process_pending_softirqs() would be enough?

Thanks, Roger.
On 10.12.2021 14:51, Roger Pau Monné wrote:
> On Fri, Sep 24, 2021 at 11:48:21AM +0200, Jan Beulich wrote:
>> For vendor-specific code to support superpages we need to be able to
>> deal with a superpage mapping replacing an intermediate page table (or
>> hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
>> needed to free individual page tables while a domain is still alive.
>> Since the freeing needs to be deferred until after a suitable IOTLB
>> flush has been performed, released page tables get queued for processing
>> by a tasklet.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> I was considering whether to use a softirq-tasklet instead. This would
>> have the benefit of avoiding extra scheduling operations, but comes with
>> the risk of the freeing happening prematurely because of a
>> process_pending_softirqs() somewhere.
>
> The main one that comes to mind would be the debug keys and their usage
> of process_pending_softirqs(), which could interfere with IOMMU unmaps,
> so I guess if only for that reason it's best to run in idle vCPU
> context.

IOW you support the choice made.

>> --- a/xen/drivers/passthrough/x86/iommu.c
>> +++ b/xen/drivers/passthrough/x86/iommu.c
>> @@ -12,6 +12,7 @@
>>   * this program; If not, see <http://www.gnu.org/licenses/>.
>>   */
>>
>> +#include <xen/cpu.h>
>>  #include <xen/sched.h>
>>  #include <xen/iommu.h>
>>  #include <xen/paging.h>
>> @@ -463,6 +464,85 @@ struct page_info *iommu_alloc_pgtable(st
>>      return pg;
>>  }
>>
>> +/*
>> + * Intermediate page tables which get replaced by large pages may only be
>> + * freed after a suitable IOTLB flush. Hence such pages get queued on a
>> + * per-CPU list, with a per-CPU tasklet processing the list on the assumption
>> + * that the necessary IOTLB flush will have occurred by the time tasklets get
>> + * to run. (List and tasklet being per-CPU has the benefit of accesses not
>> + * requiring any locking.)
>> + */
>> +static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
>> +static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
>> +
>> +static void free_queued_pgtables(void *arg)
>> +{
>> +    struct page_list_head *list = arg;
>> +    struct page_info *pg;
>> +
>> +    while ( (pg = page_list_remove_head(list)) )
>> +        free_domheap_page(pg);
>
> Should you add a preempt check here to yield and schedule the tasklet
> again, in order to be able to process any pending work?

I did ask myself this question, yes, but ...

> Maybe just calling process_pending_softirqs() would be enough?

... I think I didn't consider this as a possible simpler variant
(compared to re-scheduling the tasklet). Let me add such - I agree that
this should be sufficient.

Jan
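
One way the adjustment Jan agrees to here might look (a sketch only; the
granularity of the check and its exact placement in a later revision of
the patch may differ):

    static void free_queued_pgtables(void *arg)
    {
        struct page_list_head *list = arg;
        struct page_info *pg;
        unsigned int done = 0;

        while ( (pg = page_list_remove_head(list)) )
        {
            free_domheap_page(pg);

            /* Granularity of the preemption check is somewhat arbitrary. */
            if ( !(++done & 0x1ff) )
                process_pending_softirqs();
        }
    }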