From: Kan Liang <kan.liang@linux.intel.com>
Only KVM knows the exact time when a guest is entering/exiting. Expose
two interfaces to KVM to switch the ownership of the PMU resources.

All the pinned events must be scheduled in first. Extend the
perf_event_sched_in() helper to support an extra flag, e.g., EVENT_GUEST.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
---
 include/linux/perf_event.h |  4 ++
 kernel/events/core.c       | 80 ++++++++++++++++++++++++++++++++++----
 2 files changed, 77 insertions(+), 7 deletions(-)
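
A rough usage sketch (not part of this patch; the wrapper function below is
hypothetical, only perf_guest_enter() and perf_guest_exit() come from this
series): KVM is expected to bracket the guest run with the two calls while
interrupts are disabled, for example:

static void example_vcpu_run(void)
{
	local_irq_disable();

	/* Hand the PMU to the guest: schedule out all exclude_guest host events. */
	perf_guest_enter();

	/* ... load guest PMU state and enter the guest here ... */

	/* Hand the PMU back to the host: schedule host events back in. */
	perf_guest_exit();

	local_irq_enable();
}
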
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7bda1e20be12..37187ee8e226 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1822,6 +1822,8 @@ extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
 int perf_get_mediated_pmu(void);
 void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1919,6 +1921,8 @@ static inline int perf_get_mediated_pmu(void)
 }
 
 static inline void perf_put_mediated_pmu(void) { }
+static inline void perf_guest_enter(void) { }
+static inline void perf_guest_exit(void) { }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7a2115b2c5c1..d05487d465c9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2827,14 +2827,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 				struct perf_event_context *ctx,
-				struct pmu *pmu)
+				struct pmu *pmu,
+				enum event_type_t event_type)
 {
-	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type);
 	if (ctx)
-		ctx_sched_in(ctx, pmu, EVENT_PINNED);
-	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+		ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type);
+	ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type);
 	if (ctx)
-		ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
+		ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type);
 }
 
 /*
@@ -2890,7 +2891,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 	else if (event_type & EVENT_PINNED)
 		ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
 
-	perf_event_sched_in(cpuctx, task_ctx, pmu);
+	perf_event_sched_in(cpuctx, task_ctx, pmu, 0);
 
 	for_each_epc(epc, &cpuctx->ctx, pmu, 0)
 		perf_pmu_enable(epc->pmu);
@@ -4188,7 +4189,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
 		ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
 	}
 
-	perf_event_sched_in(cpuctx, ctx, NULL);
+	perf_event_sched_in(cpuctx, ctx, NULL, 0);
 
 	perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
 }
 EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
 
+static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
+{
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+}
+
+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
+		goto unlock;
+
+	perf_host_exit(cpuctx);
+
+	__this_cpu_write(perf_in_guest, true);
+
+unlock:
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_enter);
+
+static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
+{
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx)
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+
+	perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
+
+	if (cpuctx->task_ctx)
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+}
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
+		goto unlock;
+
+	perf_host_enter(cpuctx);
+
+	__this_cpu_write(perf_in_guest, false);
+unlock:
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+EXPORT_SYMBOL_GPL(perf_guest_exit);
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block
--
2.49.0.395.g12beb8f557-goog
On Mon, Mar 24, 2025 at 05:30:45PM +0000, Mingwei Zhang wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
>
> Only KVM knows the exact time when a guest is entering/exiting. Expose
> two interfaces to KVM to switch the ownership of the PMU resources.
>
> All the pinned events must be scheduled in first. Extend the
> perf_event_sched_in() helper to support extra flag, e.g., EVENT_GUEST.
>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> Signed-off-by: Mingwei Zhang <mizhang@google.com>
> ---
> include/linux/perf_event.h | 4 ++
> kernel/events/core.c | 80 ++++++++++++++++++++++++++++++++++----
> 2 files changed, 77 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 7bda1e20be12..37187ee8e226 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1822,6 +1822,8 @@ extern int perf_event_period(struct perf_event *event, u64 value);
> extern u64 perf_event_pause(struct perf_event *event, bool reset);
> int perf_get_mediated_pmu(void);
> void perf_put_mediated_pmu(void);
> +void perf_guest_enter(void);
> +void perf_guest_exit(void);
> #else /* !CONFIG_PERF_EVENTS: */
> static inline void *
> perf_aux_output_begin(struct perf_output_handle *handle,
> @@ -1919,6 +1921,8 @@ static inline int perf_get_mediated_pmu(void)
> }
>
> static inline void perf_put_mediated_pmu(void) { }
> +static inline void perf_guest_enter(void) { }
> +static inline void perf_guest_exit(void) { }
> #endif
>
> #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 7a2115b2c5c1..d05487d465c9 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -2827,14 +2827,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
>
> static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
> struct perf_event_context *ctx,
> - struct pmu *pmu)
> + struct pmu *pmu,
> + enum event_type_t event_type)
> {
> - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
> + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type);
> if (ctx)
> - ctx_sched_in(ctx, pmu, EVENT_PINNED);
> - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
> + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type);
> + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type);
> if (ctx)
> - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
> + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type);
> }
>
> /*
> @@ -2890,7 +2891,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
> else if (event_type & EVENT_PINNED)
> ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
>
> - perf_event_sched_in(cpuctx, task_ctx, pmu);
> + perf_event_sched_in(cpuctx, task_ctx, pmu, 0);
>
> for_each_epc(epc, &cpuctx->ctx, pmu, 0)
> perf_pmu_enable(epc->pmu);
> @@ -4188,7 +4189,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
> ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
> }
>
> - perf_event_sched_in(cpuctx, ctx, NULL);
> + perf_event_sched_in(cpuctx, ctx, NULL, 0);
>
> perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
>
> @@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
> }
> EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>
> +static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
> +{
> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> + if (cpuctx->task_ctx) {
> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> + }
> +}
Cpu context and task context may have events in the same PMU.
How about this?
	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
	if (cpuctx->task_ctx)
		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);

	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
	if (cpuctx->task_ctx)
		task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);

	if (cpuctx->task_ctx)
		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
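
To make the overlap concrete, a hedged userspace illustration (ordinary
perf_event_open() usage, not taken from the patch) of how the cpu context and
the task context can both hold events of the same core PMU:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int open_cycles(pid_t pid, int cpu)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.size = sizeof(attr),
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.exclude_guest = 1,
	};

	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
}

int main(void)
{
	/* pid == -1, cpu == 0: CPU-wide event, lands in the cpu context */
	int cpu_fd = open_cycles(-1, 0);

	/* pid == 0, cpu == -1: per-task event, lands in the task context */
	int task_fd = open_cycles(0, -1);

	return (cpu_fd < 0 || task_fd < 0);
}

Both events end up on the same hardware PMU, which is why both contexts need
to be disabled before either of them is scheduled out.
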
Thanks,
Namhyung
> +
> +/* When entering a guest, schedule out all exclude_guest events. */
> +void perf_guest_enter(void)
> +{
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> + lockdep_assert_irqs_disabled();
> +
> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
> + goto unlock;
> +
> + perf_host_exit(cpuctx);
> +
> + __this_cpu_write(perf_in_guest, true);
> +
> +unlock:
> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_enter);
> +
> +static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
> +{
> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> + if (cpuctx->task_ctx)
> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> +
> + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
> +
> + if (cpuctx->task_ctx)
> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> +}
> +
> +void perf_guest_exit(void)
> +{
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> + lockdep_assert_irqs_disabled();
> +
> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
> + goto unlock;
> +
> + perf_host_enter(cpuctx);
> +
> + __this_cpu_write(perf_in_guest, false);
> +unlock:
> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_exit);
> +
> /*
> * Holding the top-level event's child_mutex means that any
> * descendant process that has inherited this event will block
> --
> 2.49.0.395.g12beb8f557-goog
>
On 2025-05-21 3:55 p.m., Namhyung Kim wrote:
> On Mon, Mar 24, 2025 at 05:30:45PM +0000, Mingwei Zhang wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> Only KVM knows the exact time when a guest is entering/exiting. Expose
>> two interfaces to KVM to switch the ownership of the PMU resources.
>>
>> All the pinned events must be scheduled in first. Extend the
>> perf_event_sched_in() helper to support extra flag, e.g., EVENT_GUEST.
>>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> Signed-off-by: Mingwei Zhang <mizhang@google.com>
>> ---
>> include/linux/perf_event.h | 4 ++
>> kernel/events/core.c | 80 ++++++++++++++++++++++++++++++++++----
>> 2 files changed, 77 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 7bda1e20be12..37187ee8e226 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -1822,6 +1822,8 @@ extern int perf_event_period(struct perf_event *event, u64 value);
>> extern u64 perf_event_pause(struct perf_event *event, bool reset);
>> int perf_get_mediated_pmu(void);
>> void perf_put_mediated_pmu(void);
>> +void perf_guest_enter(void);
>> +void perf_guest_exit(void);
>> #else /* !CONFIG_PERF_EVENTS: */
>> static inline void *
>> perf_aux_output_begin(struct perf_output_handle *handle,
>> @@ -1919,6 +1921,8 @@ static inline int perf_get_mediated_pmu(void)
>> }
>>
>> static inline void perf_put_mediated_pmu(void) { }
>> +static inline void perf_guest_enter(void) { }
>> +static inline void perf_guest_exit(void) { }
>> #endif
>>
>> #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 7a2115b2c5c1..d05487d465c9 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -2827,14 +2827,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
>>
>> static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
>> struct perf_event_context *ctx,
>> - struct pmu *pmu)
>> + struct pmu *pmu,
>> + enum event_type_t event_type)
>> {
>> - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
>> + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type);
>> if (ctx)
>> - ctx_sched_in(ctx, pmu, EVENT_PINNED);
>> - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
>> + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type);
>> + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type);
>> if (ctx)
>> - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
>> + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type);
>> }
>>
>> /*
>> @@ -2890,7 +2891,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>> else if (event_type & EVENT_PINNED)
>> ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
>>
>> - perf_event_sched_in(cpuctx, task_ctx, pmu);
>> + perf_event_sched_in(cpuctx, task_ctx, pmu, 0);
>>
>> for_each_epc(epc, &cpuctx->ctx, pmu, 0)
>> perf_pmu_enable(epc->pmu);
>> @@ -4188,7 +4189,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
>> ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
>> }
>>
>> - perf_event_sched_in(cpuctx, ctx, NULL);
>> + perf_event_sched_in(cpuctx, ctx, NULL, 0);
>>
>> perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
>>
>> @@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
>> }
>> EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>>
>> +static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
>> +{
>> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
>> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>> + if (cpuctx->task_ctx) {
>> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>> + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
>> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>> + }
>> +}
>
> Cpu context and task context may have events in the same PMU.
> How about this?
>
> perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> if (cpuctx->task_ctx)
> perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>
> ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> if (cpuctx->task_ctx)
> task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
>
> if (cpuctx->task_ctx)
> perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>
Yes, this is what has to be fixed in V4.

Whether loading or putting the guest context, I think the below steps have
to be followed:
- Disable both cpuctx->ctx and cpuctx->task_ctx
- Schedule in/out the host counters and load/put the guest context
- Enable both cpuctx->ctx and cpuctx->task_ctx

A similar proposal can be found at
https://lore.kernel.org/lkml/4aaf67ab-aa5c-41e6-bced-3cb000172c52@linux.intel.com/
Thanks,
Kan
On Mon, Mar 24, 2025 at 05:30:45PM +0000, Mingwei Zhang wrote:
> @@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
> }
> EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>
> +static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
> +{
> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> + if (cpuctx->task_ctx) {
> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> + }
> +}
> +
> +/* When entering a guest, schedule out all exclude_guest events. */
> +void perf_guest_enter(void)
> +{
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> + lockdep_assert_irqs_disabled();
> +
> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
> + goto unlock;
> +
> + perf_host_exit(cpuctx);
> +
> + __this_cpu_write(perf_in_guest, true);
> +
> +unlock:
> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_enter);
> +
> +static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
> +{
> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> + if (cpuctx->task_ctx)
> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> +
> + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
> +
> + if (cpuctx->task_ctx)
> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> +}
> +
> +void perf_guest_exit(void)
> +{
> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> + lockdep_assert_irqs_disabled();
> +
> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> + if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
> + goto unlock;
> +
> + perf_host_enter(cpuctx);
> +
> + __this_cpu_write(perf_in_guest, false);
> +unlock:
> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_exit);
This naming is confusing on purpose? Pick either guest/host and stick
with it.
On Fri, Apr 25, 2025, Peter Zijlstra wrote:
> On Mon, Mar 24, 2025 at 05:30:45PM +0000, Mingwei Zhang wrote:
>
> > @@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
> > }
> > EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
> >
> > +static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
> > +{
> > + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> > + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> > + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> > + if (cpuctx->task_ctx) {
> > + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> > + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
> > + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> > + }
> > +}
> > +
> > +/* When entering a guest, schedule out all exclude_guest events. */
> > +void perf_guest_enter(void)
> > +{
> > + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> > +
> > + lockdep_assert_irqs_disabled();
> > +
> > + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> > +
> > + if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
> > + goto unlock;
> > +
> > + perf_host_exit(cpuctx);
> > +
> > + __this_cpu_write(perf_in_guest, true);
> > +
> > +unlock:
> > + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> > +}
> > +EXPORT_SYMBOL_GPL(perf_guest_enter);
> > +
> > +static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
> > +{
> > + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> > + if (cpuctx->task_ctx)
> > + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> > +
> > + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
> > +
> > + if (cpuctx->task_ctx)
> > + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> > + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> > +}
> > +
> > +void perf_guest_exit(void)
> > +{
> > + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> > +
> > + lockdep_assert_irqs_disabled();
> > +
> > + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> > +
> > + if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
> > + goto unlock;
> > +
> > + perf_host_enter(cpuctx);
> > +
> > + __this_cpu_write(perf_in_guest, false);
> > +unlock:
> > + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> > +}
> > +EXPORT_SYMBOL_GPL(perf_guest_exit);
>
> This naming is confusing on purpose? Pick either guest/host and stick
> with it.
+1. I also think the inner perf_host_{enter,exit}() helpers are superfluous.
These flows
After a bit of hacking, and with a few spoilers, this is what I ended up with
(not anywhere near fully tested). I like following KVM's kvm_xxx_{load,put}()
nomenclature to tie everything together, so I went with "guest" instead of "host"
even though the majority of the work being done is to schedule out/in host context.
/* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
void perf_load_guest_context(unsigned long data)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);

	if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
		goto unlock;

	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
	if (cpuctx->task_ctx) {
		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
		task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
	}

	arch_perf_load_guest_context(data);

	__this_cpu_write(guest_ctx_loaded, true);

unlock:
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
EXPORT_SYMBOL_GPL(perf_load_guest_context);

void perf_put_guest_context(void)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);

	if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
		goto unlock;

	arch_perf_put_guest_context();

	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
	if (cpuctx->task_ctx)
		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);

	perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);

	if (cpuctx->task_ctx)
		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);

	__this_cpu_write(guest_ctx_loaded, false);
unlock:
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
EXPORT_SYMBOL_GPL(perf_put_guest_context);
On 2025-05-14 7:19 p.m., Sean Christopherson wrote:
> On Fri, Apr 25, 2025, Peter Zijlstra wrote:
>> On Mon, Mar 24, 2025 at 05:30:45PM +0000, Mingwei Zhang wrote:
>>
>>> @@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
>>> }
>>> EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>>>
>>> +static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
>>> +{
>>> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
>>> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>> + if (cpuctx->task_ctx) {
>>> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>> + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
>>> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>> + }
>>> +}
>>> +
>>> +/* When entering a guest, schedule out all exclude_guest events. */
>>> +void perf_guest_enter(void)
>>> +{
>>> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>> +
>>> + lockdep_assert_irqs_disabled();
>>> +
>>> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>> +
>>> + if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
>>> + goto unlock;
>>> +
>>> + perf_host_exit(cpuctx);
>>> +
>>> + __this_cpu_write(perf_in_guest, true);
>>> +
>>> +unlock:
>>> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>> +}
>>> +EXPORT_SYMBOL_GPL(perf_guest_enter);
>>> +
>>> +static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
>>> +{
>>> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>> + if (cpuctx->task_ctx)
>>> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>> +
>>> + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
>>> +
>>> + if (cpuctx->task_ctx)
>>> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>> +}
>>> +
>>> +void perf_guest_exit(void)
>>> +{
>>> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>> +
>>> + lockdep_assert_irqs_disabled();
>>> +
>>> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>> +
>>> + if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
>>> + goto unlock;
>>> +
>>> + perf_host_enter(cpuctx);
>>> +
>>> + __this_cpu_write(perf_in_guest, false);
>>> +unlock:
>>> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>> +}
>>> +EXPORT_SYMBOL_GPL(perf_guest_exit);
>>
>> This naming is confusing on purpose? Pick either guest/host and stick
>> with it.
>
> +1. I also think the inner perf_host_{enter,exit}() helpers are superfluous.
> These flows
>
> After a bit of hacking, and with a few spoilers, this is what I ended up with
> (not anywhere near fully tested). I like following KVM's kvm_xxx_{load,put}()
> nomenclature to tie everything together, so I went with "guest" instead of "host"
> even though the majority of the work being done is to schedule out/in host context.
>
> /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
> void perf_load_guest_context(unsigned long data)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>
> lockdep_assert_irqs_disabled();
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>
> if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
> goto unlock;
>
> perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> if (cpuctx->task_ctx) {
> perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
> perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> }
>
> arch_perf_load_guest_context(data);
>
> __this_cpu_write(guest_ctx_loaded, true);
>
> unlock:
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
> EXPORT_SYMBOL_GPL(perf_load_guest_context);
>
> void perf_put_guest_context(void)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>
> lockdep_assert_irqs_disabled();
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>
> if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
> goto unlock;
>
> arch_perf_put_guest_context();
It will set the guest_ctx_loaded to false.
The update_context_time() invoked in the perf_event_sched_in() will not
get a chance to update the guest time.
I think something like the below should work.
- Disable all in the PMU (disable global control)
- Schedule in the host counters (not running yet, since the global
  control of the PMU is disabled)
- arch_perf_put_guest_context()
- Enable all in the PMU (enable global control; the host counters now start)
void perf_put_guest_context(void)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);

	if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
		goto unlock;

	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
	if (cpuctx->task_ctx)
		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);

	perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);

	arch_perf_put_guest_context();

	if (cpuctx->task_ctx)
		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);

unlock:
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
Similar to the perf_load_guest_context().
- Disable all in the PMU (disable global control)
- schedule out all the host counters
- arch_perf_load_guest_context()
- Enable all in the PMU (enable global control)
void perf_load_guest_context(unsigned long data)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

	lockdep_assert_irqs_disabled();

	perf_ctx_lock(cpuctx, cpuctx->task_ctx);

	if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
		goto unlock;

	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
	if (cpuctx->task_ctx) {
		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
		task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
	}

	arch_perf_load_guest_context(data);

	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
	if (cpuctx->task_ctx)
		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);

unlock:
	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
Thanks,
Kan
>
> perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> if (cpuctx->task_ctx)
> perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>
> perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
>
> if (cpuctx->task_ctx)
> perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>
> __this_cpu_write(guest_ctx_loaded, false);
> unlock:
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
> EXPORT_SYMBOL_GPL(perf_put_guest_context);
On Thu, May 15, 2025, Kan Liang wrote:
> On 2025-05-14 7:19 p.m., Sean Christopherson wrote:
> >> This naming is confusing on purpose? Pick either guest/host and stick
> >> with it.
> >
> > +1. I also think the inner perf_host_{enter,exit}() helpers are superfluous.
> > These flows
> >
> > After a bit of hacking, and with a few spoilers, this is what I ended up with
> > (not anywhere near fully tested). I like following KVM's kvm_xxx_{load,put}()
> > nomenclature to tie everything together, so I went with "guest" instead of "host"
> > even though the majority of the work being done is to schedule out/in host context.
> >
> > /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
> > void perf_load_guest_context(unsigned long data)
> > {
> > struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> >
> > lockdep_assert_irqs_disabled();
> >
> > perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> >
> > if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
> > goto unlock;
> >
> > perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> > ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> > perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> > if (cpuctx->task_ctx) {
> > perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> > task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
> > perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> > }
> >
> > arch_perf_load_guest_context(data);
> >
> > __this_cpu_write(guest_ctx_loaded, true);
> >
> > unlock:
> > perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> > }
> > EXPORT_SYMBOL_GPL(perf_load_guest_context);
> >
> > void perf_put_guest_context(void)
> > {
> > struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> >
> > lockdep_assert_irqs_disabled();
> >
> > perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> >
> > if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
> > goto unlock;
> >
> > arch_perf_put_guest_context();
>
> It will set the guest_ctx_loaded to false.
> The update_context_time() invoked in the perf_event_sched_in() will not
> get a chance to update the guest time.
The guest_ctx_loaded in arch/x86/events/core.c is a different variable; it just
happens to have the same name.
It's completely gross, but exposing guest_ctx_loaded outside of kernel/events/core.c
didn't seem like a great alternative. If we wanted to use a single variable,
then the writes in arch_perf_{load,put}_guest_context() can simply go away.
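
To illustrate the point (a hedged sketch; these exact declarations are an
assumption based on this discussion, not a quote of the patches), the idea is
two file-local per-CPU flags that merely share a name:

/* kernel/events/core.c */
static DEFINE_PER_CPU(bool, guest_ctx_loaded);

/* arch/x86/events/core.c */
static DEFINE_PER_CPU(bool, guest_ctx_loaded);

Since both are static, each translation unit sees only its own flag, which is
also why the writes in arch_perf_{load,put}_guest_context() could simply go
away if a single shared variable were used instead.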
On 2025-05-15 3:25 p.m., Sean Christopherson wrote:
> On Thu, May 15, 2025, Kan Liang wrote:
>> On 2025-05-14 7:19 p.m., Sean Christopherson wrote:
>>>> This naming is confusing on purpose? Pick either guest/host and stick
>>>> with it.
>>>
>>> +1. I also think the inner perf_host_{enter,exit}() helpers are superfluous.
>>> These flows
>>>
>>> After a bit of hacking, and with a few spoilers, this is what I ended up with
>>> (not anywhere near fully tested). I like following KVM's kvm_xxx_{load,put}()
>>> nomenclature to tie everything together, so I went with "guest" instead of "host"
>>> even though the majority of the work being done is to schedule out/in host context.
>>>
>>> /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
>>> void perf_load_guest_context(unsigned long data)
>>> {
>>> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>>
>>> lockdep_assert_irqs_disabled();
>>>
>>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>>
>>> if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
>>> goto unlock;
>>>
>>> perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>> ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
>>> perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>> if (cpuctx->task_ctx) {
>>> perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>> task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
>>> perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>> }
>>>
>>> arch_perf_load_guest_context(data);
>>>
>>> __this_cpu_write(guest_ctx_loaded, true);
>>>
>>> unlock:
>>> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>> }
>>> EXPORT_SYMBOL_GPL(perf_load_guest_context);
>>>
>>> void perf_put_guest_context(void)
>>> {
>>> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>>
>>> lockdep_assert_irqs_disabled();
>>>
>>> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>>
>>> if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
>>> goto unlock;
>>>
>>> arch_perf_put_guest_context();
>>
>> It will set the guest_ctx_loaded to false.
>> The update_context_time() invoked in the perf_event_sched_in() will not
>> get a chance to update the guest time.
>
> The guest_ctx_loaded in arch/x86/events/core.c is a different variable; it just
> happens to have the same name.
>
Ah, I thought it was the same variable. Then there should be no issue
with the guest time.
But the same variable name may cause confusion. Maybe add an x86_pmu/x86
prefix to the variable in the x86 code.
> It's completely gross, but exposing guest_ctx_loaded outside of kernel/events/core.c
> didn't seem like a great alternative. If we wanted to use a single variable,
> then the writes in arch_perf_{load,put}_guest_context() can simply go away.
>
Either two variables or a single variable is fine with me, as long as
they can be easily distinguished.
But I think we should put arch_perf_{load,put}_guest_context() and
guest_ctx_loaded between the perf_ctx_disable/enable() pair.
Perf should only update the state when the PMU is completely disabled.
That matches the way the rest of the perf code works, and it also avoids
potential issues, e.g., the state not yet being completely updated when a
counter has already fired.
Thanks,
Kan
On 5/15/2025 7:19 AM, Sean Christopherson wrote:
> On Fri, Apr 25, 2025, Peter Zijlstra wrote:
>> On Mon, Mar 24, 2025 at 05:30:45PM +0000, Mingwei Zhang wrote:
>>
>>> @@ -6040,6 +6041,71 @@ void perf_put_mediated_pmu(void)
>>> }
>>> EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>>>
>>> +static inline void perf_host_exit(struct perf_cpu_context *cpuctx)
>>> +{
>>> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>> + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
>>> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>> + if (cpuctx->task_ctx) {
>>> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>> + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
>>> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>> + }
>>> +}
>>> +
>>> +/* When entering a guest, schedule out all exclude_guest events. */
>>> +void perf_guest_enter(void)
>>> +{
>>> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>> +
>>> + lockdep_assert_irqs_disabled();
>>> +
>>> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>> +
>>> + if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
>>> + goto unlock;
>>> +
>>> + perf_host_exit(cpuctx);
>>> +
>>> + __this_cpu_write(perf_in_guest, true);
>>> +
>>> +unlock:
>>> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>> +}
>>> +EXPORT_SYMBOL_GPL(perf_guest_enter);
>>> +
>>> +static inline void perf_host_enter(struct perf_cpu_context *cpuctx)
>>> +{
>>> + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>> + if (cpuctx->task_ctx)
>>> + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>> +
>>> + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
>>> +
>>> + if (cpuctx->task_ctx)
>>> + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>> + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>> +}
>>> +
>>> +void perf_guest_exit(void)
>>> +{
>>> + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>>> +
>>> + lockdep_assert_irqs_disabled();
>>> +
>>> + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>> +
>>> + if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
>>> + goto unlock;
>>> +
>>> + perf_host_enter(cpuctx);
>>> +
>>> + __this_cpu_write(perf_in_guest, false);
>>> +unlock:
>>> + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>> +}
>>> +EXPORT_SYMBOL_GPL(perf_guest_exit);
>> This naming is confusing on purpose? Pick either guest/host and stick
>> with it.
> +1. I also think the inner perf_host_{enter,exit}() helpers are superfluous.
> These flows
>
> After a bit of hacking, and with a few spoilers, this is what I ended up with
> (not anywhere near fully tested). I like following KVM's kvm_xxx_{load,put}()
> nomenclature to tie everything together, so I went with "guest" instead of "host"
> even though the majority of the work being done is to schedule out/in host context.
>
> /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */
> void perf_load_guest_context(unsigned long data)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>
> lockdep_assert_irqs_disabled();
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>
> if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded)))
> goto unlock;
>
> perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST);
> perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
> if (cpuctx->task_ctx) {
> perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
> task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST);
> perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> }
>
> arch_perf_load_guest_context(data);
>
> __this_cpu_write(guest_ctx_loaded, true);
>
> unlock:
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
> EXPORT_SYMBOL_GPL(perf_load_guest_context);
>
> void perf_put_guest_context(void)
> {
> struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>
> lockdep_assert_irqs_disabled();
>
> perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>
> if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded)))
> goto unlock;
>
> arch_perf_put_guest_context();
>
> perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
> if (cpuctx->task_ctx)
> perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>
> perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST);
>
> if (cpuctx->task_ctx)
> perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
> perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>
> __this_cpu_write(guest_ctx_loaded, false);
> unlock:
> perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> }
> EXPORT_SYMBOL_GPL(perf_put_guest_context);
I'm fine with the name. Thanks.
>