RE: [PATCH] rcu: Fix race in set and clear TICK_DEP_BIT_RCU_EXP bitmask

Posted by Zhang, Qiang1 2 years, 8 months ago

> For the kernel bulit with CONFIG_NO_HZ_FULL enabled and the following 
> cpus is nohz_full cpus:
> 
> 	CPU1                                                 CPU2
> rcu_report_exp_cpu_mult                          synchronize_rcu_expedited_wait
>    acquires rnp->lock                               mask = rnp->expmask;
>                                                     for_each_leaf_node_cpu_mask(rnp, cpu, mask)
>    rnp->expmask = rnp->expmask & ~mask;                rdp = per_cpu_ptr(&rcu_data, cpu1);
>    for_each_leaf_node_cpu_mask(rnp, cpu, mask)
>       rdp = per_cpu_ptr(&rcu_data, cpu1);
>       if (!rdp->rcu_forced_tick_exp)
>              continue;                                 rdp->rcu_forced_tick_exp = true;
>                                                        
> tick_dep_set_cpu(cpu1, TICK_DEP_BIT_RCU_EXP);
> 
> In the above scenario, after CPU1 reported the quiescent state, CPU1 
> misses the opportunity to clear the TICK_DEP_BIT_RCU_EXP bitmask, it 
> will not be cleared until the next expedited grace period starts and 
> the CPU1 quiescent state is reported again. during this window period, 
> the CPU1 whose tick can not be stopped, if CPU1 has only one runnable 
> task and this task has aggressive real-time response constraints, this 
> task may have one of the worst response times.
> 
> Therefore, this commit add rnp->lock when set TICK_DEP_BIT_RCU_EXP 
> bitmask to fix this race.
> 
> Signed-off-by: Zqiang <qiang1.zhang@intel.com>
>
>Good eyes, thank you!!!
>
>Queued for testing and further review as follows, as always, please check for errors.
>

It looks more clear now, thank you!

Thanks
Zqiang

>							Thanx, Paul
>
>------------------------------------------------------------------------

commit acfe689f2e473fb59b6d2c95af5fe36198bb9a84
Author: Zqiang <qiang1.zhang@intel.com>
Date:   Tue Dec 20 19:25:20 2022 +0800

    rcu: Fix set/clear TICK_DEP_BIT_RCU_EXP bitmask race
    
    For kernels built with CONFIG_NO_HZ_FULL=y, the following scenario can result
    in the scheduling-clock interrupt remaining enabled on a holdout CPU after
    its quiescent state has been reported:
    
            CPU1                                                 CPU2
    rcu_report_exp_cpu_mult                          synchronize_rcu_expedited_wait
       acquires rnp->lock                               mask = rnp->expmask;
                                                        for_each_leaf_node_cpu_mask(rnp, cpu, mask)
       rnp->expmask = rnp->expmask & ~mask;                rdp = per_cpu_ptr(&rcu_data, cpu1);
       for_each_leaf_node_cpu_mask(rnp, cpu, mask)
          rdp = per_cpu_ptr(&rcu_data, cpu1);
          if (!rdp->rcu_forced_tick_exp)
                 continue;                                 rdp->rcu_forced_tick_exp = true;
                                                           tick_dep_set_cpu(cpu1, TICK_DEP_BIT_RCU_EXP);
    
    The problem is that CPU2's sampling of rnp->expmask is obsolete by the
    time it invokes tick_dep_set_cpu(), and CPU1 is not guaranteed to see
    CPU2's store to ->rcu_forced_tick_exp in time to clear it.  And even if
    CPU1 does see that store, it might invoke tick_dep_clear_cpu() before
    CPU2 got around to executing its tick_dep_set_cpu(), which would still
    leave the victim CPU with its scheduler-clock tick running.
    
    Either way, an nohz_full real-time application running on the victim
    CPU would have its latency needlessly degraded.
    
    Note that expedited RCU grace periods look at context-tracking
    information, and so if the CPU is executing in nohz_full usermode
    throughout, that CPU cannot be victimized in this manner.
    
    This commit therefore causes synchronize_rcu_expedited_wait to hold
    the rcu_node structure's ->lock when checking for holdout CPUs, setting
    TICK_DEP_BIT_RCU_EXP, and invoking tick_dep_set_cpu(), thus preventing
    this race.
    
    Signed-off-by: Zqiang <qiang1.zhang@intel.com>
    Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 249c2967d9e6c..7cc4856da0817 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	struct rcu_node *rnp_root = rcu_get_root();
+	unsigned long flags;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
 	jiffies_stall = rcu_exp_jiffies_till_stall_check();
@@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void)
 		if (synchronize_rcu_expedited_wait_once(1))
 			return;
 		rcu_for_each_leaf_node(rnp) {
+			raw_spin_lock_irqsave_rcu_node(rnp, flags);
 			mask = READ_ONCE(rnp->expmask);
 			for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
 				rdp = per_cpu_ptr(&rcu_data, cpu);
 				if (rdp->rcu_forced_tick_exp)
 					continue;
 				rdp->rcu_forced_tick_exp = true;
-				preempt_disable();
 				if (cpu_online(cpu))
 					tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
-				preempt_enable();
 			}
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		}
 		j = READ_ONCE(jiffies_till_first_fqs);
 		if (synchronize_rcu_expedited_wait_once(j + HZ))

Re: [PATCH] rcu: Fix race in set and clear TICK_DEP_BIT_RCU_EXP bitmask

Posted by Paul E. McKenney 2 years, 8 months ago

On Thu, Dec 22, 2022 at 09:48:14AM +0000, Zhang, Qiang1 wrote:
> > For the kernel bulit with CONFIG_NO_HZ_FULL enabled and the following 
> > cpus is nohz_full cpus:
> > 
> > 	CPU1                                                 CPU2
> > rcu_report_exp_cpu_mult                          synchronize_rcu_expedited_wait
> >    acquires rnp->lock                               mask = rnp->expmask;
> >                                                     for_each_leaf_node_cpu_mask(rnp, cpu, mask)
> >    rnp->expmask = rnp->expmask & ~mask;                rdp = per_cpu_ptr(&rcu_data, cpu1);
> >    for_each_leaf_node_cpu_mask(rnp, cpu, mask)
> >       rdp = per_cpu_ptr(&rcu_data, cpu1);
> >       if (!rdp->rcu_forced_tick_exp)
> >              continue;                                 rdp->rcu_forced_tick_exp = true;
> >                                                        
> > tick_dep_set_cpu(cpu1, TICK_DEP_BIT_RCU_EXP);
> > 
> > In the above scenario, after CPU1 reported the quiescent state, CPU1 
> > misses the opportunity to clear the TICK_DEP_BIT_RCU_EXP bitmask, it 
> > will not be cleared until the next expedited grace period starts and 
> > the CPU1 quiescent state is reported again. during this window period, 
> > the CPU1 whose tick can not be stopped, if CPU1 has only one runnable 
> > task and this task has aggressive real-time response constraints, this 
> > task may have one of the worst response times.
> > 
> > Therefore, this commit add rnp->lock when set TICK_DEP_BIT_RCU_EXP 
> > bitmask to fix this race.
> > 
> > Signed-off-by: Zqiang <qiang1.zhang@intel.com>
> >
> >Good eyes, thank you!!!
> >
> >Queued for testing and further review as follows, as always, please check for errors.
> >
> 
> It looks more clear now, thank you!

Thank you for checking them both!

							Thanx, Paul

> Thanks
> Zqiang
> 
> >							Thanx, Paul
> >
> >------------------------------------------------------------------------
> 
> commit acfe689f2e473fb59b6d2c95af5fe36198bb9a84
> Author: Zqiang <qiang1.zhang@intel.com>
> Date:   Tue Dec 20 19:25:20 2022 +0800
> 
>     rcu: Fix set/clear TICK_DEP_BIT_RCU_EXP bitmask race
>     
>     For kernels built with CONFIG_NO_HZ_FULL=y, the following scenario can result
>     in the scheduling-clock interrupt remaining enabled on a holdout CPU after
>     its quiescent state has been reported:
>     
>             CPU1                                                 CPU2
>     rcu_report_exp_cpu_mult                          synchronize_rcu_expedited_wait
>        acquires rnp->lock                               mask = rnp->expmask;
>                                                         for_each_leaf_node_cpu_mask(rnp, cpu, mask)
>        rnp->expmask = rnp->expmask & ~mask;                rdp = per_cpu_ptr(&rcu_data, cpu1);
>        for_each_leaf_node_cpu_mask(rnp, cpu, mask)
>           rdp = per_cpu_ptr(&rcu_data, cpu1);
>           if (!rdp->rcu_forced_tick_exp)
>                  continue;                                 rdp->rcu_forced_tick_exp = true;
>                                                            tick_dep_set_cpu(cpu1, TICK_DEP_BIT_RCU_EXP);
>     
>     The problem is that CPU2's sampling of rnp->expmask is obsolete by the
>     time it invokes tick_dep_set_cpu(), and CPU1 is not guaranteed to see
>     CPU2's store to ->rcu_forced_tick_exp in time to clear it.  And even if
>     CPU1 does see that store, it might invoke tick_dep_clear_cpu() before
>     CPU2 got around to executing its tick_dep_set_cpu(), which would still
>     leave the victim CPU with its scheduler-clock tick running.
>     
>     Either way, an nohz_full real-time application running on the victim
>     CPU would have its latency needlessly degraded.
>     
>     Note that expedited RCU grace periods look at context-tracking
>     information, and so if the CPU is executing in nohz_full usermode
>     throughout, that CPU cannot be victimized in this manner.
>     
>     This commit therefore causes synchronize_rcu_expedited_wait to hold
>     the rcu_node structure's ->lock when checking for holdout CPUs, setting
>     TICK_DEP_BIT_RCU_EXP, and invoking tick_dep_set_cpu(), thus preventing
>     this race.
>     
>     Signed-off-by: Zqiang <qiang1.zhang@intel.com>
>     Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
> 
> diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 249c2967d9e6c..7cc4856da0817 100644
> --- a/kernel/rcu/tree_exp.h
> +++ b/kernel/rcu/tree_exp.h
> @@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void)
>  	struct rcu_data *rdp;
>  	struct rcu_node *rnp;
>  	struct rcu_node *rnp_root = rcu_get_root();
> +	unsigned long flags;
>  
>  	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
>  	jiffies_stall = rcu_exp_jiffies_till_stall_check();
> @@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void)
>  		if (synchronize_rcu_expedited_wait_once(1))
>  			return;
>  		rcu_for_each_leaf_node(rnp) {
> +			raw_spin_lock_irqsave_rcu_node(rnp, flags);
>  			mask = READ_ONCE(rnp->expmask);
>  			for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
>  				rdp = per_cpu_ptr(&rcu_data, cpu);
>  				if (rdp->rcu_forced_tick_exp)
>  					continue;
>  				rdp->rcu_forced_tick_exp = true;
> -				preempt_disable();
>  				if (cpu_online(cpu))
>  					tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
> -				preempt_enable();
>  			}
> +			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
>  		}
>  		j = READ_ONCE(jiffies_till_first_fqs);
>  		if (synchronize_rcu_expedited_wait_once(j + HZ))