From: Muchun Song <songmuchun@bytedance.com>

In a subsequent patch, we'll reparent the LRU folios. The folios that are
moved to the appropriate LRU list can undergo reparenting during the
move_folios_to_lru() process. Hence, it's incorrect for the caller to hold
a lruvec lock. Instead, we should utilize the more general interface of
folio_lruvec_relock_irq() to obtain the correct lruvec lock.

This patch involves only code refactoring and doesn't introduce any
functional changes.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
mm/vmscan.c | 46 +++++++++++++++++++++++-----------------------
1 file changed, 23 insertions(+), 23 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3a1044ce30f1e..660cd40cfddd4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1883,24 +1883,27 @@ static bool too_many_isolated(struct pglist_data *pgdat, int file,
/*
* move_folios_to_lru() moves folios from private @list to appropriate LRU list.
*
- * Returns the number of pages moved to the given lruvec.
+ * Returns the number of pages moved to the appropriate lruvec.
+ *
+ * Note: The caller must not hold any lruvec lock.
*/
-static unsigned int move_folios_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_folios_to_lru(struct list_head *list)
{
int nr_pages, nr_moved = 0;
+ struct lruvec *lruvec = NULL;
struct folio_batch free_folios;
folio_batch_init(&free_folios);
while (!list_empty(list)) {
struct folio *folio = lru_to_folio(list);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
list_del(&folio->lru);
if (unlikely(!folio_evictable(folio))) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
folio_putback_lru(folio);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec = NULL;
continue;
}
@@ -1922,19 +1925,15 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
- spin_unlock_irq(&lruvec->lru_lock);
+ lruvec_unlock_irq(lruvec);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
- spin_lock_irq(&lruvec->lru_lock);
+ lruvec = NULL;
}
continue;
}
- /*
- * All pages were isolated from the same lruvec (and isolation
- * inhibits memcg migration).
- */
VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
lruvec_add_folio(lruvec, folio);
nr_pages = folio_nr_pages(folio);
@@ -1943,11 +1942,12 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
workingset_age_nonresident(lruvec, nr_pages);
}
+ if (lruvec)
+ lruvec_unlock_irq(lruvec);
+
if (free_folios.nr) {
- spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
free_unref_folios(&free_folios);
- spin_lock_irq(&lruvec->lru_lock);
}
return nr_moved;
@@ -2016,9 +2016,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
lruvec_memcg(lruvec));
- spin_lock_irq(&lruvec->lru_lock);
- move_folios_to_lru(lruvec, &folio_list);
+ move_folios_to_lru(&folio_list);
+ spin_lock_irq(&lruvec->lru_lock);
__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -2166,11 +2166,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move folios back to the lru list.
*/
- spin_lock_irq(&lruvec->lru_lock);
-
- nr_activate = move_folios_to_lru(lruvec, &l_active);
- nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
+ nr_activate = move_folios_to_lru(&l_active);
+ nr_deactivate = move_folios_to_lru(&l_inactive);
+ spin_lock_irq(&lruvec->lru_lock);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
@@ -4735,14 +4734,15 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
}
- spin_lock_irq(&lruvec->lru_lock);
-
- move_folios_to_lru(lruvec, &list);
+ move_folios_to_lru(&list);
+ local_irq_disable();
walk = current->reclaim_state->mm_walk;
if (walk && walk->batched) {
walk->lruvec = lruvec;
+ spin_lock(&lruvec->lru_lock);
reset_batch_size(walk);
+ spin_unlock(&lruvec->lru_lock);
}
__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
@@ -4754,7 +4754,7 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
count_memcg_events(memcg, item, reclaimed);
__count_vm_events(PGSTEAL_ANON + type, reclaimed);
- spin_unlock_irq(&lruvec->lru_lock);
+ local_irq_enable();
list_splice_init(&clean, &list);
--
2.20.1
On Tue, Oct 28, 2025 at 09:58:17PM +0800, Qi Zheng wrote:
> From: Muchun Song <songmuchun@bytedance.com>
>
> In a subsequent patch, we'll reparent the LRU folios. The folios that are
> moved to the appropriate LRU list can undergo reparenting during the
> move_folios_to_lru() process. Hence, it's incorrect for the caller to hold
> a lruvec lock. Instead, we should utilize the more general interface of
> folio_lruvec_relock_irq() to obtain the correct lruvec lock.
>
> This patch involves only code refactoring and doesn't introduce any
> functional changes.
>
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
> mm/vmscan.c | 46 +++++++++++++++++++++++-----------------------
> 1 file changed, 23 insertions(+), 23 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 3a1044ce30f1e..660cd40cfddd4 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2016,9 +2016,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
> lruvec_memcg(lruvec));
>
> - spin_lock_irq(&lruvec->lru_lock);
> - move_folios_to_lru(lruvec, &folio_list);
> + move_folios_to_lru(&folio_list);
>
> + spin_lock_irq(&lruvec->lru_lock);
> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
> stat.nr_demoted);
Maybe I'm missing something or just confused for now, but let me ask...
How do we make sure the lruvec (and the mem_cgroup containing the
lruvec) did not disappear (due to offlining) after move_folios_to_lru()?
> __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
> @@ -2166,11 +2166,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
> /*
> * Move folios back to the lru list.
> */
> - spin_lock_irq(&lruvec->lru_lock);
> -
> - nr_activate = move_folios_to_lru(lruvec, &l_active);
> - nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
> + nr_activate = move_folios_to_lru(&l_active);
> + nr_deactivate = move_folios_to_lru(&l_inactive);
>
> + spin_lock_irq(&lruvec->lru_lock);
> __count_vm_events(PGDEACTIVATE, nr_deactivate);
> count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
>
> @@ -4735,14 +4734,15 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
> set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
> }
>
> - spin_lock_irq(&lruvec->lru_lock);
> -
> - move_folios_to_lru(lruvec, &list);
> + move_folios_to_lru(&list);
>
> + local_irq_disable();
> walk = current->reclaim_state->mm_walk;
> if (walk && walk->batched) {
> walk->lruvec = lruvec;
> + spin_lock(&lruvec->lru_lock);
> reset_batch_size(walk);
> + spin_unlock(&lruvec->lru_lock);
> }
Cc'ing RT folks as they may not want to disable IRQ on PREEMPT_RT.
IIRC there has been some effort in MM to reduce the scope of
IRQ-disabled section in MM when PREEMPT_RT config was added to the
mainline. spin_lock_irq() doesn't disable IRQ on PREEMPT_RT.
Also, this will break RT according to Documentation/locking/locktypes.rst:
> The changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels
> have a few implications. For example, on a non-PREEMPT_RT kernel
> the following code sequence works as expected:
>
> local_irq_disable();
> spin_lock(&lock);
>
> and is fully equivalent to:
>
> spin_lock_irq(&lock);
> Same applies to rwlock_t and the _irqsave() suffix variants.
>
> On PREEMPT_RT kernel this code sequence breaks because RT-mutex requires
> a fully preemptible context. Instead, use spin_lock_irq() or
> spin_lock_irqsave() and their unlock counterparts.
>
> In cases where the interrupt disabling and locking must remain separate,
> PREEMPT_RT offers a local_lock mechanism. Acquiring the local_lock pins
> the task to a CPU, allowing things like per-CPU interrupt disabled locks
> to be acquired. However, this approach should be used only where absolutely
> necessary.
--
Cheers,
Harry / Hyeonggon
On 2025-11-07 14:11:27 [+0900], Harry Yoo wrote:
> > @@ -4735,14 +4734,15 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
> > set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
> > }
> >
> > - spin_lock_irq(&lruvec->lru_lock);
> > -
> > - move_folios_to_lru(lruvec, &list);
> > + move_folios_to_lru(&list);
> >
> > + local_irq_disable();
> > walk = current->reclaim_state->mm_walk;
> > if (walk && walk->batched) {
> > walk->lruvec = lruvec;
> > + spin_lock(&lruvec->lru_lock);
> > reset_batch_size(walk);
> > + spin_unlock(&lruvec->lru_lock);
> > }
>
> Cc'ing RT folks as they may not want to disable IRQ on PREEMPT_RT.
Thank you, this is not going to work. The local_irq_disable() shouldn't
be used.
> IIRC there has been some effort in MM to reduce the scope of
> IRQ-disabled section in MM when PREEMPT_RT config was added to the
> mainline. spin_lock_irq() doesn't disable IRQ on PREEMPT_RT.
Exactly.
Sebastian
Hi Harry,
On 11/7/25 1:11 PM, Harry Yoo wrote:
> On Tue, Oct 28, 2025 at 09:58:17PM +0800, Qi Zheng wrote:
>> From: Muchun Song <songmuchun@bytedance.com>
>>
>> In a subsequent patch, we'll reparent the LRU folios. The folios that are
>> moved to the appropriate LRU list can undergo reparenting during the
>> move_folios_to_lru() process. Hence, it's incorrect for the caller to hold
>> a lruvec lock. Instead, we should utilize the more general interface of
>> folio_lruvec_relock_irq() to obtain the correct lruvec lock.
>>
>> This patch involves only code refactoring and doesn't introduce any
>> functional changes.
>>
>> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
>> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>> mm/vmscan.c | 46 +++++++++++++++++++++++-----------------------
>> 1 file changed, 23 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 3a1044ce30f1e..660cd40cfddd4 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2016,9 +2016,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
>> nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
>> lruvec_memcg(lruvec));
>>
>> - spin_lock_irq(&lruvec->lru_lock);
>> - move_folios_to_lru(lruvec, &folio_list);
>> + move_folios_to_lru(&folio_list);
>>
>> + spin_lock_irq(&lruvec->lru_lock);
>> __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
>> stat.nr_demoted);
>
> Maybe I'm missing something or just confused for now, but let me ask...
>
> How do we make sure the lruvec (and the mem_cgroup containing the
> lruvec) did not disappear (due to offlining) after move_folios_to_lru()?
We obtained lruvec through the following method:
	memcg = mem_cgroup_iter(target_memcg, NULL, partial);
	do {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

		shrink_lruvec(lruvec, sc);
		--> shrink_inactive_list
	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
The mem_cgroup_iter() will hold the refcount of this memcg, so IIUC,
the memcg will not disappear at this time.
>
>> __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
>> @@ -2166,11 +2166,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
>> /*
>> * Move folios back to the lru list.
>> */
>> - spin_lock_irq(&lruvec->lru_lock);
>> -
>> - nr_activate = move_folios_to_lru(lruvec, &l_active);
>> - nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
>> + nr_activate = move_folios_to_lru(&l_active);
>> + nr_deactivate = move_folios_to_lru(&l_inactive);
>>
>> + spin_lock_irq(&lruvec->lru_lock);
>> __count_vm_events(PGDEACTIVATE, nr_deactivate);
>> count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
>>
>> @@ -4735,14 +4734,15 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
>> set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
>> }
>>
>> - spin_lock_irq(&lruvec->lru_lock);
>> -
>> - move_folios_to_lru(lruvec, &list);
>> + move_folios_to_lru(&list);
>>
>> + local_irq_disable();
>> walk = current->reclaim_state->mm_walk;
>> if (walk && walk->batched) {
>> walk->lruvec = lruvec;
>> + spin_lock(&lruvec->lru_lock);
>> reset_batch_size(walk);
>> + spin_unlock(&lruvec->lru_lock);
>> }
>
> Cc'ing RT folks as they may not want to disable IRQ on PREEMPT_RT.
>
> IIRC there has been some effort in MM to reduce the scope of
> IRQ-disabled section in MM when PREEMPT_RT config was added to the
> mainline. spin_lock_irq() doesn't disable IRQ on PREEMPT_RT.
Thanks for this information.
>
> Also, this will break RT according to Documentation/locking/locktypes.rst:
>> The changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels
>> have a few implications. For example, on a non-PREEMPT_RT kernel
>> the following code sequence works as expected:
>>
>> local_irq_disable();
>> spin_lock(&lock);
>>
>> and is fully equivalent to:
>>
>> spin_lock_irq(&lock);
>> Same applies to rwlock_t and the _irqsave() suffix variants.
>>
>> On PREEMPT_RT kernel this code sequence breaks because RT-mutex requires
>> a fully preemptible context. Instead, use spin_lock_irq() or
>> spin_lock_irqsave() and their unlock counterparts.
>>
>> In cases where the interrupt disabling and locking must remain separate,
>> PREEMPT_RT offers a local_lock mechanism. Acquiring the local_lock pins
>> the task to a CPU, allowing things like per-CPU interrupt disabled locks
>> to be acquired. However, this approach should be used only where absolutely
>> necessary.
But how do we determine if it's necessary?
Thanks,
Qi
>
On Fri, Nov 07, 2025 at 02:41:13PM +0800, Qi Zheng wrote:
> Hi Harry,
>
> On 11/7/25 1:11 PM, Harry Yoo wrote:
> > On Tue, Oct 28, 2025 at 09:58:17PM +0800, Qi Zheng wrote:
> > > From: Muchun Song <songmuchun@bytedance.com>
> > >
> > > In a subsequent patch, we'll reparent the LRU folios. The folios that are
> > > moved to the appropriate LRU list can undergo reparenting during the
> > > move_folios_to_lru() process. Hence, it's incorrect for the caller to hold
> > > a lruvec lock. Instead, we should utilize the more general interface of
> > > folio_lruvec_relock_irq() to obtain the correct lruvec lock.
> > >
> > > This patch involves only code refactoring and doesn't introduce any
> > > functional changes.
> > >
> > > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > > Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > ---
> > > mm/vmscan.c | 46 +++++++++++++++++++++++-----------------------
> > > 1 file changed, 23 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > > index 3a1044ce30f1e..660cd40cfddd4 100644
> > > --- a/mm/vmscan.c
> > > +++ b/mm/vmscan.c
> > > @@ -2016,9 +2016,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
> > > nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false,
> > > lruvec_memcg(lruvec));
> > > - spin_lock_irq(&lruvec->lru_lock);
> > > - move_folios_to_lru(lruvec, &folio_list);
> > > + move_folios_to_lru(&folio_list);
> > > + spin_lock_irq(&lruvec->lru_lock);
> > > __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
> > > stat.nr_demoted);
> >
> > Maybe I'm missing something or just confused for now, but let me ask...
> >
> > How do we make sure the lruvec (and the mem_cgroup containing the
> > lruvec) did not disappear (due to offlining) after move_folios_to_lru()?
>
> We obtained lruvec through the following method:
>
> memcg = mem_cgroup_iter(target_memcg, NULL, partial);
> do {
> struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
>
> shrink_lruvec(lruvec, sc);
> --> shrink_inactive_list
> } while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
>
> The mem_cgroup_iter() will hold the refcount of this memcg, so IIUC,
> the memcg will not disappear at this time.
Ah, right!
It can be offlined, but won't be released due to the refcount.
Thanks for the explanation.
> > > __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
> > > @@ -2166,11 +2166,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
> > > /*
> > > * Move folios back to the lru list.
> > > */
> > > - spin_lock_irq(&lruvec->lru_lock);
> > > -
> > > - nr_activate = move_folios_to_lru(lruvec, &l_active);
> > > - nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
> > > + nr_activate = move_folios_to_lru(&l_active);
> > > + nr_deactivate = move_folios_to_lru(&l_inactive);
> > > + spin_lock_irq(&lruvec->lru_lock);
> > > __count_vm_events(PGDEACTIVATE, nr_deactivate);
> > > count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
> > > @@ -4735,14 +4734,15 @@ static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
> > > set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
> > > }
> > > - spin_lock_irq(&lruvec->lru_lock);
> > > -
> > > - move_folios_to_lru(lruvec, &list);
> > > + move_folios_to_lru(&list);
> > > + local_irq_disable();
> > > walk = current->reclaim_state->mm_walk;
> > > if (walk && walk->batched) {
> > > walk->lruvec = lruvec;
> > > + spin_lock(&lruvec->lru_lock);
> > > reset_batch_size(walk);
> > > + spin_unlock(&lruvec->lru_lock);
> > > }
> >
> > Cc'ing RT folks as they may not want to disable IRQ on PREEMPT_RT.
> >
> > IIRC there has been some effort in MM to reduce the scope of
> > IRQ-disabled section in MM when PREEMPT_RT config was added to the
> > mainline. spin_lock_irq() doesn't disable IRQ on PREEMPT_RT.
>
> Thanks for this information.
>
> >
> > Also, this will break RT according to Documentation/locking/locktypes.rst:
> > > The changes in spinlock_t and rwlock_t semantics on PREEMPT_RT kernels
> > > have a few implications. For example, on a non-PREEMPT_RT kernel
> > > the following code sequence works as expected:
> > >
> > > local_irq_disable();
> > > spin_lock(&lock);
> > >
> > > and is fully equivalent to:
> > >
> > > spin_lock_irq(&lock);
> > > Same applies to rwlock_t and the _irqsave() suffix variants.
> > >
> > > On PREEMPT_RT kernel this code sequence breaks because RT-mutex requires
> > > a fully preemptible context. Instead, use spin_lock_irq() or
> > > spin_lock_irqsave() and their unlock counterparts.
> > >
> > > In cases where the interrupt disabling and locking must remain separate,
> > > PREEMPT_RT offers a local_lock mechanism. Acquiring the local_lock pins
> > > the task to a CPU, allowing things like per-CPU interrupt disabled locks
> > > to be acquired. However, this approach should be used only where absolutely
> > > necessary.
>
> But how do we determine if it's necessary?
Although it's mentioned in the locking documentation, I'm afraid that
local_lock is not the right interface to use here. Preemption will be
disabled anyway (on both PREEMPT_RT and !PREEMPT_RT) when the stats are
updated (in __mod_node_page_state()).
Here we just want to disable IRQ only on !PREEMPT_RT (to update
the stats safely).
Maybe we'll need some ifdeffery? I don't see a better way around...
--
Cheers,
Harry / Hyeonggon
On Fri, Nov 07, 2025 at 10:20:57PM +0900, Harry Yoo wrote:
>
> Although it's mentioned in the locking documentation, I'm afraid that
> local_lock is not the right interface to use here. Preemption will be
> disabled anyway (on both PREEMPT_RT and !PREEMPT_RT) when the stats are
> updated (in __mod_node_page_state()).
>
> Here we just want to disable IRQ only on !PREEMPT_RT (to update
> the stats safely).

I don't think there is a need to disable IRQs. There are three stats
update functions called in that hunk.

1) __mod_lruvec_state
2) __count_vm_events
3) count_memcg_events

count_memcg_events() can be called with IRQs. __count_vm_events can be
replaced with count_vm_events. For __mod_lruvec_state, the
__mod_node_page_state() inside needs preemption disabled.

Easy way would be to just disable/enable preemption instead of IRQs.
Otherwise go a bit more fine-grained approach i.e. replace
__count_vm_events with count_vm_events and just disable preemption
across __mod_node_page_state().
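A rough sketch of what the "easy way" above could look like in the
evict_folios() hunk, reusing the identifiers from the patch earlier in
the thread. This is illustrative only, not a posted change, and whether
preemption alone is sufficient for the vmstat update on !PREEMPT_RT is
exactly what the rest of the thread goes on to debate:

	move_folios_to_lru(&list);

	walk = current->reclaim_state->mm_walk;
	if (walk && walk->batched) {
		walk->lruvec = lruvec;
		spin_lock_irq(&lruvec->lru_lock);
		reset_batch_size(walk);
		spin_unlock_irq(&lruvec->lru_lock);
	}

	/* preemption, not IRQ disabling, around the node-stat RMW */
	preempt_disable();
	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
			   stat.nr_demoted);
	preempt_enable();

	/* IRQ-safe variants of the event counters */
	count_memcg_events(memcg, item, reclaimed);
	count_vm_events(PGSTEAL_ANON + type, reclaimed);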
On Fri, Nov 07, 2025 at 10:32:52PM -0800, Shakeel Butt wrote:
> On Fri, Nov 07, 2025 at 10:20:57PM +0900, Harry Yoo wrote:
> >
> > Although it's mentioned in the locking documentation, I'm afraid that
> > local_lock is not the right interface to use here. Preemption will be
> > disabled anyway (on both PREEMPT_RT and !PREEMPT_RT) when the stats are
> > updated (in __mod_node_page_state()).
> >
> > Here we just want to disable IRQ only on !PREEMPT_RT (to update
> > the stats safely).
>
> I don't think there is a need to disable IRQs. There are three stats
> update functions called in that hunk.
>
> 1) __mod_lruvec_state
> 2) __count_vm_events
> 3) count_memcg_events
>
> count_memcg_events() can be called with IRQs. __count_vm_events can be
> replaced with count_vm_events.

Right.

> For __mod_lruvec_state, the
> __mod_node_page_state() inside needs preemption disabled.

The function __mod_node_page_state() disables preemption.
And there's a comment in __mod_zone_page_state():

> /*
>  * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
>  * atomicity is provided by IRQs being disabled -- either explicitly
>  * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
>  * CPU migrations and preemption potentially corrupts a counter so
>  * disable preemption.
>  */
> preempt_disable_nested();

We're relying on IRQs being disabled on !PREEMPT_RT.

Maybe we could make it safe against re-entrant IRQ handlers by using
read-modify-write operations?

--
Cheers,
Harry / Hyeonggon

> Easy way would be to just disable/enable preemption instead of IRQs.
> Otherwise go a bit more fine-grained approach i.e. replace
> __count_vm_events with count_vm_events and just disable preemption
> across __mod_node_page_state().
On 11/10/25 10:13 AM, Harry Yoo wrote:
> On Fri, Nov 07, 2025 at 10:32:52PM -0800, Shakeel Butt wrote:
>> On Fri, Nov 07, 2025 at 10:20:57PM +0900, Harry Yoo wrote:
>>>
>>> Although it's mentioned in the locking documentation, I'm afraid that
>>> local_lock is not the right interface to use here. Preemption will be
>>> disabled anyway (on both PREEMPT_RT and !PREEMPT_RT) when the stats are
>>> updated (in __mod_node_page_state()).
>>>
>>> Here we just want to disable IRQ only on !PREEMPT_RT (to update
>>> the stats safely).
>>
>> I don't think there is a need to disable IRQs. There are three stats
>> update functions called in that hunk.
>>
>> 1) __mod_lruvec_state
>> 2) __count_vm_events
>> 3) count_memcg_events
>>
>> count_memcg_events() can be called with IRQs. __count_vm_events can be
>> replaced with count_vm_events.
>
> Right.
>
>> For __mod_lruvec_state, the
>> __mod_node_page_state() inside needs preemption disabled.
>
> The function __mod_node_page_state() disables preemption.
> And there's a comment in __mod_zone_page_state():
>
>> /*
>>  * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
>>  * atomicity is provided by IRQs being disabled -- either explicitly
>>  * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
>>  * CPU migrations and preemption potentially corrupts a counter so
>>  * disable preemption.
>>  */
>> preempt_disable_nested();
>
> We're relying on IRQs being disabled on !PREEMPT_RT.

So it's possible for us to update vmstat within an interrupt context,
right?

There is also a comment above __mod_zone_page_state():

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */

BTW, the comment inside __mod_node_page_state() should be:

/* See __mod_zone_page_state */

instead of

/* See __mod_node_page_state */

Will fix it.

> Maybe we could make it safe against re-entrant IRQ handlers by using
> read-modify-write operations?

Isn't it because of the RMW operation that we need to use IRQ to
guarantee atomicity? Or have I misunderstood something?

>
On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
>
>
> On 11/10/25 10:13 AM, Harry Yoo wrote:
> > On Fri, Nov 07, 2025 at 10:32:52PM -0800, Shakeel Butt wrote:
> > > On Fri, Nov 07, 2025 at 10:20:57PM +0900, Harry Yoo wrote:
> > > >
> > > > Although it's mentioned in the locking documentation, I'm afraid that
> > > > local_lock is not the right interface to use here. Preemption will be
> > > > disabled anyway (on both PREEMPT_RT and !PREEMPT_RT) when the stats are
> > > > updated (in __mod_node_page_state()).
> > > >
> > > > Here we just want to disable IRQ only on !PREEMPT_RT (to update
> > > > the stats safely).
> > >
> > > I don't think there is a need to disable IRQs. There are three stats
> > > update functions called in that hunk.
> > >
> > > 1) __mod_lruvec_state
> > > 2) __count_vm_events
> > > 3) count_memcg_events
> > >
> > > count_memcg_events() can be called with IRQs. __count_vm_events can be
> > > replaced with count_vm_events.
> >
> > Right.
> >
> > > For __mod_lruvec_state, the
> > > __mod_node_page_state() inside needs preemption disabled.
> >
> > The function __mod_node_page_state() disables preemption.
> > And there's a comment in __mod_zone_page_state():
> >
> > > /*
> > >  * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
> > >  * atomicity is provided by IRQs being disabled -- either explicitly
> > >  * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
> > >  * CPU migrations and preemption potentially corrupts a counter so
> > >  * disable preemption.
> > >  */
> > > preempt_disable_nested();
> >
> > We're relying on IRQs being disabled on !PREEMPT_RT.
>
> So it's possible for us to update vmstat within an interrupt context,
> right?

Yes, for instance when freeing memory in an interrupt context we can
update vmstat and that's why we disable interrupts now.

> There is also a comment above __mod_zone_page_state():
>
> /*
>  * For use when we know that interrupts are disabled,
>  * or when we know that preemption is disabled and that
>  * particular counter cannot be updated from interrupt context.
>  */

Yeah we don't have to disable IRQs when we already know it's disabled.

> BTW, the comment inside __mod_node_page_state() should be:
>
> /* See __mod_zone_page_state */
>
> instead of
>
> /* See __mod_node_page_state */
>
> Will fix it.

Right :) Thanks!

> > Maybe we could make it safe against re-entrant IRQ handlers by using
> > read-modify-write operations?
>
> Isn't it because of the RMW operation that we need to use IRQ to
> guarantee atomicity? Or have I misunderstood something?

I meant using atomic operations instead of disabling IRQs, like, by
using this_cpu_add() or cmpxchg() instead.

--
Cheers,
Harry / Hyeonggon
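For illustration, the general shape of that kind of IRQ-safe per-CPU
update, as a self-contained toy counter. The names demo_events and
demo_count_event() are hypothetical, not vmstat code; the point is only
that this_cpu_add() makes the read-modify-write safe against preemption
and interrupts on the local CPU without an explicit local_irq_save():

#include <linux/percpu.h>

/* hypothetical per-CPU counter, for illustration only */
static DEFINE_PER_CPU(long, demo_events);

static void demo_count_event(long delta)
{
	/*
	 * this_cpu_add() performs the RMW as a single operation that
	 * is safe against preemption and interrupts on the local CPU,
	 * so no explicit IRQ disabling is required here.
	 */
	this_cpu_add(demo_events, delta);
}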
On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
> On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
> > > Maybe we could make it safe against re-entrant IRQ handlers by using
> > > read-modify-write operations?
> >
> > Isn't it because of the RMW operation that we need to use IRQ to
> > guarantee atomicity? Or have I misunderstood something?
>
> I meant using atomic operations instead of disabling IRQs, like, by
> using this_cpu_add() or cmpxchg() instead.

We already have mod_node_page_state() which is safe from IRQs and is
optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
includes x86 and arm64.

Let me send the patch to cleanup the memcg code which uses
__mod_node_page_state.
On 11/11/25 12:47 AM, Shakeel Butt wrote:
> On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
>> On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
>>>> Maybe we could make it safe against re-entrant IRQ handlers by using
>>>> read-modify-write operations?
>>>
>>> Isn't it because of the RMW operation that we need to use IRQ to
>>> guarantee atomicity? Or have I misunderstood something?
>>
>> I meant using atomic operations instead of disabling IRQs, like, by
>> using this_cpu_add() or cmpxchg() instead.
>
> We already have mod_node_page_state() which is safe from IRQs and is
> optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
> includes x86 and arm64.

However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?

>
> Let me send the patch to cleanup the memcg code which uses
> __mod_node_page_state.
On Tue, Nov 11, 2025 at 11:04:09AM +0800, Qi Zheng wrote:
>
> On 11/11/25 12:47 AM, Shakeel Butt wrote:
> > On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
> > > On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
> > > > > Maybe we could make it safe against re-entrant IRQ handlers by using
> > > > > read-modify-write operations?
> > > >
> > > > Isn't it because of the RMW operation that we need to use IRQ to
> > > > guarantee atomicity? Or have I misunderstood something?
> > >
> > > I meant using atomic operations instead of disabling IRQs, like, by
> > > using this_cpu_add() or cmpxchg() instead.
> >
> > We already have mod_node_page_state() which is safe from IRQs and is
> > optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
> > includes x86 and arm64.
>
> However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
> still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
Hmm I was going to say it's necessary, but AFAICT we don't allocate
or free memory in hardirq context on PREEMPT_RT (that's the policy)
and so I'd say it's not necessary to disable IRQs.
Sounds like we still want to disable IRQs only on !PREEMPT_RT on
such architectures?
Not sure how seriously do PREEMPT_RT folks care about architectures
without HAVE_CMPXCHG_LOCAL. (riscv and loongarch have ARCH_SUPPORTS_RT
but doesn't have HAVE_CMPXCHG_LOCAL).
If they do care, this can be done as a separate patch series because
we already call local_irq_{save,restore}() in many places in mm/vmstat.c
if the architecture doesn't have HAVE_CMPXCHG_LOCAL.
> > Let me send the patch to cleanup the memcg code which uses
> > __mod_node_page_state.
--
Cheers,
Harry / Hyeonggon
On 2025-11-11 12:16:43 [+0900], Harry Yoo wrote:
> > However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
> > still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
>
> Hmm I was going to say it's necessary, but AFAICT we don't allocate
> or free memory in hardirq context on PREEMPT_RT (that's the policy)
> and so I'd say it's not necessary to disable IRQs.
>
> Sounds like we still want to disable IRQs only on !PREEMPT_RT on
> such architectures?
>
> Not sure how seriously do PREEMPT_RT folks care about architectures
> without HAVE_CMPXCHG_LOCAL. (riscv and loongarch have ARCH_SUPPORTS_RT
> but doesn't have HAVE_CMPXCHG_LOCAL).

We take things seriously and you shouldn't make assumption based on
implementation. Either the API can be used as such or not.
In case of mod_node_page_state(), the non-IRQ off version
(__mod_node_page_state()) has a preempt_disable_nested() to ensure
atomic update on PREEMPT_RT without disabling interrupts.

Sebastian
On Tue, Nov 11, 2025 at 09:49:00AM +0100, Sebastian Andrzej Siewior wrote:
> On 2025-11-11 12:16:43 [+0900], Harry Yoo wrote:
> > > However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
> > > still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
> >
> > Hmm I was going to say it's necessary, but AFAICT we don't allocate
> > or free memory in hardirq context on PREEMPT_RT (that's the policy)
> > and so I'd say it's not necessary to disable IRQs.
> >
> > Sounds like we still want to disable IRQs only on !PREEMPT_RT on
> > such architectures?
> >
> > Not sure how seriously do PREEMPT_RT folks care about architectures
> > without HAVE_CMPXCHG_LOCAL. (riscv and loongarch have ARCH_SUPPORTS_RT
> > but doesn't have HAVE_CMPXCHG_LOCAL).
>
> We take things seriously and you shouldn't make assumption based on
> implementation. Either the API can be used as such or not.
> In case of mod_node_page_state(), the non-IRQ off version
> (__mod_node_page_state()) has a preempt_disable_nested() to ensure
> atomic update on PREEMPT_RT without disabling interrupts.
>
Harry is talking about mod_node_page_state() on
!CONFIG_HAVE_CMPXCHG_LOCAL which is disabling irqs.
void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
unsigned long flags;
local_irq_save(flags);
__mod_node_page_state(pgdat, item, delta);
local_irq_restore(flags);
}
Is PREEMPT_RT fine with this?
On Tue, 11 Nov 2025 08:44:14 -0800
Shakeel Butt <shakeel.butt@linux.dev> wrote:
> Harry is talking about mod_node_page_state() on
> !CONFIG_HAVE_CMPXCHG_LOCAL which is disabling irqs.
>
> void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
> long delta)
> {
> unsigned long flags;
>
> local_irq_save(flags);
> __mod_node_page_state(pgdat, item, delta);
> local_irq_restore(flags);
> }
>
> Is PREEMPT_RT fine with this?
But should be:
void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
guard(irqsave)();
__mod_node_page_state(pgdat, item, delta);
}
-- Steve
On 2025-11-11 08:44:14 [-0800], Shakeel Butt wrote:
> On Tue, Nov 11, 2025 at 09:49:00AM +0100, Sebastian Andrzej Siewior wrote:
> > On 2025-11-11 12:16:43 [+0900], Harry Yoo wrote:
> > > > However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
> > > > still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
> > >
> > > Hmm I was going to say it's necessary, but AFAICT we don't allocate
> > > or free memory in hardirq context on PREEMPT_RT (that's the policy)
> > > and so I'd say it's not necessary to disable IRQs.
> > >
> > > Sounds like we still want to disable IRQs only on !PREEMPT_RT on
> > > such architectures?
> > >
> > > Not sure how seriously do PREEMPT_RT folks care about architectures
> > > without HAVE_CMPXCHG_LOCAL. (riscv and loongarch have ARCH_SUPPORTS_RT
> > > but doesn't have HAVE_CMPXCHG_LOCAL).
> >
> > We take things seriously and you shouldn't make assumption based on
> > implementation. Either the API can be used as such or not.
> > In case of mod_node_page_state(), the non-IRQ off version
> > (__mod_node_page_state()) has a preempt_disable_nested() to ensure
> > atomic update on PREEMPT_RT without disabling interrupts.
> >
>
> Harry is talking about mod_node_page_state() on
> !CONFIG_HAVE_CMPXCHG_LOCAL which is disabling irqs.
>
> void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
> long delta)
> {
> unsigned long flags;
>
> local_irq_save(flags);
> __mod_node_page_state(pgdat, item, delta);
> local_irq_restore(flags);
> }
>
> Is PREEMPT_RT fine with this?
Yes.
The local_irq_save() is not strictly needed but I am fine with it to
keep it simple. The inner part is just counting.
Sebastian
On Wed, Nov 12, 2025 at 08:49:30AM +0100, Sebastian Andrzej Siewior wrote:
> On 2025-11-11 08:44:14 [-0800], Shakeel Butt wrote:
> > On Tue, Nov 11, 2025 at 09:49:00AM +0100, Sebastian Andrzej Siewior wrote:
> > > On 2025-11-11 12:16:43 [+0900], Harry Yoo wrote:
> > > > > However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
> > > > > still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
> > > >
> > > > Hmm I was going to say it's necessary, but AFAICT we don't allocate
> > > > or free memory in hardirq context on PREEMPT_RT (that's the policy)
> > > > and so I'd say it's not necessary to disable IRQs.
> > > >
> > > > Sounds like we still want to disable IRQs only on !PREEMPT_RT on
> > > > such architectures?
> > > >
> > > > Not sure how seriously do PREEMPT_RT folks care about architectures
> > > > without HAVE_CMPXCHG_LOCAL. (riscv and loongarch have ARCH_SUPPORTS_RT
> > > > but doesn't have HAVE_CMPXCHG_LOCAL).
> > >
> > > We take things seriously and you shouldn't make assumption based on
> > > implementation. Either the API can be used as such or not.
> > > In case of mod_node_page_state(), the non-IRQ off version
> > > (__mod_node_page_state()) has a preempt_disable_nested() to ensure
> > > atomic update on PREEMPT_RT without disabling interrupts.
> >
> > Harry is talking about mod_node_page_state() on
> > !CONFIG_HAVE_CMPXCHG_LOCAL which is disabling irqs.
> >
> > void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
> > long delta)
> > {
> > unsigned long flags;
> >
> > local_irq_save(flags);
> > __mod_node_page_state(pgdat, item, delta);
> > local_irq_restore(flags);
> > }
> >
> > Is PREEMPT_RT fine with this?
>
> Yes.
> The local_irq_save() is not strictly needed but I am fine with it to
> keep it simple. The inner part is just counting.
Yeah I was wondering about this... and thanks for confirming!
--
Cheers,
Harry / Hyeonggon
On 2025-11-12 17:46:51 [+0900], Harry Yoo wrote:
> > > Is PREEMPT_RT fine with this?
> >
> > Yes.
> > The local_irq_save() is not strictly needed but I am fine with it to
> > keep it simple. The inner part is just counting.
>
> Yeah I was wondering about this... and thanks for confirming!

The preempt_disable_nested() and no in-IRQ user makes it okay. Not the
counting, that was meant as "it is a small section". That might have
been too short of an explanation.

Sebastian
On 11/11/25 11:16 AM, Harry Yoo wrote:
> On Tue, Nov 11, 2025 at 11:04:09AM +0800, Qi Zheng wrote:
>>
>> On 11/11/25 12:47 AM, Shakeel Butt wrote:
>>> On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
>>>> On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
>>>>>> Maybe we could make it safe against re-entrant IRQ handlers by using
>>>>>> read-modify-write operations?
>>>>>
>>>>> Isn't it because of the RMW operation that we need to use IRQ to
>>>>> guarantee atomicity? Or have I misunderstood something?
>>>>
>>>> I meant using atomic operations instead of disabling IRQs, like, by
>>>> using this_cpu_add() or cmpxchg() instead.
>>>
>>> We already have mod_node_page_state() which is safe from IRQs and is
>>> optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
>>> includes x86 and arm64.
>>
>> However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
>> still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
>
> Hmm I was going to say it's necessary, but AFAICT we don't allocate
> or free memory in hardirq context on PREEMPT_RT (that's the policy)
> and so I'd say it's not necessary to disable IRQs.
>
> Sounds like we still want to disable IRQs only on !PREEMPT_RT on
> such architectures?
>
> Not sure how seriously do PREEMPT_RT folks care about architectures
> without HAVE_CMPXCHG_LOCAL. (riscv and loongarch have ARCH_SUPPORTS_RT
> but doesn't have HAVE_CMPXCHG_LOCAL).
>
> If they do care, this can be done as a separate patch series because
> we already call local_irq_{save,restore}() in many places in mm/vmstat.c
> if the architecture doesn't have HAVE_CMPXCHG_LOCAL.
Got it. I will ignore it for now.
>
>>> Let me send the patch to cleanup the memcg code which uses
>>> __mod_node_page_state.
>
On Tue, Nov 11, 2025 at 11:04:09AM +0800, Qi Zheng wrote:
>
> On 11/11/25 12:47 AM, Shakeel Butt wrote:
> > On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
> > > On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
> > > > > Maybe we could make it safe against re-entrant IRQ handlers by using
> > > > > read-modify-write operations?
> > > >
> > > > Isn't it because of the RMW operation that we need to use IRQ to
> > > > guarantee atomicity? Or have I misunderstood something?
> > >
> > > I meant using atomic operations instead of disabling IRQs, like, by
> > > using this_cpu_add() or cmpxchg() instead.
> >
> > We already have mod_node_page_state() which is safe from IRQs and is
> > optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
> > includes x86 and arm64.
>
> However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
> still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
>

Yes we can disable irqs on PREEMPT_RT but it is usually frown upon and
it is usually requested to do so only for short window. However if
someone running PREEMPT_RT on an arch without HAVE_CMPXCHG_LOCAL and has
issues with mod_node_page_state() then they can solve it then. I don't
think we need to fix that now.
On 11/11/25 11:17 AM, Shakeel Butt wrote:
> On Tue, Nov 11, 2025 at 11:04:09AM +0800, Qi Zheng wrote:
>>
>> On 11/11/25 12:47 AM, Shakeel Butt wrote:
>>> On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
>>>> On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
>>>>>> Maybe we could make it safe against re-entrant IRQ handlers by using
>>>>>> read-modify-write operations?
>>>>>
>>>>> Isn't it because of the RMW operation that we need to use IRQ to
>>>>> guarantee atomicity? Or have I misunderstood something?
>>>>
>>>> I meant using atomic operations instead of disabling IRQs, like, by
>>>> using this_cpu_add() or cmpxchg() instead.
>>>
>>> We already have mod_node_page_state() which is safe from IRQs and is
>>> optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
>>> includes x86 and arm64.
>>
>> However, in the !CONFIG_HAVE_CMPXCHG_LOCAL case, mod_node_page_state()
>> still calls local_irq_save(). Is this feasible in the PREEMPT_RT kernel?
>>
>
> Yes we can disable irqs on PREEMPT_RT but it is usually frown upon and
> it is usually requested to do so only for short window. However if

Got it.

> someone running PREEMPT_RT on an arch without HAVE_CMPXCHG_LOCAL and has
> issues with mod_node_page_state() then they can solve it then. I don't
> think we need to fix that now.

OK.

>
On Mon, Nov 10, 2025 at 08:47:57AM -0800, Shakeel Butt wrote:
> On Mon, Nov 10, 2025 at 02:43:21PM +0900, Harry Yoo wrote:
> > On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
> > > > Maybe we could make it safe against re-entrant IRQ handlers by using
> > > > read-modify-write operations?
> > >
> > > Isn't it because of the RMW operation that we need to use IRQ to
> > > guarantee atomicity? Or have I misunderstood something?
> >
> > I meant using atomic operations instead of disabling IRQs, like, by
> > using this_cpu_add() or cmpxchg() instead.
>
> We already have mod_node_page_state() which is safe from IRQs and is
> optimized to not disable IRQs for archs with HAVE_CMPXCHG_LOCAL which
> includes x86 and arm64.

Nice observation, thanks!

> Let me send the patch to cleanup the memcg code which uses
> __mod_node_page_state.

I'll take a look.

--
Cheers,
Harry / Hyeonggon
On 11/10/25 1:43 PM, Harry Yoo wrote:
> On Mon, Nov 10, 2025 at 12:30:06PM +0800, Qi Zheng wrote:
>>
>>
>> On 11/10/25 10:13 AM, Harry Yoo wrote:
>>> On Fri, Nov 07, 2025 at 10:32:52PM -0800, Shakeel Butt wrote:
>>>> On Fri, Nov 07, 2025 at 10:20:57PM +0900, Harry Yoo wrote:
>>>>>
>>>>> Although it's mentioned in the locking documentation, I'm afraid that
>>>>> local_lock is not the right interface to use here. Preemption will be
>>>>> disabled anyway (on both PREEMPT_RT and !PREEMPT_RT) when the stats are
>>>>> updated (in __mod_node_page_state()).
>>>>>
>>>>> Here we just want to disable IRQ only on !PREEMPT_RT (to update
>>>>> the stats safely).
>>>>
>>>> I don't think there is a need to disable IRQs. There are three stats
>>>> update functions called in that hunk.
>>>>
>>>> 1) __mod_lruvec_state
>>>> 2) __count_vm_events
>>>> 3) count_memcg_events
>>>>
>>>> count_memcg_events() can be called with IRQs. __count_vm_events can be
>>>> replaced with count_vm_events.
>>>
>>> Right.
>>>
>>>> For __mod_lruvec_state, the
>>>> __mod_node_page_state() inside needs preemption disabled.
>>>
>>> The function __mod_node_page_state() disables preemption.
>>> And there's a comment in __mod_zone_page_state():
>>>
>>>> /*
>>>>  * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
>>>>  * atomicity is provided by IRQs being disabled -- either explicitly
>>>>  * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
>>>>  * CPU migrations and preemption potentially corrupts a counter so
>>>>  * disable preemption.
>>>>  */
>>>> preempt_disable_nested();
>>>
>>> We're relying on IRQs being disabled on !PREEMPT_RT.
>>
>> So it's possible for us to update vmstat within an interrupt context,
>> right?
>
> Yes, for instance when freeing memory in an interrupt context we can
> update vmstat and that's why we disable interrupts now.

Got it.

>
>> There is also a comment above __mod_zone_page_state():
>>
>> /*
>>  * For use when we know that interrupts are disabled,
>>  * or when we know that preemption is disabled and that
>>  * particular counter cannot be updated from interrupt context.
>>  */
>
> Yeah we don't have to disable IRQs when we already know it's disabled.
>
>> BTW, the comment inside __mod_node_page_state() should be:
>>
>> /* See __mod_zone_page_state */
>>
>> instead of
>>
>> /* See __mod_node_page_state */
>>
>> Will fix it.
>
> Right :) Thanks!
>
>>> Maybe we could make it safe against re-entrant IRQ handlers by using
>>> read-modify-write operations?
>>
>> Isn't it because of the RMW operation that we need to use IRQ to
>> guarantee atomicity? Or have I misunderstood something?
>
> I meant using atomic operations instead of disabling IRQs, like, by
> using this_cpu_add() or cmpxchg() instead.

Got it. I will give it a try.

Thanks,
Qi

>