From: Shakeel Butt <shakeel.butt@linux.dev>
To: Andrew Morton
Cc: Johannes Weiner, Michal Hocko, Roman Gushchin, Muchun Song,
	Alexei Starovoitov, Vlastimil Babka, Sebastian Andrzej Siewior,
	Jakub Kicinski, Eric Dumazet, linux-mm@kvack.org,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	bpf@vger.kernel.org, netdev@vger.kernel.org, Meta kernel team
Subject: [PATCH 3/4] memcg: completely decouple memcg and obj stocks
Date: Tue, 6 May 2025 15:55:32 -0700
Message-ID: <20250506225533.2580386-4-shakeel.butt@linux.dev>
In-Reply-To: <20250506225533.2580386-1-shakeel.butt@linux.dev>
References: <20250506225533.2580386-1-shakeel.butt@linux.dev>

Let's completely decouple the memcg and obj per-cpu stocks. This will
enable the memcg per-cpu stocks to be used without disabling irqs. It
will also allow the obj stocks to be made nmi safe independently, which
is required to make kmalloc/slab safe for allocations from nmi context.
Signed-off-by: Shakeel Butt
Acked-by: Vlastimil Babka
---
 mm/memcontrol.c | 149 ++++++++++++++++++++++++++++++------------------
 1 file changed, 92 insertions(+), 57 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09a78c606d8d..14d73e5352fe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1780,12 +1780,22 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
  * nr_pages in a single cacheline. This may change in future.
  */
 #define NR_MEMCG_STOCK 7
+#define FLUSHING_CACHED_CHARGE	0
 struct memcg_stock_pcp {
-	local_trylock_t memcg_lock;
+	local_trylock_t lock;
 	uint8_t nr_pages[NR_MEMCG_STOCK];
 	struct mem_cgroup *cached[NR_MEMCG_STOCK];
 
-	local_trylock_t obj_lock;
+	struct work_struct work;
+	unsigned long flags;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
+	.lock = INIT_LOCAL_TRYLOCK(lock),
+};
+
+struct obj_stock_pcp {
+	local_trylock_t lock;
 	unsigned int nr_bytes;
 	struct obj_cgroup *cached_objcg;
 	struct pglist_data *cached_pgdat;
@@ -1794,16 +1804,16 @@ struct memcg_stock_pcp {
 
 	struct work_struct work;
 	unsigned long flags;
-#define FLUSHING_CACHED_CHARGE	0
 };
-static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
-	.memcg_lock = INIT_LOCAL_TRYLOCK(memcg_lock),
-	.obj_lock = INIT_LOCAL_TRYLOCK(obj_lock),
+
+static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
+	.lock = INIT_LOCAL_TRYLOCK(lock),
 };
+
 static DEFINE_MUTEX(percpu_charge_mutex);
 
-static void drain_obj_stock(struct memcg_stock_pcp *stock);
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static void drain_obj_stock(struct obj_stock_pcp *stock);
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
 				     struct mem_cgroup *root_memcg);
 
 /**
@@ -1826,7 +1836,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	int i;
 
 	if (nr_pages > MEMCG_CHARGE_BATCH ||
-	    !local_trylock_irqsave(&memcg_stock.memcg_lock, flags))
+	    !local_trylock_irqsave(&memcg_stock.lock, flags))
 		return ret;
 
 	stock = this_cpu_ptr(&memcg_stock);
@@ -1843,7 +1853,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 		break;
 	}
 
-	local_unlock_irqrestore(&memcg_stock.memcg_lock, flags);
+	local_unlock_irqrestore(&memcg_stock.lock, flags);
 
 	return ret;
 }
@@ -1884,7 +1894,7 @@ static void drain_stock_fully(struct memcg_stock_pcp *stock)
 		drain_stock(stock, i);
 }
 
-static void drain_local_stock(struct work_struct *dummy)
+static void drain_local_memcg_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock;
 	unsigned long flags;
@@ -1892,16 +1902,30 @@ static void drain_local_stock(struct work_struct *dummy)
 	if (WARN_ONCE(!in_task(), "drain in non-task context"))
 		return;
 
-	local_lock_irqsave(&memcg_stock.obj_lock, flags);
-	stock = this_cpu_ptr(&memcg_stock);
-	drain_obj_stock(stock);
-	local_unlock_irqrestore(&memcg_stock.obj_lock, flags);
+	local_lock_irqsave(&memcg_stock.lock, flags);
 
-	local_lock_irqsave(&memcg_stock.memcg_lock, flags);
 	stock = this_cpu_ptr(&memcg_stock);
 	drain_stock_fully(stock);
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
-	local_unlock_irqrestore(&memcg_stock.memcg_lock, flags);
+
+	local_unlock_irqrestore(&memcg_stock.lock, flags);
+}
+
+static void drain_local_obj_stock(struct work_struct *dummy)
+{
+	struct obj_stock_pcp *stock;
+	unsigned long flags;
+
+	if (WARN_ONCE(!in_task(), "drain in non-task context"))
+		return;
+
+	local_lock_irqsave(&obj_stock.lock, flags);
+
+	stock = this_cpu_ptr(&obj_stock);
+	drain_obj_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+	local_unlock_irqrestore(&obj_stock.lock, flags);
 }
 
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -1924,10 +1948,10 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg));
 
 	if (nr_pages > MEMCG_CHARGE_BATCH ||
-	    !local_trylock_irqsave(&memcg_stock.memcg_lock, flags)) {
+	    !local_trylock_irqsave(&memcg_stock.lock, flags)) {
 		/*
 		 * In case of larger than batch refill or unlikely failure to
-		 * lock the percpu memcg_lock, uncharge memcg directly.
+		 * lock the percpu memcg_stock.lock, uncharge memcg directly.
 		 */
 		memcg_uncharge(memcg, nr_pages);
 		return;
@@ -1959,23 +1983,17 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 		WRITE_ONCE(stock->nr_pages[i], nr_pages);
 	}
 
-	local_unlock_irqrestore(&memcg_stock.memcg_lock, flags);
+	local_unlock_irqrestore(&memcg_stock.lock, flags);
 }
 
-static bool is_drain_needed(struct memcg_stock_pcp *stock,
-			    struct mem_cgroup *root_memcg)
+static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
+				  struct mem_cgroup *root_memcg)
 {
 	struct mem_cgroup *memcg;
 	bool flush = false;
 	int i;
 
 	rcu_read_lock();
-
-	if (obj_stock_flush_required(stock, root_memcg)) {
-		flush = true;
-		goto out;
-	}
-
 	for (i = 0; i < NR_MEMCG_STOCK; ++i) {
 		memcg = READ_ONCE(stock->cached[i]);
 		if (!memcg)
@@ -1987,7 +2005,6 @@ static bool is_drain_needed(struct memcg_stock_pcp *stock,
 			break;
 		}
 	}
-out:
 	rcu_read_unlock();
 	return flush;
 }
@@ -2012,15 +2029,27 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
 	migrate_disable();
 	curcpu = smp_processor_id();
 	for_each_online_cpu(cpu) {
-		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		bool flush = is_drain_needed(stock, root_memcg);
+		struct memcg_stock_pcp *memcg_st = &per_cpu(memcg_stock, cpu);
+		struct obj_stock_pcp *obj_st = &per_cpu(obj_stock, cpu);
 
-		if (flush &&
-		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+		if (!test_bit(FLUSHING_CACHED_CHARGE, &memcg_st->flags) &&
+		    is_memcg_drain_needed(memcg_st, root_memcg) &&
+		    !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+				      &memcg_st->flags)) {
 			if (cpu == curcpu)
-				drain_local_stock(&stock->work);
+				drain_local_memcg_stock(&memcg_st->work);
 			else if (!cpu_is_isolated(cpu))
-				schedule_work_on(cpu, &stock->work);
+				schedule_work_on(cpu, &memcg_st->work);
+		}
+
+		if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
+		    obj_stock_flush_required(obj_st, root_memcg) &&
+		    !test_and_set_bit(FLUSHING_CACHED_CHARGE,
+				      &obj_st->flags)) {
+			if (cpu == curcpu)
+				drain_local_obj_stock(&obj_st->work);
+			else if (!cpu_is_isolated(cpu))
+				schedule_work_on(cpu, &obj_st->work);
 		}
 	}
 	migrate_enable();
@@ -2029,18 +2058,18 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
 
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
-	struct memcg_stock_pcp *stock;
+	struct obj_stock_pcp *obj_st;
 	unsigned long flags;
 
-	stock = &per_cpu(memcg_stock, cpu);
+	obj_st = &per_cpu(obj_stock, cpu);
 
-	/* drain_obj_stock requires obj_lock */
-	local_lock_irqsave(&memcg_stock.obj_lock, flags);
-	drain_obj_stock(stock);
-	local_unlock_irqrestore(&memcg_stock.obj_lock, flags);
+	/* drain_obj_stock requires objstock.lock */
+	local_lock_irqsave(&obj_stock.lock, flags);
+	drain_obj_stock(obj_st);
+	local_unlock_irqrestore(&obj_stock.lock, flags);
 
 	/* no need for the local lock */
-	drain_stock_fully(stock);
+	drain_stock_fully(&per_cpu(memcg_stock, cpu));
 
 	return 0;
 }
@@ -2837,7 +2866,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 }
 
 static void __account_obj_stock(struct obj_cgroup *objcg,
-				struct memcg_stock_pcp *stock, int nr,
+				struct obj_stock_pcp *stock, int nr,
 				struct pglist_data *pgdat, enum node_stat_item idx)
 {
 	int *bytes;
@@ -2888,13 +2917,13 @@ static void __account_obj_stock(struct obj_cgroup *objcg,
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 			      struct pglist_data *pgdat, enum node_stat_item idx)
 {
-	struct memcg_stock_pcp *stock;
+	struct obj_stock_pcp *stock;
 	unsigned long flags;
 	bool ret = false;
 
-	local_lock_irqsave(&memcg_stock.obj_lock, flags);
+	local_lock_irqsave(&obj_stock.lock, flags);
 
-	stock = this_cpu_ptr(&memcg_stock);
+	stock = this_cpu_ptr(&obj_stock);
 	if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
 		stock->nr_bytes -= nr_bytes;
 		ret = true;
@@ -2903,12 +2932,12 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 		__account_obj_stock(objcg, stock, nr_bytes, pgdat, idx);
 	}
 
-	local_unlock_irqrestore(&memcg_stock.obj_lock, flags);
+	local_unlock_irqrestore(&obj_stock.lock, flags);
 
 	return ret;
 }
 
-static void drain_obj_stock(struct memcg_stock_pcp *stock)
+static void drain_obj_stock(struct obj_stock_pcp *stock)
 {
 	struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
 
@@ -2969,32 +2998,35 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
 	obj_cgroup_put(old);
 }
 
-static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+static bool obj_stock_flush_required(struct obj_stock_pcp *stock,
 				     struct mem_cgroup *root_memcg)
 {
 	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
 	struct mem_cgroup *memcg;
+	bool flush = false;
 
+	rcu_read_lock();
 	if (objcg) {
 		memcg = obj_cgroup_memcg(objcg);
 		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
-			return true;
+			flush = true;
 	}
+	rcu_read_unlock();
 
-	return false;
+	return flush;
 }
 
 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 			     bool allow_uncharge, int nr_acct,
 			     struct pglist_data *pgdat, enum node_stat_item idx)
 {
-	struct memcg_stock_pcp *stock;
+	struct obj_stock_pcp *stock;
 	unsigned long flags;
 	unsigned int nr_pages = 0;
 
-	local_lock_irqsave(&memcg_stock.obj_lock, flags);
+	local_lock_irqsave(&obj_stock.lock, flags);
 
-	stock = this_cpu_ptr(&memcg_stock);
+	stock = this_cpu_ptr(&obj_stock);
 	if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
 		drain_obj_stock(stock);
 		obj_cgroup_get(objcg);
@@ -3014,7 +3046,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 		stock->nr_bytes &= (PAGE_SIZE - 1);
 	}
 
-	local_unlock_irqrestore(&memcg_stock.obj_lock, flags);
+	local_unlock_irqrestore(&obj_stock.lock, flags);
 
 	if (nr_pages)
 		obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -5079,9 +5111,12 @@ int __init mem_cgroup_init(void)
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
 				  memcg_hotplug_cpu_dead);
 
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
-			  drain_local_stock);
+			  drain_local_memcg_stock);
+		INIT_WORK(&per_cpu_ptr(&obj_stock, cpu)->work,
+			  drain_local_obj_stock);
+	}
 
 	memcg_size = struct_size_t(struct mem_cgroup, nodeinfo, nr_node_ids);
 	memcg_cachep = kmem_cache_create("mem_cgroup", memcg_size, 0,
-- 
2.47.1
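
As a reading aid, here is a condensed sketch of the shape of the code
once this patch is applied: two independent per-cpu structures, each
with its own local_trylock_t and its own drain work item, so the
page-charge (memcg) path and the slab byte-charge (objcg) path never
contend on a shared lock. This is distilled from the diff above rather
than a verbatim excerpt of mm/memcontrol.c: the struct fields are
trimmed, the helper name consume_stock_sketch is made up for
illustration, and the MEMCG_CHARGE_BATCH check of the real
consume_stock() is omitted. It is not buildable standalone.

/* Condensed sketch of the decoupled per-cpu stocks (simplified). */
#include <linux/local_lock.h>
#include <linux/memcontrol.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/workqueue.h>

#define NR_MEMCG_STOCK		7
#define FLUSHING_CACHED_CHARGE	0

/* Page-charge cache: its own lock and its own drain work item. */
struct memcg_stock_pcp {
	local_trylock_t lock;
	uint8_t nr_pages[NR_MEMCG_STOCK];
	struct mem_cgroup *cached[NR_MEMCG_STOCK];
	struct work_struct work;
	unsigned long flags;		/* FLUSHING_CACHED_CHARGE bit */
};
static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = {
	.lock = INIT_LOCAL_TRYLOCK(lock),
};

/* Slab byte-charge cache: a separate lock and work item. */
struct obj_stock_pcp {
	local_trylock_t lock;		/* independent of memcg_stock.lock */
	unsigned int nr_bytes;
	struct obj_cgroup *cached_objcg;
	struct work_struct work;
	unsigned long flags;
};
static DEFINE_PER_CPU_ALIGNED(struct obj_stock_pcp, obj_stock) = {
	.lock = INIT_LOCAL_TRYLOCK(lock),
};

/*
 * Simplified from consume_stock() above: a failed trylock (e.g. a
 * reentrant caller on the same cpu) just reports a miss so the caller
 * charges the memcg directly instead of deadlocking.
 */
static bool consume_stock_sketch(struct mem_cgroup *memcg,
				 unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool ret = false;
	int i;

	if (!local_trylock_irqsave(&memcg_stock.lock, flags))
		return ret;	/* miss: caller takes the slow path */

	stock = this_cpu_ptr(&memcg_stock);
	for (i = 0; i < NR_MEMCG_STOCK; ++i) {
		uint8_t cached = READ_ONCE(stock->nr_pages[i]);

		if (memcg != READ_ONCE(stock->cached[i]))
			continue;
		if (cached >= nr_pages) {
			WRITE_ONCE(stock->nr_pages[i], cached - nr_pages);
			ret = true;
		}
		break;
	}

	local_unlock_irqrestore(&memcg_stock.lock, flags);
	return ret;
}

The design point is that a trylock failure on either stock falls back
to charging the counters directly, which is what lets each stock later
drop irq-disabling or gain nmi safety without involving the other.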