From nobody Sun Feb  8 16:42:44 2026
Received: from mail-pf1-f194.google.com (mail-pf1-f194.google.com
 [209.85.210.194])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 510651339A1;
	Thu, 25 Jan 2024 18:43:57 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.85.210.194
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706208242; cv=none;
 b=n9OF42a5cPT5Lguie12lNhWxlVUhyj9mO6c2GxV4NBePxjOTE1lmp0wqWyoMUbUg97kz1gtCEH5dlCxcjXFhErYjDCa+dAQxmXa5yPxV+RC8Z+ZUna9Yz/aNl0tkVbCfFQzKAjlIrS0P5gqwkqwgYlkmgS3QLp+3bmEQZ20DUAw=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706208242; c=relaxed/simple;
	bh=A/gwmbB4WkG+E8bBcS0jJdPNtFmTOMm9P1nybo3YutU=;
	h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
	 MIME-Version:Content-Type;
 b=bALPm4G8Srh3Q/K8tM365n3evE8SJAMU6qbdQfXwucTRb0ttM01os2952KnUsxX+iF9ddHQityVPL5aORDHx6i7LKv0UEQSSFT/5brEQi4yIDwM3+xBmxtPrg+lSvzcRoWgfESfv4nPB9Wk+WiKAe+scCaHzMR6LqItTnf3agwA=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com;
 spf=pass smtp.mailfrom=gmail.com;
 dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b=hifNVbpO; arc=none smtp.client-ip=209.85.210.194
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="hifNVbpO"
Received: by mail-pf1-f194.google.com with SMTP id
 d2e1a72fcca58-6dd85328325so2900256b3a.1;
        Thu, 25 Jan 2024 10:43:57 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20230601; t=1706208236; x=1706813036;
 darn=vger.kernel.org;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:from:to:cc:subject:date
         :message-id:reply-to;
        bh=hmYsuLrbhfeS9FkHVAV0PwAEzQixEgg+6xUQTYSEcCU=;
        b=hifNVbpOC+IlOedi1L35htrXItG0a/6V2RuZ0wZhMRCYAiKenm34MSzHOG0y33Ig9z
         OeMX/pTXWg/qrW4VnVQlV3S8+xvJtXGOecwUXXOMwMcI6rR8BXeynvR3Af9rPH0jyBc1
         6ovwsxpk18ZCiqZantFg4R+a+jeDgs3fnkSOQXpc8WeGLuxw62jkrxI0TSmQJ0b+nC7Z
         IOjW7mlbh/0K3bE5TDGm/XBwk9bXA3RJyxI3YbM8GJKrmM5ugBxZShcWEKz3yyTJu7f+
         NPwXq48IVa+ZIavPkwV0KeYzVABjO6Km7i8UH4O1wHVfOC4vTps+mcIb8q4NLz+jDmS9
         lCZQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20230601; t=1706208236; x=1706813036;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
         :subject:date:message-id:reply-to;
        bh=hmYsuLrbhfeS9FkHVAV0PwAEzQixEgg+6xUQTYSEcCU=;
        b=j2wF8hfIJv0ecmKKFiHY0qTGuT2wWJEHIgHZ3qLAzKW+S/+IM3mQ658Uz2DloY9Rqo
         couvvr6szS+0ZW8RJXT1PJ+7UOvQvQUCuBuvTtEGbPKsTNAB5crlZGCzJMbFb14cQTcs
         IMoRSShbdtUM8keRAFTv+IpX3JiaXTHb7IV1jdZkv15Oy2wysuysfkIpzr12YfCneLDa
         Ku5nm1vibEfICddU03dc/xM7s5/gYREH4dnTsWJ3cZccgCxEZ3TU0qXvXT17u+NIxOYf
         gzmj7RlqLW3cRAgRvlIm77SRIlhoLh+drtRReRhoesL7H04grQPr06b2Kywgv5vx9RO2
         a9FQ==
X-Gm-Message-State: AOJu0YzuP+NHIRRlORkEokGQqG4Uwtf+Dzy2jyug0XZ0QhODbbh+CGRl
	m2bBRUkVbGItdoMh96NFX4T2OjN60VQ4Zu7Jwl/D5aKcTL8fgI0=
X-Google-Smtp-Source: 
 AGHT+IEqg27yq8B6HkgUKSGyfBqEu8l7uYy3U7f2VU4V68PFsFDVS5Uu2mKsvyuU8Sy0tPawAHsrNg==
X-Received: by 2002:a05:6a00:db:b0:6dd:8891:81ef with SMTP id
 e27-20020a056a0000db00b006dd889181efmr134165pfj.43.1706208236405;
        Thu, 25 Jan 2024 10:43:56 -0800 (PST)
Received: from fedora.mshome.net (pool-173-79-56-208.washdc.fios.verizon.net.
 [173.79.56.208])
        by smtp.gmail.com with ESMTPSA id
 p14-20020aa7860e000000b006ddcf56fb78sm1815070pfn.62.2024.01.25.10.43.53
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Thu, 25 Jan 2024 10:43:56 -0800 (PST)
From: Gregory Price <gourry.memverge@gmail.com>
X-Google-Original-From: Gregory Price <gregory.price@memverge.com>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org,
	linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org,
	corbet@lwn.net,
	akpm@linux-foundation.org,
	gregory.price@memverge.com,
	honggyu.kim@sk.com,
	rakie.kim@sk.com,
	hyeongtak.ji@sk.com,
	mhocko@kernel.org,
	ying.huang@intel.com,
	vtavarespetr@micron.com,
	jgroves@micron.com,
	ravis.opensrc@micron.com,
	sthanneeru@micron.com,
	emirakhur@micron.com,
	Hasan.Maruf@amd.com,
	seungjun.ha@samsung.com,
	hannes@cmpxchg.org,
	dan.j.williams@intel.com
Subject: [PATCH v3 1/4] mm/mempolicy: implement the sysfs-based
 weighted_interleave interface
Date: Thu, 25 Jan 2024 13:43:42 -0500
Message-Id: <20240125184345.47074-2-gregory.price@memverge.com>
X-Mailer: git-send-email 2.39.1
In-Reply-To: <20240125184345.47074-1-gregory.price@memverge.com>
References: <20240125184345.47074-1-gregory.price@memverge.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: quoted-printable

From: Rakie Kim <rakie.kim@sk.com>

This patch provides a way to set interleave weight information under
sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN

The sysfs structure is designed as follows.

  $ tree /sys/kernel/mm/mempolicy/
  /sys/kernel/mm/mempolicy/ [1]
  =E2=94=94=E2=94=80=E2=94=80 weighted_interleave [2]
      =E2=94=9C=E2=94=80=E2=94=80 node0 [3]
      =E2=94=94=E2=94=80=E2=94=80 node1

Each file above can be explained as follows.

[1] mm/mempolicy: configuration interface for mempolicy subsystem

[2] weighted_interleave/: config interface for weighted interleave policy

[3] weighted_interleave/nodeN: weight for nodeN

If a node value is set to `0`, the system-default value will be used.
As of this patch, the system-default for all nodes is always 1.

Suggested-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Co-developed-by: Gregory Price <gregory.price@memverge.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Frank van der Linden <fvdl@google.com>
Suggested-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Hao Wang <haowang3@fb.com>
Suggested-by: Hasan Al Maruf <hasanalmaruf@fb.com>
Suggested-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: John Groves <john@jagalactic.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Suggested-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Suggested-by: Vinicius Tavares Petrucci <vtavarespetr@micron.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Suggested-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 .../ABI/testing/sysfs-kernel-mm-mempolicy     |   4 +
 ...fs-kernel-mm-mempolicy-weighted-interleave |  25 ++
 mm/mempolicy.c                                | 224 ++++++++++++++++++
 3 files changed, 253 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-wei=
ghted-interleave

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Document=
ation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 000000000000..2dcf24f4384a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@
+What:		/sys/kernel/mm/mempolicy/
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-i=
nterleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-in=
terleave
new file mode 100644
index 000000000000..0062b02703ff
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interlea=
ve
@@ -0,0 +1,25 @@
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/
+Date:		January 2024
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Configuration Interface for the Weighted Interleave policy
+
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date:		January 2024
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Weight configuration interface for nodeN
+
+		The interleave weight for a memory node (N). These weights are
+		utilized by tasks which have set their mempolicy to
+		MPOL_WEIGHTED_INTERLEAVE.
+
+		These weights only affect new allocations, and changes at runtime
+		will not cause migrations on already allocated pages.
+
+		The minimum weight for a node is always 1.
+
+		Minimum weight: 1
+		Maximum weight: 255
+
+		Writing an empty string or `0` will reset the weight to the
+		system default. The system default may be set by the kernel
+		or drivers at boot or during hotplug events.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..f1627d45b0c8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,6 +131,17 @@ static struct mempolicy default_policy =3D {
=20
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
=20
+/*
+ * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
+ * system-default value should be used. A NULL iw_table also denotes that
+ * system-default values should be used. Until the system-default table
+ * is implemented, the system-default is always 1.
+ *
+ * iw_table is RCU protected
+ */
+static u8 __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_lock);
+
 /**
  * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
@@ -3067,3 +3078,216 @@ void mpol_to_str(char *buffer, int maxlen, struct m=
empolicy *pol)
 		p +=3D scnprintf(p, buffer + maxlen - p, ":%*pbl",
 			       nodemask_pr_args(&nodes));
 }
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+	struct kobj_attribute kobj_attr;
+	int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	struct iw_node_attr *node_attr;
+	u8 weight;
+	u8 __rcu *table;
+
+	node_attr =3D container_of(attr, struct iw_node_attr, kobj_attr);
+
+	rcu_read_lock();
+	table =3D rcu_dereference(iw_table);
+	weight =3D table ? table[node_attr->nid] : 1;
+	rcu_read_unlock();
+
+	return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *att=
r,
+			  const char *buf, size_t count)
+{
+	struct iw_node_attr *node_attr;
+	u8 __rcu *new;
+	u8 __rcu *old;
+	u8 weight =3D 0;
+
+	node_attr =3D container_of(attr, struct iw_node_attr, kobj_attr);
+	if (count =3D=3D 0 || sysfs_streq(buf, ""))
+		weight =3D 0;
+	else if (kstrtou8(buf, 0, &weight))
+		return -EINVAL;
+
+	/*
+	 * The default weight is 1, for now. When the kernel-internal
+	 * default weight array is implemented, 0 will be a directive to
+	 * the allocators to use the system-default weight instead.
+	 */
+	if (!weight)
+		weight =3D 1;
+
+	new =3D kmalloc(nr_node_ids, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&iw_table_lock);
+	old =3D rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	if (old)
+		memcpy(new, old, nr_node_ids);
+	else
+		memset(new, 1, nr_node_ids);
+	new[node_attr->nid] =3D weight;
+	rcu_assign_pointer(iw_table, new);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+	return count;
+}
+
+static struct iw_node_attr **node_attrs;
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+				  struct kobject *parent)
+{
+	if (!node_attr)
+		return;
+	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+	kfree(node_attr->kobj_attr.attr.name);
+	kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+	int i;
+
+	for (i =3D 0; i < nr_node_ids; i++)
+		sysfs_wi_node_release(node_attrs[i], wi_kobj);
+	kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype =3D {
+	.sysfs_ops =3D &kobj_sysfs_ops,
+	.release =3D sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+	struct iw_node_attr *node_attr;
+	char *name;
+
+	node_attr =3D kzalloc(sizeof(*node_attr), GFP_KERNEL);
+	if (!node_attr)
+		return -ENOMEM;
+
+	name =3D kasprintf(GFP_KERNEL, "node%d", nid);
+	if (!name) {
+		kfree(node_attr);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&node_attr->kobj_attr.attr);
+	node_attr->kobj_attr.attr.name =3D name;
+	node_attr->kobj_attr.attr.mode =3D 0644;
+	node_attr->kobj_attr.show =3D node_show;
+	node_attr->kobj_attr.store =3D node_store;
+	node_attr->nid =3D nid;
+
+	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+		kfree(node_attr->kobj_attr.attr.name);
+		kfree(node_attr);
+		pr_err("failed to add attribute to weighted_interleave\n");
+		return -ENOMEM;
+	}
+
+	node_attrs[nid] =3D node_attr;
+	return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+	struct kobject *wi_kobj;
+	int nid, err;
+
+	wi_kobj =3D kzalloc(sizeof(struct kobject), GFP_KERNEL);
+	if (!wi_kobj)
+		return -ENOMEM;
+
+	err =3D kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+				   "weighted_interleave");
+	if (err) {
+		kfree(wi_kobj);
+		return err;
+	}
+
+	for_each_node_state(nid, N_POSSIBLE) {
+		err =3D add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			break;
+		}
+	}
+	if (err)
+		kobject_put(wi_kobj);
+	return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+	u8 __rcu *old;
+
+	mutex_lock(&iw_table_lock);
+	old =3D rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	rcu_assign_pointer(iw_table, NULL);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+	kfree(node_attrs);
+	kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype =3D {
+	.release =3D mempolicy_kobj_release
+};
+
+static int __init mempolicy_sysfs_init(void)
+{
+	int err;
+	static struct kobject *mempolicy_kobj;
+
+	mempolicy_kobj =3D kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+	if (!mempolicy_kobj) {
+		err =3D -ENOMEM;
+		goto err_out;
+	}
+
+	node_attrs =3D kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+			     GFP_KERNEL);
+	if (!node_attrs) {
+		err =3D -ENOMEM;
+		goto mempol_out;
+	}
+
+	err =3D kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+				   "mempolicy");
+	if (err)
+		goto node_out;
+
+	err =3D add_weighted_interleave_group(mempolicy_kobj);
+	if (err) {
+		pr_err("mempolicy sysfs structure failed to initialize\n");
+		kobject_put(mempolicy_kobj);
+		return err;
+	}
+
+	return err;
+node_out:
+	kfree(node_attrs);
+mempol_out:
+	kfree(mempolicy_kobj);
+err_out:
+	pr_err("failed to add mempolicy kobject to the system\n");
+	return err;
+}
+
+late_initcall(mempolicy_sysfs_init);
+#endif /* CONFIG_SYSFS */
--=20
2.39.1

From nobody Sun Feb  8 16:42:44 2026
Received: from mail-pg1-f195.google.com (mail-pg1-f195.google.com
 [209.85.215.195])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 068661758C;
	Thu, 25 Jan 2024 18:44:01 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.85.215.195
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706208243; cv=none;
 b=UvEKOGXB3KWUOpxc/VeRVH1tGgybi1HAEN+CueCS+Yvi30jHrcaj6x3+dvtTOILvi81sltyP9KBbR9rq3pJmZuKUUr1J6rngoET/bF99AH8ahn7qYb9ZZRyt56IGcMy9ZLi6/SQdWIRIeAecccNMcVLi5929jAJ9/lZnV29X8Fo=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706208243; c=relaxed/simple;
	bh=R+zdl8Rbvfn3XwhywHxnZHDY+Le4DTtVRLSN2sCUJaI=;
	h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
	 MIME-Version;
 b=c0E5qCWxdcqVecxdLyomBpDJ7bKYEOVuUUSk7YegryV6ldZyG7/SIlx+09cBxi94lAFc4YmhxOLLUxIdhgVk0VjF29WZ0AXufvwF3BI+MpNR1fJScKGv0uvf6zBS/et0za4z+A0cSCEnAKcFiBOKjy/mG9YDy0t88ETtzBvkDVs=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com;
 spf=pass smtp.mailfrom=gmail.com;
 dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b=GJO2T0t1; arc=none smtp.client-ip=209.85.215.195
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="GJO2T0t1"
Received: by mail-pg1-f195.google.com with SMTP id
 41be03b00d2f7-5d42e7ab8a9so1587338a12.3;
        Thu, 25 Jan 2024 10:44:01 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20230601; t=1706208241; x=1706813041;
 darn=vger.kernel.org;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:from:to:cc:subject:date
         :message-id:reply-to;
        bh=VzGhMbeDHhADN16eti5ZGSWn33ZQML077ijAcKWfxyI=;
        b=GJO2T0t14MC+4SpyJQNtD14qXCdZjx85Kklp6yx5yz8P4RIFbVnmILivCzJ9S2Khue
         3w66LWneNoTs+fe2xRRyKWqKpTHDREmJ1K/nb+1FLAQ8q2F/Lin5Z41Lz/ZtTAd64eSV
         6QqgpLbQwV98JpGwWn5qFEf16IKIlRKo2BosAJcphz/vrtGFybvPLTzQVRtOEKH9nYLb
         nJwwik1LrzFkNJU8bRvEEFEBm+/j5TsGOs64LSl+RMJJ5E6jFM5QXrShzMqtua6OkNgW
         JEnoSKt1pOT0jixYl0qgpU6q+d8lXHhDwTxOETWTn4l0XFjBPD7iH5RLmWYk0/lumLqs
         Kk6w==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20230601; t=1706208241; x=1706813041;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
         :subject:date:message-id:reply-to;
        bh=VzGhMbeDHhADN16eti5ZGSWn33ZQML077ijAcKWfxyI=;
        b=O3vuRouOG/Ce/GxoOYCP6TRJn0m8x/luJZOYUXQ1owJe2asvMrmrThgM9lbv754Ohn
         Lf4g+IOjQUTgLmYX6QkcHlGpp8ql4VtQorSFqJAwb2KVNe8Z7uQtl9dUrPUa+Lu6IWmL
         FuSS/dPejeV42ssiuQpX2eH2GxZqHsE95NO5hXVjKmStoSc5U4aG8xP52YdVjpGAwxKi
         4NY/V9hpaGYciy0mxag6ou9aSnalLIyEXTZjOuQG76zn1v8qSRuV6IQGdl4d2yjHH89e
         Wsq312u2awcwFdgf86Iou6KaWJCGzQPTGihq3x7zWM+NUx1KaOI3GlZjN9XM01CQZWaw
         xlRg==
X-Gm-Message-State: AOJu0Ywqa5IiWRI7/K73B0tNxVrLxrNfwa2zQbzOA+FJKpLx37Xa2Edn
	nDVzr0LOsuYfTN5SkuS1nGfjg81fSIBW2MHhnE19SyTyG8oTioc=
X-Google-Smtp-Source: 
 AGHT+IEcH1Tk+08R16sVhPM8oB55q9tIFfKiHt4xwLriTagfM09GAnko6cUAoLV1xiWOuP2rG1zkBg==
X-Received: by 2002:a05:6a20:1e52:b0:194:f8dd:4277 with SMTP id
 cy18-20020a056a201e5200b00194f8dd4277mr81192pzb.106.1706208241161;
        Thu, 25 Jan 2024 10:44:01 -0800 (PST)
Received: from fedora.mshome.net (pool-173-79-56-208.washdc.fios.verizon.net.
 [173.79.56.208])
        by smtp.gmail.com with ESMTPSA id
 p14-20020aa7860e000000b006ddcf56fb78sm1815070pfn.62.2024.01.25.10.43.58
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Thu, 25 Jan 2024 10:44:00 -0800 (PST)
From: Gregory Price <gourry.memverge@gmail.com>
X-Google-Original-From: Gregory Price <gregory.price@memverge.com>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org,
	linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org,
	corbet@lwn.net,
	akpm@linux-foundation.org,
	gregory.price@memverge.com,
	honggyu.kim@sk.com,
	rakie.kim@sk.com,
	hyeongtak.ji@sk.com,
	mhocko@kernel.org,
	ying.huang@intel.com,
	vtavarespetr@micron.com,
	jgroves@micron.com,
	ravis.opensrc@micron.com,
	sthanneeru@micron.com,
	emirakhur@micron.com,
	Hasan.Maruf@amd.com,
	seungjun.ha@samsung.com,
	hannes@cmpxchg.org,
	dan.j.williams@intel.com
Subject: [PATCH v3 2/4] mm/mempolicy: refactor a read-once mechanism into a
 function for re-use
Date: Thu, 25 Jan 2024 13:43:43 -0500
Message-Id: <20240125184345.47074-3-gregory.price@memverge.com>
X-Mailer: git-send-email 2.39.1
In-Reply-To: <20240125184345.47074-1-gregory.price@memverge.com>
References: <20240125184345.47074-1-gregory.price@memverge.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Move the use of barrier() to force policy->nodemask onto the stack into
a function `read_once_policy_nodemask` so that it may be re-used.

Suggested-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Frank van der Linden <fvdl@google.com>
Suggested-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Hao Wang <haowang3@fb.com>
Suggested-by: Hasan Al Maruf <hasanalmaruf@fb.com>
Suggested-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: John Groves <john@jagalactic.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Suggested-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Suggested-by: Vinicius Tavares Petrucci <vtavarespetr@micron.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Suggested-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 mm/mempolicy.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1627d45b0c8..b13c45a0bfcb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1894,6 +1894,20 @@ unsigned int mempolicy_slab_node(void)
 	}
 }
=20
+static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
+					      nodemask_t *mask)
+{
+	/*
+	 * barrier stabilizes the nodemask locally so that it can be iterated
+	 * over safely without concern for changes. Allocators validate node
+	 * selection does not violate mems_allowed, so this is safe.
+	 */
+	barrier();
+	memcpy(mask, &pol->nodes, sizeof(nodemask_t));
+	barrier();
+	return nodes_weight(*mask);
+}
+
 /*
  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
  * node in pol->nodes (starting from ilx=3D0), wrapping around if ilx
@@ -1901,20 +1915,12 @@ unsigned int mempolicy_slab_node(void)
  */
 static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
 {
-	nodemask_t nodemask =3D pol->nodes;
+	nodemask_t nodemask;
 	unsigned int target, nnodes;
 	int i;
 	int nid;
-	/*
-	 * The barrier will stabilize the nodemask in a register or on
-	 * the stack so that it will stop changing under the code.
-	 *
-	 * Between first_node() and next_node(), pol->nodes could be changed
-	 * by other threads. So we put pol->nodes in a local stack.
-	 */
-	barrier();
=20
-	nnodes =3D nodes_weight(nodemask);
+	nnodes =3D read_once_policy_nodemask(pol, &nodemask);
 	if (!nnodes)
 		return numa_node_id();
 	target =3D ilx % nnodes;
--=20
2.39.1
From nobody Sun Feb  8 16:42:44 2026
Received: from mail-pf1-f193.google.com (mail-pf1-f193.google.com
 [209.85.210.193])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0A74F135A5C;
	Thu, 25 Jan 2024 18:44:05 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.85.210.193
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706208248; cv=none;
 b=r8jJ6dSxest8tjO19eZFmWE/h7n3ru8NCjGlpq4jI0F9a35XzIt/KC8gI5Dt+Bq2Oz97rZODCqa3ilNhDSfKhdr2K9lIHN06IGaSEwyjHrnAqK72R4lMT9yJ6+zSS1udbkvH9rpDhYza54Wi2qCzKTX4fUMpL+m+AhIzlZYM0IE=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706208248; c=relaxed/simple;
	bh=4CXcduw/Xk9UrumRFbOxXT/jVuq9BdtzqZqCW2srHGU=;
	h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
	 MIME-Version;
 b=vCioD4Vjmf5R3wO7kJLSqqSLvM7k46wFm7jZDcro4Lv530I1LDY/cBrolZaNb2QK6TxrrlQB58yQVDnEitFeuHQ/rnUw9wo5zCMpaHhGtWTk19SkWbnjvitTfJVXg6zST9pZaEvIQcvYoRtLEquiqr+c3u3Rpc0g3cwGnwbtw2g=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com;
 spf=pass smtp.mailfrom=gmail.com;
 dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b=nYzXlcSV; arc=none smtp.client-ip=209.85.210.193
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="nYzXlcSV"
Received: by mail-pf1-f193.google.com with SMTP id
 d2e1a72fcca58-6ddc5faeb7fso1257103b3a.3;
        Thu, 25 Jan 2024 10:44:05 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20230601; t=1706208245; x=1706813045;
 darn=vger.kernel.org;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:from:to:cc:subject:date
         :message-id:reply-to;
        bh=XCM3+Jwfhpm+dmL8fIjOhTZIVTHnDTgH9yjGWopoEEE=;
        b=nYzXlcSVkc+oh5EIf+NErYZ/UnjILz1mSUL78SNN4aKr5wrCSrXgYPEABX8MCtROqV
         Hie/GeWDpH11sYQWN0SaCi0CU0gPvRLNdV1v3QR979CIas2KVt2Yb26O+tZnRJdN+r4a
         uKSi7Zk7uZxhDz/ctnv82fYuvW+yxkEjTgVtofnWyhZxbSewfHgvs+14k6Fee0DoBnbP
         cZz4zsEvvY39JA0Z37CPMUkY81ijvjHFXkGbCrrjTKzRENZ+gVF8+yQVRTsAbQrsJbO1
         p7r1kdR3eeEk/1VOWwwHGmyXtgGBWqkNnWv/L+jof409BiHhqEnjaOfrS/BKHVR0GWnW
         TiTA==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20230601; t=1706208245; x=1706813045;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
         :subject:date:message-id:reply-to;
        bh=XCM3+Jwfhpm+dmL8fIjOhTZIVTHnDTgH9yjGWopoEEE=;
        b=Po6Mg1vp571iftY9acegeRPh+n5BlO+ABXGwgg5YnYmPO0+lXzqXFHMzr9oGWSH9s9
         WKV0no06alHaEJgivl0hhFBbz4G4RYHTO3CbfO6+y33g8d7F61dsBBu9xv/kkchykxil
         MLPtEtiDbYZrO4gdOr6p3h6At4P0Y4gWEmDjiHwPAu4jTd1PHvljYB37IuL/16hm93gE
         YOIPPMbfQ8/tqawH3Z64w+r6nWZI4Of53lBkY/OyG4IBT8FAaT3QGqAAyDPaTaeFMzid
         6AV5gdDEsB/wryKb64HUeYOKX6OrH4CF4WadJc7CFQTTJ2hfIPF418T9BSWShePecGk4
         frGw==
X-Gm-Message-State: AOJu0Yw57J+62p0cXRXT85cU7tGhI2OqhNl5tumz9rHsxvvemDRZF2Bc
	rYa87DGBcgufW81/SnYnrOl4ZVXGW43S6dHFXlCKDYm8Dk0sOcbbCfGC2NIBF8Gn
X-Google-Smtp-Source: 
 AGHT+IG8IKF2LbF/tulleNXQK9a0buVlgJX+T6ILRE02yC4Dg04nb88+bqtRLNJHNefgetU7IyONPg==
X-Received: by 2002:a05:6a00:1d13:b0:6db:cdbc:311e with SMTP id
 a19-20020a056a001d1300b006dbcdbc311emr167814pfx.61.1706208245137;
        Thu, 25 Jan 2024 10:44:05 -0800 (PST)
Received: from fedora.mshome.net (pool-173-79-56-208.washdc.fios.verizon.net.
 [173.79.56.208])
        by smtp.gmail.com with ESMTPSA id
 p14-20020aa7860e000000b006ddcf56fb78sm1815070pfn.62.2024.01.25.10.44.02
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Thu, 25 Jan 2024 10:44:04 -0800 (PST)
From: Gregory Price <gourry.memverge@gmail.com>
X-Google-Original-From: Gregory Price <gregory.price@memverge.com>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org,
	linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org,
	corbet@lwn.net,
	akpm@linux-foundation.org,
	gregory.price@memverge.com,
	honggyu.kim@sk.com,
	rakie.kim@sk.com,
	hyeongtak.ji@sk.com,
	mhocko@kernel.org,
	ying.huang@intel.com,
	vtavarespetr@micron.com,
	jgroves@micron.com,
	ravis.opensrc@micron.com,
	sthanneeru@micron.com,
	emirakhur@micron.com,
	Hasan.Maruf@amd.com,
	seungjun.ha@samsung.com,
	hannes@cmpxchg.org,
	dan.j.williams@intel.com,
	Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Subject: [PATCH v3 3/4] mm/mempolicy: introduce MPOL_WEIGHTED_INTERLEAVE for
 weighted interleaving
Date: Thu, 25 Jan 2024 13:43:44 -0500
Message-Id: <20240125184345.47074-4-gregory.price@memverge.com>
X-Mailer: git-send-email 2.39.1
In-Reply-To: <20240125184345.47074-1-gregory.price@memverge.com>
References: <20240125184345.47074-1-gregory.price@memverge.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

When a system has multiple NUMA nodes and it becomes bandwidth hungry,
using the current MPOL_INTERLEAVE could be a wise option.

However, if those NUMA nodes consist of different types of memory such
as socket-attached DRAM and CXL/PCIe attached DRAM, the round-robin
based interleave policy does not optimally distribute data to make use
of their different bandwidth characteristics.

Instead, interleave is more effective when the allocation policy follows
each NUMA nodes' bandwidth weight rather than a simple 1:1 distribution.

This patch introduces a new memory policy, MPOL_WEIGHTED_INTERLEAVE,
enabling weighted interleave between NUMA nodes.  Weighted interleave
allows for proportional distribution of memory across multiple numa
nodes, preferably apportioned to match the bandwidth of each node.

For example, if a system has 1 CPU node (0), and 2 memory nodes (0,1),
with bandwidth of (100GB/s, 50GB/s) respectively, the appropriate
weight distribution is (2:1).

Weights for each node can be assigned via the new sysfs extension:
/sys/kernel/mm/mempolicy/weighted_interleave/

For now, the default value of all nodes will be `1`, which matches
the behavior of standard 1:1 round-robin interleave. An extension
will be added in the future to allow default values to be registered
at kernel and device bringup time.

The policy allocates a number of pages equal to the set weights. For
example, if the weights are (2,1), then 2 pages will be allocated on
node0 for every 1 page allocated on node1.

The new flag MPOL_WEIGHTED_INTERLEAVE can be used in set_mempolicy(2)
and mbind(2).

There are 3 integration points:

weighted_interleave_nodes:
    Counts the number of allocations as they occur, and applies the
    weight for the current node.  When the weight reaches 0, switch
    to the next node.

weighted_interleave_nid:
    Gets the total weight of the nodemask as well as each individual
    node weight, then calculates the node based on the given index.

bulk_array_weighted_interleave:
    Gets the total weight of the nodemask as well as each individual
    node weight, then calculates the number of "interleave rounds" as
    well as any delta ("partial round").  Calculates the number of
    pages for each node and allocates them.

    If a node was scheduled for interleave via interleave_nodes, the
    current weight (pol->cur_il_weight) will be allocated first, before
    the remaining bulk calculation is done.

One piece of complexity is the interaction between a recent refactor
which split the logic to acquire the "ilx" (interleave index) of an
allocation and the actual application of the interleave.  The
calculation of the `interleave index` is done by `get_vma_policy()`,
while the actual selection of the node will later be applied by the
relevant weighted_interleave function.

Suggested-by: Hasan Al Maruf <Hasan.Maruf@amd.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Co-developed-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Co-developed-by: Honggyu Kim <honggyu.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Co-developed-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Co-developed-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Signed-off-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Frank van der Linden <fvdl@google.com>
Suggested-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Hao Wang <haowang3@fb.com>
Suggested-by: Hasan Al Maruf <hasanalmaruf@fb.com>
Suggested-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: John Groves <john@jagalactic.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Suggested-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Suggested-by: Vinicius Tavares Petrucci <vtavarespetr@micron.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Suggested-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 .../admin-guide/mm/numa_memory_policy.rst     |   9 +
 include/linux/mempolicy.h                     |   3 +
 include/uapi/linux/mempolicy.h                |   1 +
 mm/mempolicy.c                                | 274 +++++++++++++++++-
 4 files changed, 283 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Document=
ation/admin-guide/mm/numa_memory_policy.rst
index eca38fa81e0f..a70f20ce1ffb 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -250,6 +250,15 @@ MPOL_PREFERRED_MANY
 	can fall back to all existing numa nodes. This is effectively
 	MPOL_PREFERRED allowed for a mask rather than a single node.
=20
+MPOL_WEIGHTED_INTERLEAVE
+	This mode operates the same as MPOL_INTERLEAVE, except that
+	interleaving behavior is executed based on weights set in
+	/sys/kernel/mm/mempolicy/weighted_interleave/
+
+	Weighted interleave allocates pages on nodes according to a
+	weight.  For example if nodes [0,1] are weighted [5,2], 5 pages
+	will be allocated on node0 for every 2 pages allocated on node1.
+
 NUMA memory policy supports the following optional mode flags:
=20
 MPOL_F_STATIC_NODES
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 931b118336f4..c644d7bbd396 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,6 +54,9 @@ struct mempolicy {
 		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
 		nodemask_t user_nodemask;	/* nodemask passed by user */
 	} w;
+
+	/* Weighted interleave settings */
+	u8 cur_il_weight;
 };
=20
 /*
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index a8963f7ef4c2..1f9bb10d1a47 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -23,6 +23,7 @@ enum {
 	MPOL_INTERLEAVE,
 	MPOL_LOCAL,
 	MPOL_PREFERRED_MANY,
+	MPOL_WEIGHTED_INTERLEAVE,
 	MPOL_MAX,	/* always last member of enum */
 };
=20
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b13c45a0bfcb..5a517511658e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -19,6 +19,13 @@
  *                for anonymous memory. For process policy an process coun=
ter
  *                is used.
  *
+ * weighted interleave
+ *                Allocate memory interleaved over a set of nodes based on
+ *                a set of weights (per-node), with normal fallback if it
+ *                fails.  Otherwise operates the same as interleave.
+ *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
+ *                on node 0 for every 1 page allocated on node 1.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
  *                FIXME: memory is allocated starting with the first node
@@ -314,6 +321,7 @@ static struct mempolicy *mpol_new(unsigned short mode, =
unsigned short flags,
 	policy->mode =3D mode;
 	policy->flags =3D flags;
 	policy->home_node =3D NUMA_NO_NODE;
+	policy->cur_il_weight =3D 0;
=20
 	return policy;
 }
@@ -426,6 +434,10 @@ static const struct mempolicy_operations mpol_ops[MPOL=
_MAX] =3D {
 		.create =3D mpol_new_nodemask,
 		.rebind =3D mpol_rebind_preferred,
 	},
+	[MPOL_WEIGHTED_INTERLEAVE] =3D {
+		.create =3D mpol_new_nodemask,
+		.rebind =3D mpol_rebind_nodemask,
+	},
 };
=20
 static bool migrate_folio_add(struct folio *folio, struct list_head *folio=
list,
@@ -847,7 +859,8 @@ static long do_set_mempolicy(unsigned short mode, unsig=
ned short flags,
=20
 	old =3D current->mempolicy;
 	current->mempolicy =3D new;
-	if (new && new->mode =3D=3D MPOL_INTERLEAVE)
+	if (new && (new->mode =3D=3D MPOL_INTERLEAVE ||
+		    new->mode =3D=3D MPOL_WEIGHTED_INTERLEAVE))
 		current->il_prev =3D MAX_NUMNODES-1;
 	task_unlock(current);
 	mpol_put(old);
@@ -873,6 +886,7 @@ static void get_policy_nodemask(struct mempolicy *pol, =
nodemask_t *nodes)
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		*nodes =3D pol->nodes;
 		break;
 	case MPOL_LOCAL:
@@ -957,6 +971,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *=
nmask,
 		} else if (pol =3D=3D current->mempolicy &&
 				pol->mode =3D=3D MPOL_INTERLEAVE) {
 			*policy =3D next_node_in(current->il_prev, pol->nodes);
+		} else if (pol =3D=3D current->mempolicy &&
+				(pol->mode =3D=3D MPOL_WEIGHTED_INTERLEAVE)) {
+			if (pol->cur_il_weight)
+				*policy =3D current->il_prev;
+			else
+				*policy =3D next_node_in(current->il_prev,
+						       pol->nodes);
 		} else {
 			err =3D -EINVAL;
 			goto out;
@@ -1769,7 +1790,8 @@ struct mempolicy *__get_vma_policy(struct vm_area_str=
uct *vma,
  * @vma: virtual memory area whose policy is sought
  * @addr: address in @vma for shared policy lookup
  * @order: 0, or appropriate huge_page_order for interleaving
- * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
+ * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
+ *       MPOL_WEIGHTED_INTERLEAVE
  *
  * Returns effective policy for a VMA at specified address.
  * Falls back to current->mempolicy or system default policy, as necessary.
@@ -1786,7 +1808,8 @@ struct mempolicy *get_vma_policy(struct vm_area_struc=
t *vma,
 	pol =3D __get_vma_policy(vma, addr, ilx);
 	if (!pol)
 		pol =3D get_task_policy(current);
-	if (pol->mode =3D=3D MPOL_INTERLEAVE) {
+	if (pol->mode =3D=3D MPOL_INTERLEAVE ||
+	    pol->mode =3D=3D MPOL_WEIGHTED_INTERLEAVE) {
 		*ilx +=3D vma->vm_pgoff >> order;
 		*ilx +=3D (addr - vma->vm_start) >> (PAGE_SHIFT + order);
 	}
@@ -1836,6 +1859,44 @@ bool apply_policy_zone(struct mempolicy *policy, enu=
m zone_type zone)
 	return zone >=3D dynamic_policy_zone;
 }
=20
+static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
+{
+	unsigned int node, next;
+	struct task_struct *me =3D current;
+	u8 __rcu *table;
+	u8 weight;
+
+	node =3D next_node_in(me->il_prev, policy->nodes);
+	if (node =3D=3D MAX_NUMNODES)
+		return node;
+
+	/* on first alloc after setting mempolicy, acquire first weight */
+	if (unlikely(!policy->cur_il_weight)) {
+		rcu_read_lock();
+		table =3D rcu_dereference(iw_table);
+		/* detect system-default values */
+		weight =3D table ? table[node] : 1;
+		policy->cur_il_weight =3D weight ? weight : 1;
+		rcu_read_unlock();
+	}
+
+	/* account for this allocation call */
+	policy->cur_il_weight--;
+
+	/* if now at 0, move to next node and set up that node's weight */
+	if (unlikely(!policy->cur_il_weight)) {
+		me->il_prev =3D node;
+		next =3D next_node_in(node, policy->nodes);
+		rcu_read_lock();
+		table =3D rcu_dereference(iw_table);
+		/* detect system-default values */
+		weight =3D table ? table[next] : 1;
+		policy->cur_il_weight =3D weight ? weight : 1;
+		rcu_read_unlock();
+	}
+	return node;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned int interleave_nodes(struct mempolicy *policy)
 {
@@ -1870,6 +1931,9 @@ unsigned int mempolicy_slab_node(void)
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(policy);
=20
+	case MPOL_WEIGHTED_INTERLEAVE:
+		return weighted_interleave_nodes(policy);
+
 	case MPOL_BIND:
 	case MPOL_PREFERRED_MANY:
 	{
@@ -1908,6 +1972,39 @@ static unsigned int read_once_policy_nodemask(struct=
 mempolicy *pol,
 	return nodes_weight(*mask);
 }
=20
+static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t=
 ilx)
+{
+	nodemask_t nodemask;
+	unsigned int target, nr_nodes;
+	u8 __rcu *table;
+	unsigned int weight_total =3D 0;
+	u8 weight;
+	int nid;
+
+	nr_nodes =3D read_once_policy_nodemask(pol, &nodemask);
+	if (!nr_nodes)
+		return numa_node_id();
+
+	rcu_read_lock();
+	table =3D rcu_dereference(iw_table);
+	/* calculate the total weight */
+	for_each_node_mask(nid, nodemask)
+		weight_total +=3D table ? table[nid] : 1;
+
+	/* Calculate the node offset based on totals */
+	target =3D ilx % weight_total;
+	nid =3D first_node(nodemask);
+	while (target) {
+		weight =3D table ? table[nid] : 1;
+		if (target < weight)
+			break;
+		target -=3D weight;
+		nid =3D next_node_in(nid, nodemask);
+	}
+	rcu_read_unlock();
+	return nid;
+}
+
 /*
  * Do static interleaving for interleave index @ilx.  Returns the ilx'th
  * node in pol->nodes (starting from ilx=3D0), wrapping around if ilx
@@ -1968,6 +2065,11 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct=
 mempolicy *pol,
 		*nid =3D (ilx =3D=3D NO_INTERLEAVE_INDEX) ?
 			interleave_nodes(pol) : interleave_nid(pol, ilx);
 		break;
+	case MPOL_WEIGHTED_INTERLEAVE:
+		*nid =3D (ilx =3D=3D NO_INTERLEAVE_INDEX) ?
+			weighted_interleave_nodes(pol) :
+			weighted_interleave_nid(pol, ilx);
+		break;
 	}
=20
 	return nodemask;
@@ -2029,6 +2131,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		*mask =3D mempolicy->nodes;
 		break;
=20
@@ -2128,7 +2231,8 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int=
 order,
 		 * If the policy is interleave or does not allow the current
 		 * node in its nodemask, we allocate the standard way.
 		 */
-		if (pol->mode !=3D MPOL_INTERLEAVE &&
+		if ((pol->mode !=3D MPOL_INTERLEAVE &&
+		    pol->mode !=3D MPOL_WEIGHTED_INTERLEAVE) &&
 		    (!nodemask || node_isset(nid, *nodemask))) {
 			/*
 			 * First, try to allocate THP only on local node, but
@@ -2264,6 +2368,156 @@ static unsigned long alloc_pages_bulk_array_interle=
ave(gfp_t gfp,
 	return total_allocated;
 }
=20
+static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+		struct mempolicy *pol, unsigned long nr_pages,
+		struct page **page_array)
+{
+	struct task_struct *me =3D current;
+	unsigned long total_allocated =3D 0;
+	unsigned long nr_allocated;
+	unsigned long rounds;
+	unsigned long node_pages, delta;
+	u8 weight, resume_weight;
+	u8 __rcu *table;
+	u8 *weights;
+	unsigned int weight_total =3D 0;
+	unsigned long rem_pages =3D nr_pages;
+	nodemask_t nodes;
+	int nnodes, node, resume_node, next_node;
+	int prev_node =3D me->il_prev;
+	int i;
+
+	if (!nr_pages)
+		return 0;
+
+	nnodes =3D read_once_policy_nodemask(pol, &nodes);
+	if (!nnodes)
+		return 0;
+
+	/* Continue allocating from most recent node and adjust the nr_pages */
+	if (pol->cur_il_weight) {
+		node =3D next_node_in(prev_node, nodes);
+		node_pages =3D pol->cur_il_weight;
+		if (node_pages > rem_pages)
+			node_pages =3D rem_pages;
+		nr_allocated =3D __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array +=3D nr_allocated;
+		total_allocated +=3D nr_allocated;
+		/*
+		 * if that's all the pages, no need to interleave, otherwise
+		 * we need to set up the next interleave node/weight correctly.
+		 */
+		if (rem_pages < pol->cur_il_weight) {
+			/* stay on current node, adjust cur_il_weight */
+			pol->cur_il_weight -=3D rem_pages;
+			return total_allocated;
+		} else if (rem_pages =3D=3D pol->cur_il_weight) {
+			/* move to next node / weight */
+			me->il_prev =3D node;
+			next_node =3D next_node_in(node, nodes);
+			rcu_read_lock();
+			table =3D rcu_dereference(iw_table);
+			weight =3D table ? table[next_node] : 1;
+			/* detect system-default usage */
+			pol->cur_il_weight =3D weight ? weight : 1;
+			rcu_read_unlock();
+			return total_allocated;
+		}
+		/* Otherwise we adjust nr_pages down, and continue from there */
+		rem_pages -=3D pol->cur_il_weight;
+		pol->cur_il_weight =3D 0;
+		prev_node =3D node;
+	}
+
+	/* create a local copy of node weights to operate on outside rcu */
+	weights =3D kmalloc(nr_node_ids, GFP_KERNEL);
+	if (!weights)
+		return total_allocated;
+
+	rcu_read_lock();
+	table =3D rcu_dereference(iw_table);
+	/* If table is not registered, use system defaults */
+	if (table)
+		memcpy(weights, iw_table, nr_node_ids);
+	else
+		memset(weights, 1, nr_node_ids);
+	rcu_read_unlock();
+
+	/* calculate total, detect system default usage */
+	for_each_node_mask(node, nodes) {
+		/* detect system-default usage */
+		if (!weights[node])
+			weights[node] =3D 1;
+		weight_total +=3D weights[node];
+	}
+
+	/*
+	 * Now we can continue allocating from 0 instead of an offset
+	 * We calculate the number of rounds and any partial rounds so
+	 * that we minimize the number of calls to __alloc_pages_bulk
+	 * This requires us to track which node we should resume from.
+	 *
+	 * if (rounds > 0) and (delta =3D=3D 0), resume_node will always be
+	 * the current value of prev_node, which may be NUMA_NO_NODE if
+	 * this is the first allocation after a policy is replaced. The
+	 * resume weight will be the weight of the next node.
+	 *
+	 * if (delta > 0) and delta is depleted exactly on a node-weight
+	 * boundary, resume node will be the node last allocated from when
+	 * delta reached 0.
+	 *
+	 * if (delta > 0) and delta is not depleted on a node-weight boundary,
+	 * resume node will be the node prior to the node last allocated from.
+	 *
+	 * (rounds =3D=3D 0) and (delta =3D=3D 0) is not possible (earlier exit)
+	 */
+	rounds =3D rem_pages / weight_total;
+	delta =3D rem_pages % weight_total;
+	resume_node =3D prev_node;
+	resume_weight =3D weights[next_node_in(prev_node, nodes)];
+	/* If no delta, we'll resume from current prev_node and first weight */
+	for (i =3D 0; i < nnodes; i++) {
+		node =3D next_node_in(prev_node, nodes);
+		weight =3D weights[node];
+		node_pages =3D weight * rounds;
+		/* If a delta exists, add this node's portion of the delta */
+		if (delta > weight) {
+			node_pages +=3D weight;
+			delta -=3D weight;
+			resume_node =3D node;
+		} else if (delta) {
+			node_pages +=3D delta;
+			if (delta =3D=3D weight) {
+				/* resume from next node with its weight */
+				resume_node =3D node;
+				next_node =3D next_node_in(node, nodes);
+				resume_weight =3D weights[next_node];
+			} else {
+				/* resume from this node w/ remaining weight */
+				resume_node =3D prev_node;
+				resume_weight =3D weight - (node_pages % weight);
+			}
+			delta =3D 0;
+		}
+		/* node_pages can be 0 if an allocation fails and rounds =3D=3D 0 */
+		if (!node_pages)
+			break;
+		nr_allocated =3D __alloc_pages_bulk(gfp, node, NULL, node_pages,
+						  NULL, page_array);
+		page_array +=3D nr_allocated;
+		total_allocated +=3D nr_allocated;
+		if (total_allocated =3D=3D nr_pages)
+			break;
+		prev_node =3D node;
+	}
+	/* resume allocating from the calculated node and weight */
+	me->il_prev =3D resume_node;
+	pol->cur_il_weight =3D resume_weight;
+	kfree(weights);
+	return total_allocated;
+}
+
 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int =
nid,
 		struct mempolicy *pol, unsigned long nr_pages,
 		struct page **page_array)
@@ -2304,6 +2558,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t=
 gfp,
 		return alloc_pages_bulk_array_interleave(gfp, pol,
 							 nr_pages, page_array);
=20
+	if (pol->mode =3D=3D MPOL_WEIGHTED_INTERLEAVE)
+		return alloc_pages_bulk_array_weighted_interleave(
+				  gfp, pol, nr_pages, page_array);
+
 	if (pol->mode =3D=3D MPOL_PREFERRED_MANY)
 		return alloc_pages_bulk_array_preferred_many(gfp,
 				numa_node_id(), pol, nr_pages, page_array);
@@ -2379,6 +2637,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempoli=
cy *b)
 	case MPOL_INTERLEAVE:
 	case MPOL_PREFERRED:
 	case MPOL_PREFERRED_MANY:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		return !!nodes_equal(a->nodes, b->nodes);
 	case MPOL_LOCAL:
 		return true;
@@ -2515,6 +2774,10 @@ int mpol_misplaced(struct folio *folio, struct vm_ar=
ea_struct *vma,
 		polnid =3D interleave_nid(pol, ilx);
 		break;
=20
+	case MPOL_WEIGHTED_INTERLEAVE:
+		polnid =3D weighted_interleave_nid(pol, ilx);
+		break;
+
 	case MPOL_PREFERRED:
 		if (node_isset(curnid, pol->nodes))
 			goto out;
@@ -2889,6 +3152,7 @@ static const char * const policy_modes[] =3D
 	[MPOL_PREFERRED]  =3D "prefer",
 	[MPOL_BIND]       =3D "bind",
 	[MPOL_INTERLEAVE] =3D "interleave",
+	[MPOL_WEIGHTED_INTERLEAVE] =3D "weighted interleave",
 	[MPOL_LOCAL]      =3D "local",
 	[MPOL_PREFERRED_MANY]  =3D "prefer (many)",
 };
@@ -2948,6 +3212,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
 		}
 		break;
 	case MPOL_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		/*
 		 * Default to online nodes with memory if no nodelist
 		 */
@@ -3058,6 +3323,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mem=
policy *pol)
 	case MPOL_PREFERRED_MANY:
 	case MPOL_BIND:
 	case MPOL_INTERLEAVE:
+	case MPOL_WEIGHTED_INTERLEAVE:
 		nodes =3D pol->nodes;
 		break;
 	default:
--=20
2.39.1
From nobody Sun Feb  8 16:42:44 2026
Received: from mail-pf1-f193.google.com (mail-pf1-f193.google.com
 [209.85.210.193])
	(using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 348EB135A7C;
	Thu, 25 Jan 2024 18:44:10 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=209.85.210.193
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1706208252; cv=none;
 b=GT4wFuDDtzlxJmRJj737ptsGSskpf2l6pFb4wthOQmDA9ewf+6++9yocO2xKKZPJWYmhg9YcUL85MoZos6NCHHwvAeTegR49Zyw6w6mm/BRhkHccmhc9CwiQBJh8hm+amy7MVSyYKPhxy4mHmpB/YFglOynB//GHBDiAU6Frztg=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1706208252; c=relaxed/simple;
	bh=ecRKWJQhY4wR39DMh1DAJeETva8Uc2luxfi1zh9+yS0=;
	h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References:
	 MIME-Version;
 b=QJcqk7kWFDSAd5fPPCbWhstVe/++1lsK7UdsBFjpCiInCqvZDOVoZGLrAns1bg02LRs4DL4F4GMDh71puOKrfxLdanFlhfzYrvc4MgYWDA1uhaOF+LfwVafC9RZDngFdgJXuegsIA59hyj2cVJVw7/DvIeEPeIP3dvBgSgIavLs=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com;
 spf=pass smtp.mailfrom=gmail.com;
 dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b=EDrK9D2g; arc=none smtp.client-ip=209.85.210.193
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=gmail.com
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=gmail.com header.i=@gmail.com
 header.b="EDrK9D2g"
Received: by mail-pf1-f193.google.com with SMTP id
 d2e1a72fcca58-6dc6f47302bso3586231b3a.1;
        Thu, 25 Jan 2024 10:44:10 -0800 (PST)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=gmail.com; s=20230601; t=1706208250; x=1706813050;
 darn=vger.kernel.org;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:from:to:cc:subject:date
         :message-id:reply-to;
        bh=aiHyh81A5rrvisQFQaV58YSJpwtynFeoOhG26thY/oo=;
        b=EDrK9D2gKSAxlpFG5CvxVTYxQe6XmbOFPscf7bK3r+pVUP1GGs6bVRzXYBrZ3jlis5
         b/CcoRKM4tBYEluwKO7O4UFhjh5Xi3DwZ6DHiOAZIVCsK5stY2+10HMfMQ6YaS4Uy+7u
         XScPz/1kpYq0GAWDCiLlbG2ISNoIZYGNFvhatxuIdVOZXcaFHicALuWqBQgCr+Ko2/YY
         t7otp4v57iQ2zzgFauaZAW2JkExSvvr2CivllkGP+951CgfLfzfFkBIFZBHI4B4Tuixr
         bW/r7MDGlvzV+eYZ6MooN97UG7v9snd87c6PfL2cArh7shLBwLMN1Z3RYMxyzHHxddXf
         GZKQ==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20230601; t=1706208250; x=1706813050;
        h=content-transfer-encoding:mime-version:references:in-reply-to
         :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
         :subject:date:message-id:reply-to;
        bh=aiHyh81A5rrvisQFQaV58YSJpwtynFeoOhG26thY/oo=;
        b=CMHod1dmUhJmTn6HwIfBvqAPgLPcnlWHB99XPZW4oR4nziXg+SexhQHw+jwSRoz5HZ
         pBtKrzKYSpCMXJ0+qHssB+aqG67ehtVw8PFLzXcGLQzV0ySpfkkkHwZXz1XFZltVWyXO
         mNazbEg6OzlcEFuICSdaH3oGPbp9oQ5yBVcWTrggJa+Tc0TmS5pl+Q5hT10P6DlgQ7YL
         CiVuoETGIgawYImAbTD1ewsGTTDNSXAGZfuUaLjzMO5fe8gRiM+lfXNBDyx1zjkaq+W/
         1SB0Z7SApOxO0sAxwIN9hfhAd1II9axzB9ulZNCzXP3CepPuHtAP7Hz3nuS3wvofh+wr
         Sj/w==
X-Gm-Message-State: AOJu0Yyi46gdCDD8JfZ2Obwq5A0YiSx8im1R6UODbzPI3Iv6kk7LOTcD
	whHYBI/78z7stWZQSg8U+dMaRp0BOIWRdjyp0PBzULgZwKyHA7Q=
X-Google-Smtp-Source: 
 AGHT+IGg7DSmaG2TFNg0cpW9ZTaAi3nnBJTCM2ZL3E6BfG30NMueGRRYhoyixqtu5WQyXV2EyJrwbQ==
X-Received: by 2002:aa7:8a07:0:b0:6d0:8b0f:1091 with SMTP id
 m7-20020aa78a07000000b006d08b0f1091mr108795pfa.30.1706208248914;
        Thu, 25 Jan 2024 10:44:08 -0800 (PST)
Received: from fedora.mshome.net (pool-173-79-56-208.washdc.fios.verizon.net.
 [173.79.56.208])
        by smtp.gmail.com with ESMTPSA id
 p14-20020aa7860e000000b006ddcf56fb78sm1815070pfn.62.2024.01.25.10.44.06
        (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
        Thu, 25 Jan 2024 10:44:08 -0800 (PST)
From: Gregory Price <gourry.memverge@gmail.com>
X-Google-Original-From: Gregory Price <gregory.price@memverge.com>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org,
	linux-fsdevel@vger.kernel.org,
	linux-api@vger.kernel.org,
	corbet@lwn.net,
	akpm@linux-foundation.org,
	gregory.price@memverge.com,
	honggyu.kim@sk.com,
	rakie.kim@sk.com,
	hyeongtak.ji@sk.com,
	mhocko@kernel.org,
	ying.huang@intel.com,
	vtavarespetr@micron.com,
	jgroves@micron.com,
	ravis.opensrc@micron.com,
	sthanneeru@micron.com,
	emirakhur@micron.com,
	Hasan.Maruf@amd.com,
	seungjun.ha@samsung.com,
	hannes@cmpxchg.org,
	dan.j.williams@intel.com
Subject: [PATCH v3 4/4] mm/mempolicy: change cur_il_weight to atomic and carry
 the node with it
Date: Thu, 25 Jan 2024 13:43:45 -0500
Message-Id: <20240125184345.47074-5-gregory.price@memverge.com>
X-Mailer: git-send-email 2.39.1
In-Reply-To: <20240125184345.47074-1-gregory.price@memverge.com>
References: <20240125184345.47074-1-gregory.price@memverge.com>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

In the prior patch, we carry only the current weight for a weighted
interleave round with us across calls through the allocator path.

node =3D next_node_in(current->il_prev, pol->nodemask)
pol->cur_il_weight <--- this weight applies to the above node

This separation of data can cause a race condition.

If a cgroup-initiated task migration or mems_allowed change occurs
from outside the context of the task, this can cause the weight to
become stale, meaning we may end up using that weight to allocate
memory on the wrong node.

Example:
  1) task A sets (cur_il_weight =3D 8) and (current->il_prev) to
     node0. node1 is the next set bit in pol->nodemask
  2) rebind event occurs, removing node1 from the nodemask.
     node2 is now the next set bit in pol->nodemask
     cur_il_weight is now stale.
  3) allocation occurs, next_node_in(il_prev, nodes) returns
     node2. cur_il_weight is now applied to the wrong node.

The upper level allocator logic must still enforce mems_allowed,
so this isn't dangerous, but it is inaccurate.

Just clearing the weight is insufficient, as it creates two more
race conditions.  The root of the issue is the separation of weight
and node data between nodemask and cur_il_weight.

To solve this, update cur_il_weight to be an atomic_t, and place the
node that the weight applies to in the upper bits of the field:

atomic_t cur_il_weight
	node bits 31:8
	weight bits 7:0

Now retrieving or clearing the active interleave node and weight
is a single atomic operation, and we are not dependent on the
potentially changing state of (pol->nodemask) to determine what
node the weight applies to.

Two special observations:
- if the weight is non-zero, cur_il_weight must *always* have a
  valid node number, e.g. it cannot be NUMA_NO_NODE (-1).
  This is because the node's top byte is given up to make room
  for the weight in the low byte of the field.

- MAX_NUMNODES is presently limited to 1024 or less on every
  architecture. This would permanently limit MAX_NUMNODES to
  an absolute maximum of (1 << 24) to avoid overflows.

Per some reading and discussion, it appears that max nodes is
limited to 1024 so that zone type still fits in page flags, so
this method seemed preferable compared to the alternatives of
trying to make all or part of mempolicy RCU protected (which
may not be possible, since it is often referenced during code
chunks which call operations that may sleep).

Signed-off-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Andi Kleen <ak@linux.intel.com>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Frank van der Linden <fvdl@google.com>
Suggested-by: Gregory Price <gregory.price@memverge.com>
Suggested-by: Hao Wang <haowang3@fb.com>
Suggested-by: Hasan Al Maruf <hasanalmaruf@fb.com>
Suggested-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Suggested-by: John Groves <john@jagalactic.com>
Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
Suggested-by: Michal Hocko <mhocko@suse.com>
Suggested-by: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Suggested-by: Srinivasulu Thanneeru <sthanneeru@micron.com>
Suggested-by: Vinicius Tavares Petrucci <vtavarespetr@micron.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Suggested-by: Zhongkun He <hezhongkun.hzk@bytedance.com>
---
 include/linux/mempolicy.h |  2 +-
 mm/mempolicy.c            | 93 +++++++++++++++++++++++++--------------
 2 files changed, 61 insertions(+), 34 deletions(-)

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index c644d7bbd396..8108fc6e96ca 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -56,7 +56,7 @@ struct mempolicy {
 	} w;
=20
 	/* Weighted interleave settings */
-	u8 cur_il_weight;
+	atomic_t cur_il_weight;
 };
=20
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5a517511658e..41b5fef0a6f5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -321,7 +321,7 @@ static struct mempolicy *mpol_new(unsigned short mode, =
unsigned short flags,
 	policy->mode =3D mode;
 	policy->flags =3D flags;
 	policy->home_node =3D NUMA_NO_NODE;
-	policy->cur_il_weight =3D 0;
+	atomic_set(&policy->cur_il_weight, 0);
=20
 	return policy;
 }
@@ -356,6 +356,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol,=
 const nodemask_t *nodes)
 		tmp =3D *nodes;
=20
 	pol->nodes =3D tmp;
+	atomic_set(&pol->cur_il_weight, 0);
 }
=20
 static void mpol_rebind_preferred(struct mempolicy *pol,
@@ -973,8 +974,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *=
nmask,
 			*policy =3D next_node_in(current->il_prev, pol->nodes);
 		} else if (pol =3D=3D current->mempolicy &&
 				(pol->mode =3D=3D MPOL_WEIGHTED_INTERLEAVE)) {
-			if (pol->cur_il_weight)
-				*policy =3D current->il_prev;
+			int cweight =3D atomic_read(&pol->cur_il_weight);
+
+			if (cweight & 0xFF)
+				*policy =3D cweight >> 8;
 			else
 				*policy =3D next_node_in(current->il_prev,
 						       pol->nodes);
@@ -1864,36 +1867,48 @@ static unsigned int weighted_interleave_nodes(struc=
t mempolicy *policy)
 	unsigned int node, next;
 	struct task_struct *me =3D current;
 	u8 __rcu *table;
+	int cur_weight;
 	u8 weight;
=20
-	node =3D next_node_in(me->il_prev, policy->nodes);
-	if (node =3D=3D MAX_NUMNODES)
-		return node;
+	cur_weight =3D atomic_read(&policy->cur_il_weight);
+	node =3D cur_weight >> 8;
+	weight =3D cur_weight & 0xff;
=20
-	/* on first alloc after setting mempolicy, acquire first weight */
-	if (unlikely(!policy->cur_il_weight)) {
+	/* If nodemask was rebound, just fetch the next node */
+	if (!weight || !node_isset(node, policy->nodes)) {
+		node =3D next_node_in(me->il_prev, policy->nodes);
+		/* can only happen if nodemask has become invalid */
+		if (node =3D=3D MAX_NUMNODES)
+			return node;
 		rcu_read_lock();
 		table =3D rcu_dereference(iw_table);
 		/* detect system-default values */
 		weight =3D table ? table[node] : 1;
-		policy->cur_il_weight =3D weight ? weight : 1;
+		weight =3D weight ? weight : 1;
 		rcu_read_unlock();
 	}
=20
 	/* account for this allocation call */
-	policy->cur_il_weight--;
+	weight--;
=20
 	/* if now at 0, move to next node and set up that node's weight */
-	if (unlikely(!policy->cur_il_weight)) {
+	if (unlikely(!weight)) {
 		me->il_prev =3D node;
 		next =3D next_node_in(node, policy->nodes);
-		rcu_read_lock();
-		table =3D rcu_dereference(iw_table);
-		/* detect system-default values */
-		weight =3D table ? table[next] : 1;
-		policy->cur_il_weight =3D weight ? weight : 1;
-		rcu_read_unlock();
-	}
+		if (next !=3D MAX_NUMNODES) {
+			rcu_read_lock();
+			table =3D rcu_dereference(iw_table);
+			/* detect system-default values */
+			weight =3D table ? table[next] : 1;
+			weight =3D weight ? weight : 1;
+			rcu_read_unlock();
+			cur_weight =3D (next << 8) | weight;
+		} else /* policy->nodes became invalid */
+			cur_weight =3D 0;
+	} else
+		cur_weight =3D (node << 8) | weight;
+
+	atomic_set(&policy->cur_il_weight, cur_weight);
 	return node;
 }
=20
@@ -2385,6 +2400,7 @@ static unsigned long alloc_pages_bulk_array_weighted_=
interleave(gfp_t gfp,
 	nodemask_t nodes;
 	int nnodes, node, resume_node, next_node;
 	int prev_node =3D me->il_prev;
+	int cur_node_and_weight =3D atomic_read(&pol->cur_il_weight);
 	int i;
=20
 	if (!nr_pages)
@@ -2394,10 +2410,11 @@ static unsigned long alloc_pages_bulk_array_weighte=
d_interleave(gfp_t gfp,
 	if (!nnodes)
 		return 0;
=20
+	node =3D cur_node_and_weight >> 8;
+	weight =3D cur_node_and_weight & 0xff;
 	/* Continue allocating from most recent node and adjust the nr_pages */
-	if (pol->cur_il_weight) {
-		node =3D next_node_in(prev_node, nodes);
-		node_pages =3D pol->cur_il_weight;
+	if (weight && node_isset(node, nodes)) {
+		node_pages =3D weight;
 		if (node_pages > rem_pages)
 			node_pages =3D rem_pages;
 		nr_allocated =3D __alloc_pages_bulk(gfp, node, NULL, node_pages,
@@ -2408,27 +2425,36 @@ static unsigned long alloc_pages_bulk_array_weighte=
d_interleave(gfp_t gfp,
 		 * if that's all the pages, no need to interleave, otherwise
 		 * we need to set up the next interleave node/weight correctly.
 		 */
-		if (rem_pages < pol->cur_il_weight) {
+		if (rem_pages < weight) {
 			/* stay on current node, adjust cur_il_weight */
-			pol->cur_il_weight -=3D rem_pages;
+			weight -=3D rem_pages;
+			atomic_set(&pol->cur_il_weight, ((node << 8) | weight));
 			return total_allocated;
-		} else if (rem_pages =3D=3D pol->cur_il_weight) {
+		} else if (rem_pages =3D=3D weight) {
 			/* move to next node / weight */
 			me->il_prev =3D node;
 			next_node =3D next_node_in(node, nodes);
-			rcu_read_lock();
-			table =3D rcu_dereference(iw_table);
-			weight =3D table ? table[next_node] : 1;
-			/* detect system-default usage */
-			pol->cur_il_weight =3D weight ? weight : 1;
-			rcu_read_unlock();
+			if (next_node =3D=3D MAX_NUMNODES) {
+				next_node =3D 0;
+				weight =3D 0;
+			} else {
+				rcu_read_lock();
+				table =3D rcu_dereference(iw_table);
+				weight =3D table ? table[next_node] : 1;
+				/* detect system-default usage */
+				weight =3D weight ? weight : 1;
+				rcu_read_unlock();
+			}
+			atomic_set(&pol->cur_il_weight,
+				   ((next_node << 8) | weight));
 			return total_allocated;
 		}
 		/* Otherwise we adjust nr_pages down, and continue from there */
-		rem_pages -=3D pol->cur_il_weight;
-		pol->cur_il_weight =3D 0;
+		rem_pages -=3D weight;
 		prev_node =3D node;
 	}
+	/* clear cur_il_weight in case of an allocation failure */
+	atomic_set(&pol->cur_il_weight, 0);
=20
 	/* create a local copy of node weights to operate on outside rcu */
 	weights =3D kmalloc(nr_node_ids, GFP_KERNEL);
@@ -2513,7 +2539,8 @@ static unsigned long alloc_pages_bulk_array_weighted_=
interleave(gfp_t gfp,
 	}
 	/* resume allocating from the calculated node and weight */
 	me->il_prev =3D resume_node;
-	pol->cur_il_weight =3D resume_weight;
+	resume_node =3D next_node_in(resume_node, nodes);
+	atomic_set(&pol->cur_il_weight, ((resume_node << 8) | resume_weight));
 	kfree(weights);
 	return total_allocated;
 }
--=20
2.39.1