[PATCH v2 1/3] mm/mempolicy: implement the sysfs-based weighted_interleave interface
Posted by Gregory Price 1 year, 11 months ago
From: Rakie Kim <rakie.kim@sk.com>

This patch provides a way to set interleave weight information under
sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN

The sysfs structure is designed as follows.

  $ tree /sys/kernel/mm/mempolicy/
  /sys/kernel/mm/mempolicy/ [1]
  └── weighted_interleave [2]
      ├── node0 [3]
      └── node1

Each file above can be explained as follows.

[1] mm/mempolicy: configuration interface for mempolicy subsystem

[2] weighted_interleave/: config interface for weighted interleave policy

[3] weighted_interleave/nodeN: weight for nodeN

If a node value is set to `0`, the system-default value will be used.
As of this patch, the system-default for all nodes is always 1.
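
For example (illustrative only, not part of this patch; error handling
kept minimal), a weight of 4 can be assigned to node0 from userspace by
writing to its sysfs file:

  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/sys/kernel/mm/mempolicy/weighted_interleave/node0",
                        O_WRONLY);

          if (fd < 0)
                  return 1;
          /* accepted values are 1-255; writing "0" or "" resets to default */
          if (write(fd, "4", 1) != 1)
                  return 1;
          close(fd);
          return 0;
  }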

Suggested-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
Co-developed-by: Gregory Price <gregory.price@memverge.com>
Signed-off-by: Gregory Price <gregory.price@memverge.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
---
 .../ABI/testing/sysfs-kernel-mm-mempolicy     |   4 +
 ...fs-kernel-mm-mempolicy-weighted-interleave |  26 ++
 mm/mempolicy.c                                | 231 ++++++++++++++++++
 3 files changed, 261 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 000000000000..2dcf24f4384a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@
+What:		/sys/kernel/mm/mempolicy/
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 000000000000..e6a38139bf0f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,26 @@
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Configuration Interface for the Weighted Interleave policy
+
+What:		/sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date:		December 2023
+Contact:	Linux memory management mailing list <linux-mm@kvack.org>
+Description:	Weight configuration interface for nodeN
+
+		The interleave weight for a memory node (N). These weights are
+		utilized by processes which have set their mempolicy to
+		MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by
+		omitting a task-local weight array.
+
+		These weights only affect new allocations, and changes at runtime
+		will not cause migrations on already allocated pages.
+
+		The minimum weight for a node is always 1.
+
+		Minimum weight: 1
+		Maximum weight: 255
+
+		Writing an empty string or `0` will reset the weight to the
+		system default. The system default may be set by the kernel
+		or drivers at boot or during hotplug events.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..ae925216798f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,6 +131,16 @@ static struct mempolicy default_policy = {
 
 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 
+/*
+ * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
+ * system-default value should be used. Until system-defaults are implemented,
+ * the system-default is always 1.
+ *
+ * iw_table is RCU protected
+ */
+static u8 __rcu *iw_table;
+static DEFINE_MUTEX(iw_table_lock);
+
 /**
  * numa_nearest_node - Find nearest node by state
  * @node: Node id to start the search
@@ -3067,3 +3077,224 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
 			       nodemask_pr_args(&nodes));
 }
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+	struct kobj_attribute kobj_attr;
+	int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	struct iw_node_attr *node_attr;
+	u8 weight;
+	u8 __rcu *table;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+
+	rcu_read_lock();
+	table = rcu_dereference(iw_table);
+	weight = table ? table[node_attr->nid] : 1;
+	rcu_read_unlock();
+
+	return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct iw_node_attr *node_attr;
+	u8 __rcu *new;
+	u8 __rcu *old;
+	u8 weight = 0;
+
+	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+	if (count == 0 || sysfs_streq(buf, ""))
+		weight = 0;
+	else if (kstrtou8(buf, 0, &weight))
+		return -EINVAL;
+
+	/*
+	 * The default weight is 1 (for now), when the kernel-internal
+	 * default weight array is implemented, this should be updated to
+	 * collect the system-default weight of the node if the user passes 0.
+	 */
+	if (!weight)
+		weight = 1;
+
+	/* We only need to allocate up to the number of possible nodes */
+	new = kmalloc(nr_node_ids, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	if (old)
+		memcpy(new, old, nr_node_ids);
+	else
+		memset(new, 1, nr_node_ids);
+	new[node_attr->nid] = weight;
+	rcu_assign_pointer(iw_table, new);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	kfree(old);
+	return count;
+}
+
+static struct iw_node_attr *node_attrs[MAX_NUMNODES];
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+				  struct kobject *parent)
+{
+	if (!node_attr)
+		return;
+	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+	kfree(node_attr->kobj_attr.attr.name);
+	kfree(node_attr);
+}
+
+static void sysfs_wi_release(struct kobject *wi_kobj)
+{
+	int i;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		sysfs_wi_node_release(node_attrs[i], wi_kobj);
+	kobject_put(wi_kobj);
+}
+
+static const struct kobj_type wi_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = sysfs_wi_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+	struct iw_node_attr *node_attr;
+	char *name;
+
+	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+	if (!node_attr)
+		return -ENOMEM;
+
+	name = kasprintf(GFP_KERNEL, "node%d", nid);
+	if (!name) {
+		kfree(node_attr);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&node_attr->kobj_attr.attr);
+	node_attr->kobj_attr.attr.name = name;
+	node_attr->kobj_attr.attr.mode = 0644;
+	node_attr->kobj_attr.show = node_show;
+	node_attr->kobj_attr.store = node_store;
+	node_attr->nid = nid;
+
+	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+		kfree(node_attr->kobj_attr.attr.name);
+		kfree(node_attr);
+		pr_err("failed to add attribute to weighted_interleave\n");
+		return -ENOMEM;
+	}
+
+	node_attrs[nid] = node_attr;
+	return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+	struct kobject *wi_kobj;
+	int nid, err;
+
+	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+	if (!wi_kobj)
+		return -ENOMEM;
+
+	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+				   "weighted_interleave");
+	if (err) {
+		kfree(wi_kobj);
+		return err;
+	}
+
+	memset(node_attrs, 0, sizeof(node_attrs));
+	for_each_node_state(nid, N_POSSIBLE) {
+		err = add_weight_node(nid, wi_kobj);
+		if (err) {
+			pr_err("failed to add sysfs [node%d]\n", nid);
+			break;
+		}
+	}
+	if (err)
+		kobject_put(wi_kobj);
+	return 0;
+}
+
+static void mempolicy_kobj_release(struct kobject *kobj)
+{
+	u8 __rcu *old;
+
+	mutex_lock(&iw_table_lock);
+	old = rcu_dereference_protected(iw_table,
+					lockdep_is_held(&iw_table_lock));
+	rcu_assign_pointer(iw_table, NULL);
+	mutex_unlock(&iw_table_lock);
+	synchronize_rcu();
+	/* Never free the default table, it's always in use */
+	kfree(old);
+	kfree(kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+	.release = mempolicy_kobj_release
+};
+
+static struct kobject *mempolicy_kobj;
+static int __init mempolicy_sysfs_init(void)
+{
+	int err;
+	struct kobject *mempolicy_kobj;
+
+	/* A NULL iw_table is interpreted by interleave logic as "all 1s" */
+	iw_table = NULL;
+	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
+	if (!mempolicy_kobj) {
+		pr_err("failed to add mempolicy kobject to the system\n");
+		return -ENOMEM;
+	}
+	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
+				   "mempolicy");
+	if (err) {
+		kfree(mempolicy_kobj);
+		return err;
+	}
+
+	err = add_weighted_interleave_group(mempolicy_kobj);
+
+	if (err) {
+		kobject_put(mempolicy_kobj);
+		return err;
+	}
+
+	return err;
+}
+
+static void __exit mempolicy_exit(void)
+{
+	if (mempolicy_kobj)
+		kobject_put(mempolicy_kobj);
+}
+
+#else
+static int __init mempolicy_sysfs_init(void)
+{
+	/* A NULL iw_table is interpreted by interleave logic as "all 1s" */
+	iw_table = NULL;
+	return 0;
+}
+
+static void __exit mempolicy_exit(void) { }
+#endif /* CONFIG_SYSFS */
+late_initcall(mempolicy_sysfs_init);
+module_exit(mempolicy_exit);
-- 
2.39.1

Re: [PATCH v2 1/3] mm/mempolicy: implement the sysfs-based weighted_interleave interface
Posted by Huang, Ying 1 year, 11 months ago
Gregory Price <gourry.memverge@gmail.com> writes:

> From: Rakie Kim <rakie.kim@sk.com>
>
> This patch provides a way to set interleave weight information under
> sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN
>
> The sysfs structure is designed as follows.
>
>   $ tree /sys/kernel/mm/mempolicy/
>   /sys/kernel/mm/mempolicy/ [1]
>   └── weighted_interleave [2]
>       ├── node0 [3]
>       └── node1
>
> Each file above can be explained as follows.
>
> [1] mm/mempolicy: configuration interface for mempolicy subsystem
>
> [2] weighted_interleave/: config interface for weighted interleave policy
>
> [3] weighted_interleave/nodeN: weight for nodeN
>
> If a node value is set to `0`, the system-default value will be used.
> As of this patch, the system-default for all nodes is always 1.
>
> Suggested-by: Huang Ying <ying.huang@intel.com>
> Signed-off-by: Rakie Kim <rakie.kim@sk.com>
> Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
> Co-developed-by: Gregory Price <gregory.price@memverge.com>
> Signed-off-by: Gregory Price <gregory.price@memverge.com>
> Co-developed-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> Signed-off-by: Hyeongtak Ji <hyeongtak.ji@sk.com>
> ---
>  .../ABI/testing/sysfs-kernel-mm-mempolicy     |   4 +
>  ...fs-kernel-mm-mempolicy-weighted-interleave |  26 ++
>  mm/mempolicy.c                                | 231 ++++++++++++++++++
>  3 files changed, 261 insertions(+)
>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
>  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
>
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
> new file mode 100644
> index 000000000000..2dcf24f4384a
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
> @@ -0,0 +1,4 @@
> +What:		/sys/kernel/mm/mempolicy/
> +Date:		December 2023
> +Contact:	Linux memory management mailing list <linux-mm@kvack.org>
> +Description:	Interface for Mempolicy
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
> new file mode 100644
> index 000000000000..e6a38139bf0f
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
> @@ -0,0 +1,26 @@
> +What:		/sys/kernel/mm/mempolicy/weighted_interleave/
> +Date:		December 2023

Maybe not a big deal.  The date should be "January 2024"?

> +Contact:	Linux memory management mailing list <linux-mm@kvack.org>
> +Description:	Configuration Interface for the Weighted Interleave policy
> +
> +What:		/sys/kernel/mm/mempolicy/weighted_interleave/nodeN
> +Date:		December 2023
> +Contact:	Linux memory management mailing list <linux-mm@kvack.org>
> +Description:	Weight configuration interface for nodeN
> +
> +		The interleave weight for a memory node (N). These weights are
> +		utilized by processes which have set their mempolicy to

s/processes/tasks or memory areas/

> +		MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by
> +		omitting a task-local weight array.

We haven't introduced the task-local weight array yet.  So, leave this
out until we introduce it?

> +
> +		These weights only affect new allocations, and changes at runtime
> +		will not cause migrations on already allocated pages.
> +
> +		The minimum weight for a node is always 1.
> +
> +		Minimum weight: 1
> +		Maximum weight: 255
> +
> +		Writing an empty string or `0` will reset the weight to the
> +		system default. The system default may be set by the kernel
> +		or drivers at boot or during hotplug events.
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 10a590ee1c89..ae925216798f 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -131,6 +131,16 @@ static struct mempolicy default_policy = {
>  
>  static struct mempolicy preferred_node_policy[MAX_NUMNODES];
>  
> +/*
> + * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
> + * system-default value should be used. Until system-defaults are implemented,
> + * the system-default is always 1.
> + *
> + * iw_table is RCU protected
> + */
> +static u8 __rcu *iw_table;
> +static DEFINE_MUTEX(iw_table_lock);
> +
>  /**
>   * numa_nearest_node - Find nearest node by state
>   * @node: Node id to start the search
> @@ -3067,3 +3077,224 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
>  		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
>  			       nodemask_pr_args(&nodes));
>  }
> +
> +#ifdef CONFIG_SYSFS
> +struct iw_node_attr {
> +	struct kobj_attribute kobj_attr;
> +	int nid;
> +};
> +
> +static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
> +			 char *buf)
> +{
> +	struct iw_node_attr *node_attr;
> +	u8 weight;
> +	u8 __rcu *table;
> +
> +	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
> +
> +	rcu_read_lock();
> +	table = rcu_dereference(iw_table);
> +	weight = table ? table[node_attr->nid] : 1;
> +	rcu_read_unlock();
> +
> +	return sysfs_emit(buf, "%d\n", weight);
> +}
> +
> +static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
> +			  const char *buf, size_t count)
> +{
> +	struct iw_node_attr *node_attr;
> +	u8 __rcu *new;
> +	u8 __rcu *old;
> +	u8 weight = 0;
> +
> +	node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
> +	if (count == 0 || sysfs_streq(buf, ""))
> +		weight = 0;
> +	else if (kstrtou8(buf, 0, &weight))
> +		return -EINVAL;
> +
> +	/*
> +	 * The default weight is 1 (for now), when the kernel-internal
> +	 * default weight array is implemented, this should be updated to
> +	 * collect the system-default weight of the node if the user passes 0.
> +	 */
> +	if (!weight)
> +		weight = 1;

From a functionality point of view, it's OK to set "weight = 1" here for
now.  But when we add the system default weight table in the future, we
will need to use "weight = 0".  Otherwise, we cannot distinguish whether
the default value has been customized via sysfs.  So, I suggest using that
rule.

> +
> +	/* We only need to allocate up to the number of possible nodes */

This comment appears unnecessary.

> +	new = kmalloc(nr_node_ids, GFP_KERNEL);
> +	if (!new)
> +		return -ENOMEM;
> +
> +	mutex_lock(&iw_table_lock);
> +	old = rcu_dereference_protected(iw_table,
> +					lockdep_is_held(&iw_table_lock));
> +	if (old)
> +		memcpy(new, old, nr_node_ids);
> +	else
> +		memset(new, 1, nr_node_ids);

For the same reason as above ("From a functionality..."), I suggest
setting "0" here.

> +	new[node_attr->nid] = weight;
> +	rcu_assign_pointer(iw_table, new);
> +	mutex_unlock(&iw_table_lock);
> +	synchronize_rcu();
> +	kfree(old);
> +	return count;
> +}
> +
> +static struct iw_node_attr *node_attrs[MAX_NUMNODES];

node_attrs[] can be allocated dynamically too.  Just a suggestion.
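
For illustration, a minimal sketch of that suggestion (allocation at
sysfs init time; nr_node_ids is already used by node_store()):

static struct iw_node_attr **node_attrs;

        /* e.g. in mempolicy_sysfs_init(), before creating the group */
        node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
                             GFP_KERNEL);
        if (!node_attrs)
                return -ENOMEM;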

> +
> +static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
> +				  struct kobject *parent)
> +{
> +	if (!node_attr)
> +		return;
> +	sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
> +	kfree(node_attr->kobj_attr.attr.name);
> +	kfree(node_attr);
> +}
> +
> +static void sysfs_wi_release(struct kobject *wi_kobj)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_NUMNODES; i++)

Nitpick, nr_node_ids should be OK here.

> +		sysfs_wi_node_release(node_attrs[i], wi_kobj);
> +	kobject_put(wi_kobj);
> +}
> +
> +static const struct kobj_type wi_ktype = {
> +	.sysfs_ops = &kobj_sysfs_ops,
> +	.release = sysfs_wi_release,
> +};
> +
> +static int add_weight_node(int nid, struct kobject *wi_kobj)
> +{
> +	struct iw_node_attr *node_attr;
> +	char *name;
> +
> +	node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
> +	if (!node_attr)
> +		return -ENOMEM;
> +
> +	name = kasprintf(GFP_KERNEL, "node%d", nid);
> +	if (!name) {
> +		kfree(node_attr);
> +		return -ENOMEM;
> +	}
> +
> +	sysfs_attr_init(&node_attr->kobj_attr.attr);
> +	node_attr->kobj_attr.attr.name = name;
> +	node_attr->kobj_attr.attr.mode = 0644;
> +	node_attr->kobj_attr.show = node_show;
> +	node_attr->kobj_attr.store = node_store;
> +	node_attr->nid = nid;
> +
> +	if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
> +		kfree(node_attr->kobj_attr.attr.name);
> +		kfree(node_attr);
> +		pr_err("failed to add attribute to weighted_interleave\n");
> +		return -ENOMEM;
> +	}
> +
> +	node_attrs[nid] = node_attr;
> +	return 0;
> +}
> +
> +static int add_weighted_interleave_group(struct kobject *root_kobj)
> +{
> +	struct kobject *wi_kobj;
> +	int nid, err;
> +
> +	wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
> +	if (!wi_kobj)
> +		return -ENOMEM;
> +
> +	err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
> +				   "weighted_interleave");
> +	if (err) {
> +		kfree(wi_kobj);
> +		return err;
> +	}
> +
> +	memset(node_attrs, 0, sizeof(node_attrs));
> +	for_each_node_state(nid, N_POSSIBLE) {
> +		err = add_weight_node(nid, wi_kobj);
> +		if (err) {
> +			pr_err("failed to add sysfs [node%d]\n", nid);
> +			break;
> +		}
> +	}
> +	if (err)
> +		kobject_put(wi_kobj);
> +	return 0;
> +}
> +
> +static void mempolicy_kobj_release(struct kobject *kobj)
> +{
> +	u8 __rcu *old;
> +
> +	mutex_lock(&iw_table_lock);
> +	old = rcu_dereference_protected(iw_table,
> +					lockdep_is_held(&iw_table_lock));
> +	rcu_assign_pointer(iw_table, NULL);
> +	mutex_unlock(&iw_table_lock);
> +	synchronize_rcu();
> +	/* Never free the default table, it's always in use */

Obsolete comment?

> +	kfree(old);

It appears unnecessary to free iw_table in the error path.  But this
isn't a big deal because the error path will almost never be executed in
practice.

> +	kfree(kobj);
> +}
> +
> +static const struct kobj_type mempolicy_ktype = {
> +	.release = mempolicy_kobj_release
> +};
> +
> +static struct kobject *mempolicy_kobj;
> +static int __init mempolicy_sysfs_init(void)
> +{
> +	int err;
> +	struct kobject *mempolicy_kobj;

This shadows the global "mempolicy_kobj" defined just before the
function.  But I don't think we need the global definition.

> +
> +	/* A NULL iw_table is interpreted by interleave logic as "all 1s" */

As I suggested above, it would be "all 0s", that is, use the default weight.

> +	iw_table = NULL;

The default value is already NULL, so it appears unnecessary to do this.

> +	mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
> +	if (!mempolicy_kobj) {
> +		pr_err("failed to add mempolicy kobject to the system\n");
> +		return -ENOMEM;
> +	}
> +	err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
> +				   "mempolicy");
> +	if (err) {
> +		kfree(mempolicy_kobj);
> +		return err;
> +	}
> +
> +	err = add_weighted_interleave_group(mempolicy_kobj);
> +
> +	if (err) {
> +		kobject_put(mempolicy_kobj);
> +		return err;
> +	}
> +
> +	return err;
> +}
> +
> +static void __exit mempolicy_exit(void)
> +{
> +	if (mempolicy_kobj)
> +		kobject_put(mempolicy_kobj);
> +}
> +
> +#else
> +static int __init mempolicy_sysfs_init(void)
> +{
> +	/* A NULL iw_table is interpreted by interleave logic as "all 1s" */
> +	iw_table = NULL;
> +	return 0;
> +}
> +
> +static void __exit mempolicy_exit(void) { }
> +#endif /* CONFIG_SYSFS */
> +late_initcall(mempolicy_sysfs_init);
> +module_exit(mempolicy_exit);

mempolicy.c will not be compiled as a module, so we don't need
module_exit().

--
Best Regards,
Huang, Ying
Re: [PATCH v2 1/3] mm/mempolicy: implement the sysfs-based weighted_interleave interface
Posted by Gregory Price 1 year, 11 months ago
On Mon, Jan 22, 2024 at 04:03:53PM +0800, Huang, Ying wrote:
> Gregory Price <gourry.memverge@gmail.com> writes:
> 
> > +	/*
> > +	 * The default weight is 1 (for now), when the kernel-internal
> > +	 * default weight array is implemented, this should be updated to
> > +	 * collect the system-default weight of the node if the user passes 0.
> > +	 */
> > +	if (!weight)
> > +		weight = 1;
> 
> From a functionality point of view, it's OK to set "weight = 1" here for
> now.  But when we add the system default weight table in the future, we
> will need to use "weight = 0".  Otherwise, we cannot distinguish whether
> the default value has been customized via sysfs.  So, I suggest using
> that rule.
>
[... snip ...]
> > +	else
> > +		memset(new, 1, nr_node_ids);
> 
> For the same reason as above ("From a functionality..."), I suggest
> setting "0" here.
> 

blah - the comment is misleading at best.  The future patch should pass
0 through to the sysfs table, and the allocators should be updated to
collect the system-default weight of the node.

re: doing it this way right now -

I chose to do it this way for now because it ultimately simplifies the
logic in the allocators - all of which will need to be updated with the
future patch set regardless of our implementation choice now.

e.g.

rcu_read_lock();
table = rcu_dereference(iw_table);
if (!policy->wil.cur_weight)
	policy->wil.cur_weight = table ? table[next] : 1;
	                         ^^^ only need single conditional now
rcu_read_unlock();

This logic will need to be updated to use default table values, so I
chose the simpler implementation and left that change to be made
explicitly when the default table is implemented.
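
For comparison, a rough sketch of the same path under the 0-passthrough
rule (default_iw_table[] here is hypothetical, not part of this series):

rcu_read_lock();
table = rcu_dereference(iw_table);
if (!policy->wil.cur_weight) {
        u8 weight = table ? table[next] : 0;

        if (!weight)
                weight = default_iw_table[next]; /* hypothetical default */
        policy->wil.cur_weight = weight;
}
rcu_read_unlock();

i.e. every weight refill would take an extra lookup/branch to resolve
the system default.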

If you prefer it the other way now, I can change it, but this seemed
cleaner and simpler for the time being.

> > +	new[node_attr->nid] = weight;
> > +	rcu_assign_pointer(iw_table, new);
> > +	mutex_unlock(&iw_table_lock);
> > +	synchronize_rcu();
> > +	kfree(old);
> > +	return count;
> > +}
> > +
> > +static struct iw_node_attr *node_attrs[MAX_NUMNODES];
> 
> node_attrs[] can be allocated dynamically too.  Just a suggestion.
> 

ack to this and other references to nr_node_ids, will change.

> > +	kfree(old);
> 
> It appears unnecessary to free iw_table in error path.  But this isn't a
> big deal because error path will almost never be executed in practice.
>

checkpatch.pl yells at you if you do null checks before kfree :]

> > +	int err;
> > +	struct kobject *mempolicy_kobj;
> 
> This overrides the global "mempolicy_kobj" defined before function.  But
> I don't think we need the global definition.
> 

Assuming the exit path isn't needed, then yeah, the global isn't needed.

> > +static int __init mempolicy_sysfs_init(void)
> > +{
> > +	/* A NULL iw_table is interpreted by interleave logic as "all 1s" */
> > +	iw_table = NULL;
> > +	return 0;
> > +}
> > +
> > +static void __exit mempolicy_exit(void) { }
> > +#endif /* CONFIG_SYSFS */
> > +late_initcall(mempolicy_sysfs_init);
> > +module_exit(mempolicy_exit);
> 
> mempolicy.c will not be compiled as module, so we don't need
> module_exit().
> 

ack