Accelerate page migration with batch copying and hardware offload

[RFC V3 6/9] mtcopy: introduce multi-threaded page copy routine

Posted by Shivank Garg 4 months, 2 weeks ago

From: Zi Yan <ziy@nvidia.com>

Now page copies are batched, multi-threaded page copy can be used to
increase page copy throughput.

Enable using:
echo 1 >  /sys/kernel/cpu_mt/offloading
echo NR_THREADS >  /sys/kernel/cpu_mt/threads

Disable:
echo 0 >  /sys/kernel/cpu_mt/offloading

Signed-off-by: Zi Yan <ziy@nvidia.com>
Co-developed-by: Shivank Garg <shivankg@amd.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
 drivers/Kconfig                        |   2 +
 drivers/Makefile                       |   3 +
 drivers/migoffcopy/Kconfig             |   9 +
 drivers/migoffcopy/Makefile            |   1 +
 drivers/migoffcopy/mtcopy/Makefile     |   1 +
 drivers/migoffcopy/mtcopy/copy_pages.c | 327 +++++++++++++++++++++++++
 6 files changed, 343 insertions(+)
 create mode 100644 drivers/migoffcopy/Kconfig
 create mode 100644 drivers/migoffcopy/Makefile
 create mode 100644 drivers/migoffcopy/mtcopy/Makefile
 create mode 100644 drivers/migoffcopy/mtcopy/copy_pages.c

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 4915a63866b0..d2cbc97a7683 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
 
 source "drivers/cdx/Kconfig"
 
+source "drivers/migoffcopy/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index b5749cf67044..5326d88cf31c 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -42,6 +42,9 @@ obj-y				+= clk/
 # really early.
 obj-$(CONFIG_DMADEVICES)	+= dma/
 
+# Migration copy Offload
+obj-$(CONFIG_OFFC_MIGRATION)	+= migoffcopy/
+
 # SOC specific infrastructure drivers.
 obj-y				+= soc/
 obj-$(CONFIG_PM_GENERIC_DOMAINS)	+= pmdomain/
diff --git a/drivers/migoffcopy/Kconfig b/drivers/migoffcopy/Kconfig
new file mode 100644
index 000000000000..e73698af3e72
--- /dev/null
+++ b/drivers/migoffcopy/Kconfig
@@ -0,0 +1,9 @@
+config MTCOPY_CPU
+       bool "Multi-Threaded Copy with CPU"
+       depends on OFFC_MIGRATION
+       default n
+       help
+         Interface MT COPY CPU driver for batch page migration
+         offloading. Say Y if you want to try offloading with
+         MultiThreaded CPU copy APIs.
+
diff --git a/drivers/migoffcopy/Makefile b/drivers/migoffcopy/Makefile
new file mode 100644
index 000000000000..0a3c356d67e6
--- /dev/null
+++ b/drivers/migoffcopy/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_MTCOPY_CPU) += mtcopy/
diff --git a/drivers/migoffcopy/mtcopy/Makefile b/drivers/migoffcopy/mtcopy/Makefile
new file mode 100644
index 000000000000..b4d7da85eda9
--- /dev/null
+++ b/drivers/migoffcopy/mtcopy/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_MTCOPY_CPU) += copy_pages.o
diff --git a/drivers/migoffcopy/mtcopy/copy_pages.c b/drivers/migoffcopy/mtcopy/copy_pages.c
new file mode 100644
index 000000000000..68e50de602d6
--- /dev/null
+++ b/drivers/migoffcopy/mtcopy/copy_pages.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Parallel page copy routine.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/sysfs.h>
+#include <linux/highmem.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <linux/migrate.h>
+#include <linux/migrate_offc.h>
+
+#define MAX_NUM_COPY_THREADS 64
+
+unsigned int limit_mt_num = 4;
+static int is_dispatching;
+
+static int copy_page_lists_mt(struct list_head *dst_folios,
+		struct list_head *src_folios, unsigned int nr_items);
+
+static DEFINE_MUTEX(migratecfg_mutex);
+
+/* CPU Multithreaded Batch Migrator */
+struct migrator cpu_migrator = {
+	.name = "CPU_MT_COPY\0",
+	.migrate_offc = copy_page_lists_mt,
+	.owner = THIS_MODULE,
+};
+
+struct copy_item {
+	char *to;
+	char *from;
+	unsigned long chunk_size;
+};
+
+struct copy_page_info {
+	struct work_struct copy_page_work;
+	int ret;
+	unsigned long num_items;
+	struct copy_item item_list[];
+};
+
+static unsigned long copy_page_routine(char *vto, char *vfrom,
+	unsigned long chunk_size)
+{
+	return copy_mc_to_kernel(vto, vfrom, chunk_size);
+}
+
+static void copy_page_work_queue_thread(struct work_struct *work)
+{
+	struct copy_page_info *my_work = (struct copy_page_info *)work;
+	int i;
+
+	my_work->ret = 0;
+	for (i = 0; i < my_work->num_items; ++i)
+		my_work->ret |= !!copy_page_routine(my_work->item_list[i].to,
+					my_work->item_list[i].from,
+					my_work->item_list[i].chunk_size);
+}
+
+static ssize_t mt_offloading_set(struct kobject *kobj, struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	int ccode;
+	int action;
+
+	ccode = kstrtoint(buf, 0, &action);
+	if (ccode) {
+		pr_debug("(%s:) error parsing input %s\n", __func__, buf);
+		return ccode;
+	}
+
+	/*
+	 * action is 0: User wants to disable MT offloading.
+	 * action is 1: User wants to enable MT offloading.
+	 */
+	switch (action) {
+	case 0:
+		mutex_lock(&migratecfg_mutex);
+		if (is_dispatching == 1) {
+			stop_offloading();
+			is_dispatching = 0;
+		} else
+			pr_debug("MT migration offloading is already OFF\n");
+		mutex_unlock(&migratecfg_mutex);
+		break;
+	case 1:
+		mutex_lock(&migratecfg_mutex);
+		if (is_dispatching == 0) {
+			start_offloading(&cpu_migrator);
+			is_dispatching = 1;
+		} else
+			pr_debug("MT migration offloading is already ON\n");
+		mutex_unlock(&migratecfg_mutex);
+		break;
+	default:
+		pr_debug("input should be zero or one, parsed as %d\n", action);
+	}
+	return sizeof(action);
+}
+
+static ssize_t mt_offloading_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", is_dispatching);
+}
+
+static ssize_t mt_threads_set(struct kobject *kobj, struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	int ccode;
+	unsigned int threads;
+
+	ccode = kstrtouint(buf, 0, &threads);
+	if (ccode) {
+		pr_debug("(%s:) error parsing input %s\n", __func__, buf);
+		return ccode;
+	}
+
+	if (threads > 0 && threads <= MAX_NUM_COPY_THREADS) {
+		mutex_lock(&migratecfg_mutex);
+		limit_mt_num = threads;
+		mutex_unlock(&migratecfg_mutex);
+		pr_debug("MT threads set to %u\n", limit_mt_num);
+	} else {
+		pr_debug("Invalid thread count. Must be between 1 and %d\n", MAX_NUM_COPY_THREADS);
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+static ssize_t mt_threads_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", limit_mt_num);
+}
+
+int copy_page_lists_mt(struct list_head *dst_folios,
+		struct list_head *src_folios, unsigned int nr_items)
+{
+	struct copy_page_info *work_items[MAX_NUM_COPY_THREADS] = {0};
+	unsigned int total_mt_num = limit_mt_num;
+	struct folio *src, *src2, *dst, *dst2;
+	int max_items_per_thread;
+	int item_idx;
+	int err = 0;
+	int cpu;
+	int i;
+
+	if (IS_ENABLED(CONFIG_HIGHMEM))
+		return -EOPNOTSUPP;
+
+	/* Each threads get part of each page, if nr_items < totla_mt_num */
+	if (nr_items < total_mt_num)
+		max_items_per_thread = nr_items;
+	else
+		max_items_per_thread = (nr_items / total_mt_num) +
+				((nr_items % total_mt_num) ? 1 : 0);
+
+
+	for (cpu = 0; cpu < total_mt_num; ++cpu) {
+		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
+						sizeof(struct copy_item) *
+							max_items_per_thread,
+					  GFP_NOWAIT);
+		if (!work_items[cpu]) {
+			err = -ENOMEM;
+			goto free_work_items;
+		}
+	}
+
+	if (nr_items < total_mt_num) {
+		for (cpu = 0; cpu < total_mt_num; ++cpu) {
+			INIT_WORK((struct work_struct *)work_items[cpu],
+					  copy_page_work_queue_thread);
+			work_items[cpu]->num_items = max_items_per_thread;
+		}
+
+		item_idx = 0;
+		dst = list_first_entry(dst_folios, struct folio, lru);
+		dst2 = list_next_entry(dst, lru);
+		list_for_each_entry_safe(src, src2, src_folios, lru) {
+			unsigned long chunk_size = PAGE_SIZE * folio_nr_pages(src) / total_mt_num;
+			char *vfrom = page_address(&src->page);
+			char *vto = page_address(&dst->page);
+
+			VM_WARN_ON(PAGE_SIZE * folio_nr_pages(src) % total_mt_num);
+			VM_WARN_ON(folio_nr_pages(dst) != folio_nr_pages(src));
+
+			for (cpu = 0; cpu < total_mt_num; ++cpu) {
+				work_items[cpu]->item_list[item_idx].to =
+					vto + chunk_size * cpu;
+				work_items[cpu]->item_list[item_idx].from =
+					vfrom + chunk_size * cpu;
+				work_items[cpu]->item_list[item_idx].chunk_size =
+					chunk_size;
+			}
+
+			item_idx++;
+			dst = dst2;
+			dst2 = list_next_entry(dst, lru);
+		}
+
+		for (cpu = 0; cpu < total_mt_num; ++cpu)
+			queue_work(system_unbound_wq,
+				   (struct work_struct *)work_items[cpu]);
+	} else {
+		int num_xfer_per_thread = nr_items / total_mt_num;
+		int per_cpu_item_idx;
+
+
+		for (cpu = 0; cpu < total_mt_num; ++cpu) {
+			INIT_WORK((struct work_struct *)work_items[cpu],
+					  copy_page_work_queue_thread);
+
+			work_items[cpu]->num_items = num_xfer_per_thread +
+					(cpu < (nr_items % total_mt_num));
+		}
+
+		cpu = 0;
+		per_cpu_item_idx = 0;
+		item_idx = 0;
+		dst = list_first_entry(dst_folios, struct folio, lru);
+		dst2 = list_next_entry(dst, lru);
+		list_for_each_entry_safe(src, src2, src_folios, lru) {
+			work_items[cpu]->item_list[per_cpu_item_idx].to =
+				page_address(&dst->page);
+			work_items[cpu]->item_list[per_cpu_item_idx].from =
+				page_address(&src->page);
+			work_items[cpu]->item_list[per_cpu_item_idx].chunk_size =
+				PAGE_SIZE * folio_nr_pages(src);
+
+			VM_WARN_ON(folio_nr_pages(dst) !=
+				   folio_nr_pages(src));
+
+			per_cpu_item_idx++;
+			item_idx++;
+			dst = dst2;
+			dst2 = list_next_entry(dst, lru);
+
+			if (per_cpu_item_idx == work_items[cpu]->num_items) {
+				queue_work(system_unbound_wq,
+					(struct work_struct *)work_items[cpu]);
+				per_cpu_item_idx = 0;
+				cpu++;
+			}
+		}
+		if (item_idx != nr_items)
+			pr_warn("%s: only %d out of %d pages are transferred\n",
+				__func__, item_idx - 1, nr_items);
+	}
+
+	/* Wait until it finishes  */
+	for (i = 0; i < total_mt_num; ++i) {
+		flush_work((struct work_struct *)work_items[i]);
+		/* retry if any copy fails */
+		if (work_items[i]->ret)
+			err = -EAGAIN;
+	}
+
+free_work_items:
+	for (cpu = 0; cpu < total_mt_num; ++cpu)
+		kfree(work_items[cpu]);
+
+	return err;
+}
+
+static struct kobject *mt_kobj_ref;
+static struct kobj_attribute mt_offloading_attribute = __ATTR(offloading, 0664,
+		mt_offloading_show, mt_offloading_set);
+static struct kobj_attribute mt_threads_attribute = __ATTR(threads, 0664,
+		mt_threads_show, mt_threads_set);
+
+static int __init cpu_mt_module_init(void)
+{
+	int ret = 0;
+
+	mt_kobj_ref = kobject_create_and_add("cpu_mt", kernel_kobj);
+	if (!mt_kobj_ref)
+		return -ENOMEM;
+
+	ret = sysfs_create_file(mt_kobj_ref, &mt_offloading_attribute.attr);
+	if (ret)
+		goto out_offloading;
+
+	ret = sysfs_create_file(mt_kobj_ref, &mt_threads_attribute.attr);
+	if (ret)
+		goto out_threads;
+
+	is_dispatching = 0;
+
+	return 0;
+
+out_threads:
+	sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
+out_offloading:
+	kobject_put(mt_kobj_ref);
+	return ret;
+}
+
+static void __exit cpu_mt_module_exit(void)
+{
+	/* Stop the MT offloading to unload the module */
+	mutex_lock(&migratecfg_mutex);
+	if (is_dispatching == 1) {
+		stop_offloading();
+		is_dispatching = 0;
+	}
+	mutex_unlock(&migratecfg_mutex);
+
+	sysfs_remove_file(mt_kobj_ref, &mt_threads_attribute.attr);
+	sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
+	kobject_put(mt_kobj_ref);
+}
+
+module_init(cpu_mt_module_init);
+module_exit(cpu_mt_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Zi Yan");
+MODULE_DESCRIPTION("CPU_MT_COPY"); /* CPU Multithreaded Batch Migrator */
-- 
2.43.0

Re: [RFC V3 6/9] mtcopy: introduce multi-threaded page copy routine

Posted by Byungchul Park 3 months, 3 weeks ago

On Tue, Sep 23, 2025 at 05:47:41PM +0000, Shivank Garg wrote:
> From: Zi Yan <ziy@nvidia.com>
> 
> Now page copies are batched, multi-threaded page copy can be used to
> increase page copy throughput.
> 
> Enable using:
> echo 1 >  /sys/kernel/cpu_mt/offloading
> echo NR_THREADS >  /sys/kernel/cpu_mt/threads
> 
> Disable:
> echo 0 >  /sys/kernel/cpu_mt/offloading
> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> Co-developed-by: Shivank Garg <shivankg@amd.com>
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
>  drivers/Kconfig                        |   2 +
>  drivers/Makefile                       |   3 +
>  drivers/migoffcopy/Kconfig             |   9 +
>  drivers/migoffcopy/Makefile            |   1 +
>  drivers/migoffcopy/mtcopy/Makefile     |   1 +
>  drivers/migoffcopy/mtcopy/copy_pages.c | 327 +++++++++++++++++++++++++
>  6 files changed, 343 insertions(+)
>  create mode 100644 drivers/migoffcopy/Kconfig
>  create mode 100644 drivers/migoffcopy/Makefile
>  create mode 100644 drivers/migoffcopy/mtcopy/Makefile
>  create mode 100644 drivers/migoffcopy/mtcopy/copy_pages.c
> 
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index 4915a63866b0..d2cbc97a7683 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
> 
>  source "drivers/cdx/Kconfig"
> 
> +source "drivers/migoffcopy/Kconfig"
> +
>  endmenu
> diff --git a/drivers/Makefile b/drivers/Makefile
> index b5749cf67044..5326d88cf31c 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -42,6 +42,9 @@ obj-y                         += clk/
>  # really early.
>  obj-$(CONFIG_DMADEVICES)       += dma/
> 
> +# Migration copy Offload
> +obj-$(CONFIG_OFFC_MIGRATION)   += migoffcopy/
> +
>  # SOC specific infrastructure drivers.
>  obj-y                          += soc/
>  obj-$(CONFIG_PM_GENERIC_DOMAINS)       += pmdomain/
> diff --git a/drivers/migoffcopy/Kconfig b/drivers/migoffcopy/Kconfig
> new file mode 100644
> index 000000000000..e73698af3e72
> --- /dev/null
> +++ b/drivers/migoffcopy/Kconfig
> @@ -0,0 +1,9 @@
> +config MTCOPY_CPU
> +       bool "Multi-Threaded Copy with CPU"
> +       depends on OFFC_MIGRATION
> +       default n
> +       help
> +         Interface MT COPY CPU driver for batch page migration
> +         offloading. Say Y if you want to try offloading with
> +         MultiThreaded CPU copy APIs.
> +
> diff --git a/drivers/migoffcopy/Makefile b/drivers/migoffcopy/Makefile
> new file mode 100644
> index 000000000000..0a3c356d67e6
> --- /dev/null
> +++ b/drivers/migoffcopy/Makefile
> @@ -0,0 +1 @@
> +obj-$(CONFIG_MTCOPY_CPU) += mtcopy/
> diff --git a/drivers/migoffcopy/mtcopy/Makefile b/drivers/migoffcopy/mtcopy/Makefile
> new file mode 100644
> index 000000000000..b4d7da85eda9
> --- /dev/null
> +++ b/drivers/migoffcopy/mtcopy/Makefile
> @@ -0,0 +1 @@
> +obj-$(CONFIG_MTCOPY_CPU) += copy_pages.o
> diff --git a/drivers/migoffcopy/mtcopy/copy_pages.c b/drivers/migoffcopy/mtcopy/copy_pages.c
> new file mode 100644
> index 000000000000..68e50de602d6
> --- /dev/null
> +++ b/drivers/migoffcopy/mtcopy/copy_pages.c
> @@ -0,0 +1,327 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Parallel page copy routine.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/printk.h>
> +#include <linux/init.h>
> +#include <linux/sysctl.h>
> +#include <linux/sysfs.h>
> +#include <linux/highmem.h>
> +#include <linux/workqueue.h>
> +#include <linux/slab.h>
> +#include <linux/migrate.h>
> +#include <linux/migrate_offc.h>
> +
> +#define MAX_NUM_COPY_THREADS 64
> +
> +unsigned int limit_mt_num = 4;
> +static int is_dispatching;
> +
> +static int copy_page_lists_mt(struct list_head *dst_folios,
> +               struct list_head *src_folios, unsigned int nr_items);
> +
> +static DEFINE_MUTEX(migratecfg_mutex);
> +
> +/* CPU Multithreaded Batch Migrator */
> +struct migrator cpu_migrator = {
> +       .name = "CPU_MT_COPY\0",
> +       .migrate_offc = copy_page_lists_mt,
> +       .owner = THIS_MODULE,
> +};
> +
> +struct copy_item {
> +       char *to;
> +       char *from;
> +       unsigned long chunk_size;
> +};
> +
> +struct copy_page_info {
> +       struct work_struct copy_page_work;
> +       int ret;
> +       unsigned long num_items;
> +       struct copy_item item_list[];
> +};
> +
> +static unsigned long copy_page_routine(char *vto, char *vfrom,
> +       unsigned long chunk_size)
> +{
> +       return copy_mc_to_kernel(vto, vfrom, chunk_size);
> +}
> +
> +static void copy_page_work_queue_thread(struct work_struct *work)
> +{
> +       struct copy_page_info *my_work = (struct copy_page_info *)work;
> +       int i;
> +
> +       my_work->ret = 0;
> +       for (i = 0; i < my_work->num_items; ++i)
> +               my_work->ret |= !!copy_page_routine(my_work->item_list[i].to,
> +                                       my_work->item_list[i].from,
> +                                       my_work->item_list[i].chunk_size);
> +}
> +
> +static ssize_t mt_offloading_set(struct kobject *kobj, struct kobj_attribute *attr,
> +               const char *buf, size_t count)
> +{
> +       int ccode;
> +       int action;
> +
> +       ccode = kstrtoint(buf, 0, &action);
> +       if (ccode) {
> +               pr_debug("(%s:) error parsing input %s\n", __func__, buf);
> +               return ccode;
> +       }
> +
> +       /*
> +        * action is 0: User wants to disable MT offloading.
> +        * action is 1: User wants to enable MT offloading.
> +        */
> +       switch (action) {
> +       case 0:
> +               mutex_lock(&migratecfg_mutex);
> +               if (is_dispatching == 1) {
> +                       stop_offloading();
> +                       is_dispatching = 0;
> +               } else
> +                       pr_debug("MT migration offloading is already OFF\n");
> +               mutex_unlock(&migratecfg_mutex);
> +               break;
> +       case 1:
> +               mutex_lock(&migratecfg_mutex);
> +               if (is_dispatching == 0) {
> +                       start_offloading(&cpu_migrator);
> +                       is_dispatching = 1;
> +               } else
> +                       pr_debug("MT migration offloading is already ON\n");
> +               mutex_unlock(&migratecfg_mutex);
> +               break;
> +       default:
> +               pr_debug("input should be zero or one, parsed as %d\n", action);
> +       }
> +       return sizeof(action);
> +}
> +
> +static ssize_t mt_offloading_show(struct kobject *kobj,
> +               struct kobj_attribute *attr, char *buf)
> +{
> +       return sysfs_emit(buf, "%d\n", is_dispatching);
> +}
> +
> +static ssize_t mt_threads_set(struct kobject *kobj, struct kobj_attribute *attr,
> +               const char *buf, size_t count)
> +{
> +       int ccode;
> +       unsigned int threads;
> +
> +       ccode = kstrtouint(buf, 0, &threads);
> +       if (ccode) {
> +               pr_debug("(%s:) error parsing input %s\n", __func__, buf);
> +               return ccode;
> +       }
> +
> +       if (threads > 0 && threads <= MAX_NUM_COPY_THREADS) {
> +               mutex_lock(&migratecfg_mutex);
> +               limit_mt_num = threads;
> +               mutex_unlock(&migratecfg_mutex);
> +               pr_debug("MT threads set to %u\n", limit_mt_num);
> +       } else {
> +               pr_debug("Invalid thread count. Must be between 1 and %d\n", MAX_NUM_COPY_THREADS);
> +               return -EINVAL;
> +       }
> +
> +       return count;
> +}
> +
> +static ssize_t mt_threads_show(struct kobject *kobj,
> +               struct kobj_attribute *attr, char *buf)
> +{
> +       return sysfs_emit(buf, "%u\n", limit_mt_num);
> +}
> +
> +int copy_page_lists_mt(struct list_head *dst_folios,
> +               struct list_head *src_folios, unsigned int nr_items)
> +{
> +       struct copy_page_info *work_items[MAX_NUM_COPY_THREADS] = {0};
> +       unsigned int total_mt_num = limit_mt_num;
> +       struct folio *src, *src2, *dst, *dst2;
> +       int max_items_per_thread;
> +       int item_idx;
> +       int err = 0;
> +       int cpu;
> +       int i;
> +
> +       if (IS_ENABLED(CONFIG_HIGHMEM))
> +               return -EOPNOTSUPP;
> +
> +       /* Each threads get part of each page, if nr_items < totla_mt_num */
> +       if (nr_items < total_mt_num)
> +               max_items_per_thread = nr_items;
> +       else
> +               max_items_per_thread = (nr_items / total_mt_num) +
> +                               ((nr_items % total_mt_num) ? 1 : 0);
> +
> +
> +       for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +               work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
> +                                               sizeof(struct copy_item) *
> +                                                       max_items_per_thread,
> +                                         GFP_NOWAIT);
> +               if (!work_items[cpu]) {
> +                       err = -ENOMEM;
> +                       goto free_work_items;
> +               }
> +       }
> +
> +       if (nr_items < total_mt_num) {
> +               for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +                       INIT_WORK((struct work_struct *)work_items[cpu],
> +                                         copy_page_work_queue_thread);
> +                       work_items[cpu]->num_items = max_items_per_thread;
> +               }
> +
> +               item_idx = 0;
> +               dst = list_first_entry(dst_folios, struct folio, lru);
> +               dst2 = list_next_entry(dst, lru);
> +               list_for_each_entry_safe(src, src2, src_folios, lru) {
> +                       unsigned long chunk_size = PAGE_SIZE * folio_nr_pages(src) / total_mt_num;
> +                       char *vfrom = page_address(&src->page);
> +                       char *vto = page_address(&dst->page);
> +
> +                       VM_WARN_ON(PAGE_SIZE * folio_nr_pages(src) % total_mt_num);
> +                       VM_WARN_ON(folio_nr_pages(dst) != folio_nr_pages(src));
> +
> +                       for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +                               work_items[cpu]->item_list[item_idx].to =
> +                                       vto + chunk_size * cpu;
> +                               work_items[cpu]->item_list[item_idx].from =
> +                                       vfrom + chunk_size * cpu;
> +                               work_items[cpu]->item_list[item_idx].chunk_size =
> +                                       chunk_size;
> +                       }
> +
> +                       item_idx++;
> +                       dst = dst2;
> +                       dst2 = list_next_entry(dst, lru);
> +               }
> +
> +               for (cpu = 0; cpu < total_mt_num; ++cpu)
> +                       queue_work(system_unbound_wq,
> +                                  (struct work_struct *)work_items[cpu]);
> +       } else {
> +               int num_xfer_per_thread = nr_items / total_mt_num;
> +               int per_cpu_item_idx;
> +
> +
> +               for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +                       INIT_WORK((struct work_struct *)work_items[cpu],
> +                                         copy_page_work_queue_thread);
> +
> +                       work_items[cpu]->num_items = num_xfer_per_thread +
> +                                       (cpu < (nr_items % total_mt_num));
> +               }
> +
> +               cpu = 0;
> +               per_cpu_item_idx = 0;
> +               item_idx = 0;
> +               dst = list_first_entry(dst_folios, struct folio, lru);
> +               dst2 = list_next_entry(dst, lru);
> +               list_for_each_entry_safe(src, src2, src_folios, lru) {
> +                       work_items[cpu]->item_list[per_cpu_item_idx].to =
> +                               page_address(&dst->page);
> +                       work_items[cpu]->item_list[per_cpu_item_idx].from =
> +                               page_address(&src->page);
> +                       work_items[cpu]->item_list[per_cpu_item_idx].chunk_size =
> +                               PAGE_SIZE * folio_nr_pages(src);
> +
> +                       VM_WARN_ON(folio_nr_pages(dst) !=
> +                                  folio_nr_pages(src));
> +
> +                       per_cpu_item_idx++;
> +                       item_idx++;
> +                       dst = dst2;
> +                       dst2 = list_next_entry(dst, lru);
> +
> +                       if (per_cpu_item_idx == work_items[cpu]->num_items) {
> +                               queue_work(system_unbound_wq,
> +                                       (struct work_struct *)work_items[cpu]);

Thanks for the great work.

By the way, is it okay to use work queue?  When the system is idle, this
patch will improve the migration performance, but when there are a lot
of other runnable tasks in the system, it might be worse than the
current one.  That's gonna be even worse if there are some other tasks
that wait for the migration to end.  It's worth noting that
padata_do_multithreaded() also uses work queue internally.

I think, at worst, the performance should be same as is.  Or am I
missing something?

	Byungchul

> +                               per_cpu_item_idx = 0;
> +                               cpu++;
> +                       }
> +               }
> +               if (item_idx != nr_items)
> +                       pr_warn("%s: only %d out of %d pages are transferred\n",
> +                               __func__, item_idx - 1, nr_items);
> +       }
> +
> +       /* Wait until it finishes  */
> +       for (i = 0; i < total_mt_num; ++i) {
> +               flush_work((struct work_struct *)work_items[i]);
> +               /* retry if any copy fails */
> +               if (work_items[i]->ret)
> +                       err = -EAGAIN;
> +       }
> +
> +free_work_items:
> +       for (cpu = 0; cpu < total_mt_num; ++cpu)
> +               kfree(work_items[cpu]);
> +
> +       return err;
> +}
> +
> +static struct kobject *mt_kobj_ref;
> +static struct kobj_attribute mt_offloading_attribute = __ATTR(offloading, 0664,
> +               mt_offloading_show, mt_offloading_set);
> +static struct kobj_attribute mt_threads_attribute = __ATTR(threads, 0664,
> +               mt_threads_show, mt_threads_set);
> +
> +static int __init cpu_mt_module_init(void)
> +{
> +       int ret = 0;
> +
> +       mt_kobj_ref = kobject_create_and_add("cpu_mt", kernel_kobj);
> +       if (!mt_kobj_ref)
> +               return -ENOMEM;
> +
> +       ret = sysfs_create_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> +       if (ret)
> +               goto out_offloading;
> +
> +       ret = sysfs_create_file(mt_kobj_ref, &mt_threads_attribute.attr);
> +       if (ret)
> +               goto out_threads;
> +
> +       is_dispatching = 0;
> +
> +       return 0;
> +
> +out_threads:
> +       sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> +out_offloading:
> +       kobject_put(mt_kobj_ref);
> +       return ret;
> +}
> +
> +static void __exit cpu_mt_module_exit(void)
> +{
> +       /* Stop the MT offloading to unload the module */
> +       mutex_lock(&migratecfg_mutex);
> +       if (is_dispatching == 1) {
> +               stop_offloading();
> +               is_dispatching = 0;
> +       }
> +       mutex_unlock(&migratecfg_mutex);
> +
> +       sysfs_remove_file(mt_kobj_ref, &mt_threads_attribute.attr);
> +       sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> +       kobject_put(mt_kobj_ref);
> +}
> +
> +module_init(cpu_mt_module_init);
> +module_exit(cpu_mt_module_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Zi Yan");
> +MODULE_DESCRIPTION("CPU_MT_COPY"); /* CPU Multithreaded Batch Migrator */
> --
> 2.43.0

Re: [RFC V3 6/9] mtcopy: introduce multi-threaded page copy routine

Posted by Garg, Shivank 3 months ago


On 10/20/2025 1:58 PM, Byungchul Park wrote:
> On Tue, Sep 23, 2025 at 05:47:41PM +0000, Shivank Garg wrote:
>> From: Zi Yan <ziy@nvidia.com>
>>
>> Now page copies are batched, multi-threaded page copy can be used to
>> increase page copy throughput.
>>
>> Enable using:
>> echo 1 >  /sys/kernel/cpu_mt/offloading
>> echo NR_THREADS >  /sys/kernel/cpu_mt/threads
>>
>> Disable:
>> echo 0 >  /sys/kernel/cpu_mt/offloading
>>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> Co-developed-by: Shivank Garg <shivankg@amd.com>
>> Signed-off-by: Shivank Garg <shivankg@amd.com>
>> ---
>>  drivers/Kconfig                        |   2 +
>>  drivers/Makefile                       |   3 +
>>  drivers/migoffcopy/Kconfig             |   9 +
>>  drivers/migoffcopy/Makefile            |   1 +
>>  drivers/migoffcopy/mtcopy/Makefile     |   1 +
>>  drivers/migoffcopy/mtcopy/copy_pages.c | 327 +++++++++++++++++++++++++
>>  6 files changed, 343 insertions(+)
>>  create mode 100644 drivers/migoffcopy/Kconfig
>>  create mode 100644 drivers/migoffcopy/Makefile
>>  create mode 100644 drivers/migoffcopy/mtcopy/Makefile
>>  create mode 100644 drivers/migoffcopy/mtcopy/copy_pages.c
>>
>> diff --git a/drivers/Kconfig b/drivers/Kconfig
>> index 4915a63866b0..d2cbc97a7683 100644
>> --- a/drivers/Kconfig
>> +++ b/drivers/Kconfig
>> @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig"
>>
>>  source "drivers/cdx/Kconfig"
>>
>> +source "drivers/migoffcopy/Kconfig"
>> +
>>  endmenu
>> diff --git a/drivers/Makefile b/drivers/Makefile
>> index b5749cf67044..5326d88cf31c 100644
>> --- a/drivers/Makefile
>> +++ b/drivers/Makefile
>> @@ -42,6 +42,9 @@ obj-y                         += clk/
>>  # really early.
>>  obj-$(CONFIG_DMADEVICES)       += dma/
>>
>> +# Migration copy Offload
>> +obj-$(CONFIG_OFFC_MIGRATION)   += migoffcopy/
>> +
>>  # SOC specific infrastructure drivers.
>>  obj-y                          += soc/
>>  obj-$(CONFIG_PM_GENERIC_DOMAINS)       += pmdomain/
>> diff --git a/drivers/migoffcopy/Kconfig b/drivers/migoffcopy/Kconfig
>> new file mode 100644
>> index 000000000000..e73698af3e72
>> --- /dev/null
>> +++ b/drivers/migoffcopy/Kconfig
>> @@ -0,0 +1,9 @@
>> +config MTCOPY_CPU
>> +       bool "Multi-Threaded Copy with CPU"
>> +       depends on OFFC_MIGRATION
>> +       default n
>> +       help
>> +         Interface MT COPY CPU driver for batch page migration
>> +         offloading. Say Y if you want to try offloading with
>> +         MultiThreaded CPU copy APIs.
>> +
>> diff --git a/drivers/migoffcopy/Makefile b/drivers/migoffcopy/Makefile
>> new file mode 100644
>> index 000000000000..0a3c356d67e6
>> --- /dev/null
>> +++ b/drivers/migoffcopy/Makefile
>> @@ -0,0 +1 @@
>> +obj-$(CONFIG_MTCOPY_CPU) += mtcopy/
>> diff --git a/drivers/migoffcopy/mtcopy/Makefile b/drivers/migoffcopy/mtcopy/Makefile
>> new file mode 100644
>> index 000000000000..b4d7da85eda9
>> --- /dev/null
>> +++ b/drivers/migoffcopy/mtcopy/Makefile
>> @@ -0,0 +1 @@
>> +obj-$(CONFIG_MTCOPY_CPU) += copy_pages.o
>> diff --git a/drivers/migoffcopy/mtcopy/copy_pages.c b/drivers/migoffcopy/mtcopy/copy_pages.c
>> new file mode 100644
>> index 000000000000..68e50de602d6
>> --- /dev/null
>> +++ b/drivers/migoffcopy/mtcopy/copy_pages.c
>> @@ -0,0 +1,327 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Parallel page copy routine.
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/kernel.h>
>> +#include <linux/printk.h>
>> +#include <linux/init.h>
>> +#include <linux/sysctl.h>
>> +#include <linux/sysfs.h>
>> +#include <linux/highmem.h>
>> +#include <linux/workqueue.h>
>> +#include <linux/slab.h>
>> +#include <linux/migrate.h>
>> +#include <linux/migrate_offc.h>
>> +
>> +#define MAX_NUM_COPY_THREADS 64
>> +
>> +unsigned int limit_mt_num = 4;
>> +static int is_dispatching;
>> +
>> +static int copy_page_lists_mt(struct list_head *dst_folios,
>> +               struct list_head *src_folios, unsigned int nr_items);
>> +
>> +static DEFINE_MUTEX(migratecfg_mutex);
>> +
>> +/* CPU Multithreaded Batch Migrator */
>> +struct migrator cpu_migrator = {
>> +       .name = "CPU_MT_COPY\0",
>> +       .migrate_offc = copy_page_lists_mt,
>> +       .owner = THIS_MODULE,
>> +};
>> +
>> +struct copy_item {
>> +       char *to;
>> +       char *from;
>> +       unsigned long chunk_size;
>> +};
>> +
>> +struct copy_page_info {
>> +       struct work_struct copy_page_work;
>> +       int ret;
>> +       unsigned long num_items;
>> +       struct copy_item item_list[];
>> +};
>> +
>> +static unsigned long copy_page_routine(char *vto, char *vfrom,
>> +       unsigned long chunk_size)
>> +{
>> +       return copy_mc_to_kernel(vto, vfrom, chunk_size);
>> +}
>> +
>> +static void copy_page_work_queue_thread(struct work_struct *work)
>> +{
>> +       struct copy_page_info *my_work = (struct copy_page_info *)work;
>> +       int i;
>> +
>> +       my_work->ret = 0;
>> +       for (i = 0; i < my_work->num_items; ++i)
>> +               my_work->ret |= !!copy_page_routine(my_work->item_list[i].to,
>> +                                       my_work->item_list[i].from,
>> +                                       my_work->item_list[i].chunk_size);
>> +}
>> +
>> +static ssize_t mt_offloading_set(struct kobject *kobj, struct kobj_attribute *attr,
>> +               const char *buf, size_t count)
>> +{
>> +       int ccode;
>> +       int action;
>> +
>> +       ccode = kstrtoint(buf, 0, &action);
>> +       if (ccode) {
>> +               pr_debug("(%s:) error parsing input %s\n", __func__, buf);
>> +               return ccode;
>> +       }
>> +
>> +       /*
>> +        * action is 0: User wants to disable MT offloading.
>> +        * action is 1: User wants to enable MT offloading.
>> +        */
>> +       switch (action) {
>> +       case 0:
>> +               mutex_lock(&migratecfg_mutex);
>> +               if (is_dispatching == 1) {
>> +                       stop_offloading();
>> +                       is_dispatching = 0;
>> +               } else
>> +                       pr_debug("MT migration offloading is already OFF\n");
>> +               mutex_unlock(&migratecfg_mutex);
>> +               break;
>> +       case 1:
>> +               mutex_lock(&migratecfg_mutex);
>> +               if (is_dispatching == 0) {
>> +                       start_offloading(&cpu_migrator);
>> +                       is_dispatching = 1;
>> +               } else
>> +                       pr_debug("MT migration offloading is already ON\n");
>> +               mutex_unlock(&migratecfg_mutex);
>> +               break;
>> +       default:
>> +               pr_debug("input should be zero or one, parsed as %d\n", action);
>> +       }
>> +       return sizeof(action);
>> +}
>> +
>> +static ssize_t mt_offloading_show(struct kobject *kobj,
>> +               struct kobj_attribute *attr, char *buf)
>> +{
>> +       return sysfs_emit(buf, "%d\n", is_dispatching);
>> +}
>> +
>> +static ssize_t mt_threads_set(struct kobject *kobj, struct kobj_attribute *attr,
>> +               const char *buf, size_t count)
>> +{
>> +       int ccode;
>> +       unsigned int threads;
>> +
>> +       ccode = kstrtouint(buf, 0, &threads);
>> +       if (ccode) {
>> +               pr_debug("(%s:) error parsing input %s\n", __func__, buf);
>> +               return ccode;
>> +       }
>> +
>> +       if (threads > 0 && threads <= MAX_NUM_COPY_THREADS) {
>> +               mutex_lock(&migratecfg_mutex);
>> +               limit_mt_num = threads;
>> +               mutex_unlock(&migratecfg_mutex);
>> +               pr_debug("MT threads set to %u\n", limit_mt_num);
>> +       } else {
>> +               pr_debug("Invalid thread count. Must be between 1 and %d\n", MAX_NUM_COPY_THREADS);
>> +               return -EINVAL;
>> +       }
>> +
>> +       return count;
>> +}
>> +
>> +static ssize_t mt_threads_show(struct kobject *kobj,
>> +               struct kobj_attribute *attr, char *buf)
>> +{
>> +       return sysfs_emit(buf, "%u\n", limit_mt_num);
>> +}
>> +
>> +int copy_page_lists_mt(struct list_head *dst_folios,
>> +               struct list_head *src_folios, unsigned int nr_items)
>> +{
>> +       struct copy_page_info *work_items[MAX_NUM_COPY_THREADS] = {0};
>> +       unsigned int total_mt_num = limit_mt_num;
>> +       struct folio *src, *src2, *dst, *dst2;
>> +       int max_items_per_thread;
>> +       int item_idx;
>> +       int err = 0;
>> +       int cpu;
>> +       int i;
>> +
>> +       if (IS_ENABLED(CONFIG_HIGHMEM))
>> +               return -EOPNOTSUPP;
>> +
>> +       /* Each threads get part of each page, if nr_items < totla_mt_num */
>> +       if (nr_items < total_mt_num)
>> +               max_items_per_thread = nr_items;
>> +       else
>> +               max_items_per_thread = (nr_items / total_mt_num) +
>> +                               ((nr_items % total_mt_num) ? 1 : 0);
>> +
>> +
>> +       for (cpu = 0; cpu < total_mt_num; ++cpu) {
>> +               work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
>> +                                               sizeof(struct copy_item) *
>> +                                                       max_items_per_thread,
>> +                                         GFP_NOWAIT);
>> +               if (!work_items[cpu]) {
>> +                       err = -ENOMEM;
>> +                       goto free_work_items;
>> +               }
>> +       }
>> +
>> +       if (nr_items < total_mt_num) {
>> +               for (cpu = 0; cpu < total_mt_num; ++cpu) {
>> +                       INIT_WORK((struct work_struct *)work_items[cpu],
>> +                                         copy_page_work_queue_thread);
>> +                       work_items[cpu]->num_items = max_items_per_thread;
>> +               }
>> +
>> +               item_idx = 0;
>> +               dst = list_first_entry(dst_folios, struct folio, lru);
>> +               dst2 = list_next_entry(dst, lru);
>> +               list_for_each_entry_safe(src, src2, src_folios, lru) {
>> +                       unsigned long chunk_size = PAGE_SIZE * folio_nr_pages(src) / total_mt_num;
>> +                       char *vfrom = page_address(&src->page);
>> +                       char *vto = page_address(&dst->page);
>> +
>> +                       VM_WARN_ON(PAGE_SIZE * folio_nr_pages(src) % total_mt_num);
>> +                       VM_WARN_ON(folio_nr_pages(dst) != folio_nr_pages(src));
>> +
>> +                       for (cpu = 0; cpu < total_mt_num; ++cpu) {
>> +                               work_items[cpu]->item_list[item_idx].to =
>> +                                       vto + chunk_size * cpu;
>> +                               work_items[cpu]->item_list[item_idx].from =
>> +                                       vfrom + chunk_size * cpu;
>> +                               work_items[cpu]->item_list[item_idx].chunk_size =
>> +                                       chunk_size;
>> +                       }
>> +
>> +                       item_idx++;
>> +                       dst = dst2;
>> +                       dst2 = list_next_entry(dst, lru);
>> +               }
>> +
>> +               for (cpu = 0; cpu < total_mt_num; ++cpu)
>> +                       queue_work(system_unbound_wq,
>> +                                  (struct work_struct *)work_items[cpu]);
>> +       } else {
>> +               int num_xfer_per_thread = nr_items / total_mt_num;
>> +               int per_cpu_item_idx;
>> +
>> +
>> +               for (cpu = 0; cpu < total_mt_num; ++cpu) {
>> +                       INIT_WORK((struct work_struct *)work_items[cpu],
>> +                                         copy_page_work_queue_thread);
>> +
>> +                       work_items[cpu]->num_items = num_xfer_per_thread +
>> +                                       (cpu < (nr_items % total_mt_num));
>> +               }
>> +
>> +               cpu = 0;
>> +               per_cpu_item_idx = 0;
>> +               item_idx = 0;
>> +               dst = list_first_entry(dst_folios, struct folio, lru);
>> +               dst2 = list_next_entry(dst, lru);
>> +               list_for_each_entry_safe(src, src2, src_folios, lru) {
>> +                       work_items[cpu]->item_list[per_cpu_item_idx].to =
>> +                               page_address(&dst->page);
>> +                       work_items[cpu]->item_list[per_cpu_item_idx].from =
>> +                               page_address(&src->page);
>> +                       work_items[cpu]->item_list[per_cpu_item_idx].chunk_size =
>> +                               PAGE_SIZE * folio_nr_pages(src);
>> +
>> +                       VM_WARN_ON(folio_nr_pages(dst) !=
>> +                                  folio_nr_pages(src));
>> +
>> +                       per_cpu_item_idx++;
>> +                       item_idx++;
>> +                       dst = dst2;
>> +                       dst2 = list_next_entry(dst, lru);
>> +
>> +                       if (per_cpu_item_idx == work_items[cpu]->num_items) {
>> +                               queue_work(system_unbound_wq,
>> +                                       (struct work_struct *)work_items[cpu]);
> 
> Thanks for the great work.
> 
> By the way, is it okay to use work queue?  When the system is idle, this
> patch will improve the migration performance, but when there are a lot
> of other runnable tasks in the system, it might be worse than the
> current one.  That's gonna be even worse if there are some other tasks
> that wait for the migration to end.  It's worth noting that
> padata_do_multithreaded() also uses work queue internally.
> 
> I think, at worst, the performance should be same as is.  Or am I
> missing something?
> 
> 	Byungchul

Hi Byungchul,

This was addressed by Zi in the mail:
https://lore.kernel.org/linux-mm/61F6152C-A91E-453B-9521-34B7497AE532@nvidia.com

So, there are some specific use cases that can benefit significantly when CPU cores are idle
while GPUs or accelerators handle most of the workload.
In such scenarios, migrating pages to and from device memory (GPU or AI accelerator) quickly
is critical and ensure hot data is always available for accelerators.

Thanks,
Shivank

> 
>> +                               per_cpu_item_idx = 0;
>> +                               cpu++;
>> +                       }
>> +               }
>> +               if (item_idx != nr_items)
>> +                       pr_warn("%s: only %d out of %d pages are transferred\n",
>> +                               __func__, item_idx - 1, nr_items);
>> +       }
>> +
>> +       /* Wait until it finishes  */
>> +       for (i = 0; i < total_mt_num; ++i) {
>> +               flush_work((struct work_struct *)work_items[i]);
>> +               /* retry if any copy fails */
>> +               if (work_items[i]->ret)
>> +                       err = -EAGAIN;
>> +       }
>> +
>> +free_work_items:
>> +       for (cpu = 0; cpu < total_mt_num; ++cpu)
>> +               kfree(work_items[cpu]);
>> +
>> +       return err;
>> +}
>> +
>> +static struct kobject *mt_kobj_ref;
>> +static struct kobj_attribute mt_offloading_attribute = __ATTR(offloading, 0664,
>> +               mt_offloading_show, mt_offloading_set);
>> +static struct kobj_attribute mt_threads_attribute = __ATTR(threads, 0664,
>> +               mt_threads_show, mt_threads_set);
>> +
>> +static int __init cpu_mt_module_init(void)
>> +{
>> +       int ret = 0;
>> +
>> +       mt_kobj_ref = kobject_create_and_add("cpu_mt", kernel_kobj);
>> +       if (!mt_kobj_ref)
>> +               return -ENOMEM;
>> +
>> +       ret = sysfs_create_file(mt_kobj_ref, &mt_offloading_attribute.attr);
>> +       if (ret)
>> +               goto out_offloading;
>> +
>> +       ret = sysfs_create_file(mt_kobj_ref, &mt_threads_attribute.attr);
>> +       if (ret)
>> +               goto out_threads;
>> +
>> +       is_dispatching = 0;
>> +
>> +       return 0;
>> +
>> +out_threads:
>> +       sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
>> +out_offloading:
>> +       kobject_put(mt_kobj_ref);
>> +       return ret;
>> +}
>> +
>> +static void __exit cpu_mt_module_exit(void)
>> +{
>> +       /* Stop the MT offloading to unload the module */
>> +       mutex_lock(&migratecfg_mutex);
>> +       if (is_dispatching == 1) {
>> +               stop_offloading();
>> +               is_dispatching = 0;
>> +       }
>> +       mutex_unlock(&migratecfg_mutex);
>> +
>> +       sysfs_remove_file(mt_kobj_ref, &mt_threads_attribute.attr);
>> +       sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
>> +       kobject_put(mt_kobj_ref);
>> +}
>> +
>> +module_init(cpu_mt_module_init);
>> +module_exit(cpu_mt_module_exit);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("Zi Yan");
>> +MODULE_DESCRIPTION("CPU_MT_COPY"); /* CPU Multithreaded Batch Migrator */
>> --
>> 2.43.0

Re: [RFC V3 6/9] mtcopy: introduce multi-threaded page copy routine

Posted by Byungchul Park 2 months, 4 weeks ago

On Thu, Nov 06, 2025 at 11:57:54AM +0530, Garg, Shivank wrote:
> On 10/20/2025 1:58 PM, Byungchul Park wrote:
> > Thanks for the great work.
> >
> > By the way, is it okay to use work queue?  When the system is idle, this
> > patch will improve the migration performance, but when there are a lot
> > of other runnable tasks in the system, it might be worse than the
> > current one.  That's gonna be even worse if there are some other tasks
> > that wait for the migration to end.  It's worth noting that
> > padata_do_multithreaded() also uses work queue internally.
> >
> > I think, at worst, the performance should be same as is.  Or am I
> > missing something?
> >
> >       Byungchul
> 
> Hi Byungchul,
> 
> This was addressed by Zi in the mail:
> https://lore.kernel.org/linux-mm/61F6152C-A91E-453B-9521-34B7497AE532@nvidia.com
> 
> So, there are some specific use cases that can benefit significantly when CPU cores are idle

Sure.  I think so.  I meant the mechanism using multi-threads would
better be performed for faster migration when a system is idle, but it'd
better avoid the aggressiveness when the system is busy.

Or we might observe a performance degradation due to this work.

	Byungchul

> while GPUs or accelerators handle most of the workload.
> In such scenarios, migrating pages to and from device memory (GPU or AI accelerator) quickly
> is critical and ensure hot data is always available for accelerators.
> 
> Thanks,
> Shivank
> 
> >
> >> +                               per_cpu_item_idx = 0;
> >> +                               cpu++;
> >> +                       }
> >> +               }
> >> +               if (item_idx != nr_items)
> >> +                       pr_warn("%s: only %d out of %d pages are transferred\n",
> >> +                               __func__, item_idx - 1, nr_items);
> >> +       }
> >> +
> >> +       /* Wait until it finishes  */
> >> +       for (i = 0; i < total_mt_num; ++i) {
> >> +               flush_work((struct work_struct *)work_items[i]);
> >> +               /* retry if any copy fails */
> >> +               if (work_items[i]->ret)
> >> +                       err = -EAGAIN;
> >> +       }
> >> +
> >> +free_work_items:
> >> +       for (cpu = 0; cpu < total_mt_num; ++cpu)
> >> +               kfree(work_items[cpu]);
> >> +
> >> +       return err;
> >> +}
> >> +
> >> +static struct kobject *mt_kobj_ref;
> >> +static struct kobj_attribute mt_offloading_attribute = __ATTR(offloading, 0664,
> >> +               mt_offloading_show, mt_offloading_set);
> >> +static struct kobj_attribute mt_threads_attribute = __ATTR(threads, 0664,
> >> +               mt_threads_show, mt_threads_set);
> >> +
> >> +static int __init cpu_mt_module_init(void)
> >> +{
> >> +       int ret = 0;
> >> +
> >> +       mt_kobj_ref = kobject_create_and_add("cpu_mt", kernel_kobj);
> >> +       if (!mt_kobj_ref)
> >> +               return -ENOMEM;
> >> +
> >> +       ret = sysfs_create_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> >> +       if (ret)
> >> +               goto out_offloading;
> >> +
> >> +       ret = sysfs_create_file(mt_kobj_ref, &mt_threads_attribute.attr);
> >> +       if (ret)
> >> +               goto out_threads;
> >> +
> >> +       is_dispatching = 0;
> >> +
> >> +       return 0;
> >> +
> >> +out_threads:
> >> +       sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> >> +out_offloading:
> >> +       kobject_put(mt_kobj_ref);
> >> +       return ret;
> >> +}
> >> +
> >> +static void __exit cpu_mt_module_exit(void)
> >> +{
> >> +       /* Stop the MT offloading to unload the module */
> >> +       mutex_lock(&migratecfg_mutex);
> >> +       if (is_dispatching == 1) {
> >> +               stop_offloading();
> >> +               is_dispatching = 0;
> >> +       }
> >> +       mutex_unlock(&migratecfg_mutex);
> >> +
> >> +       sysfs_remove_file(mt_kobj_ref, &mt_threads_attribute.attr);
> >> +       sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> >> +       kobject_put(mt_kobj_ref);
> >> +}
> >> +
> >> +module_init(cpu_mt_module_init);
> >> +module_exit(cpu_mt_module_exit);
> >> +
> >> +MODULE_LICENSE("GPL");
> >> +MODULE_AUTHOR("Zi Yan");
> >> +MODULE_DESCRIPTION("CPU_MT_COPY"); /* CPU Multithreaded Batch Migrator */
> >> --
> >> 2.43.0

Re: [RFC V3 6/9] mtcopy: introduce multi-threaded page copy routine

Posted by Jonathan Cameron 4 months, 1 week ago

On Tue, 23 Sep 2025 17:47:41 +0000
Shivank Garg <shivankg@amd.com> wrote:

> From: Zi Yan <ziy@nvidia.com>
> 
> Now page copies are batched, multi-threaded page copy can be used to
> increase page copy throughput.
> 
> Enable using:
> echo 1 >  /sys/kernel/cpu_mt/offloading
> echo NR_THREADS >  /sys/kernel/cpu_mt/threads

I guess this order is to show that you can update threads with it on
as system load changes.

Maybe call this out explicitly?

> 
> Disable:
> echo 0 >  /sys/kernel/cpu_mt/offloading
> 
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> Co-developed-by: Shivank Garg <shivankg@amd.com>
> Signed-off-by: Shivank Garg <shivankg@amd.com>

Various other things inline.

Thanks,

Jonathan

> diff --git a/drivers/migoffcopy/Kconfig b/drivers/migoffcopy/Kconfig
> new file mode 100644
> index 000000000000..e73698af3e72
> --- /dev/null
> +++ b/drivers/migoffcopy/Kconfig
> @@ -0,0 +1,9 @@
> +config MTCOPY_CPU
> +       bool "Multi-Threaded Copy with CPU"
> +       depends on OFFC_MIGRATION
> +       default n
> +       help
> +         Interface MT COPY CPU driver for batch page migration
> +         offloading. Say Y if you want to try offloading with
> +         MultiThreaded CPU copy APIs.
Try?  I'd be more positive in the help text :)

> +

> diff --git a/drivers/migoffcopy/mtcopy/copy_pages.c b/drivers/migoffcopy/mtcopy/copy_pages.c
> new file mode 100644
> index 000000000000..68e50de602d6
> --- /dev/null
> +++ b/drivers/migoffcopy/mtcopy/copy_pages.c
> @@ -0,0 +1,327 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Parallel page copy routine.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
Generally we are trying to get away from anything including kernel.h
directly.  There is relatively little still in there, so maybe check
you actually need it here.

> +#include <linux/printk.h>
> +#include <linux/init.h>
> +#include <linux/sysctl.h>
> +#include <linux/sysfs.h>
> +#include <linux/highmem.h>
> +#include <linux/workqueue.h>
> +#include <linux/slab.h>
> +#include <linux/migrate.h>
> +#include <linux/migrate_offc.h>
> +
> +#define MAX_NUM_COPY_THREADS 64
Add a comment on why this number.

> +
> +struct copy_page_info {
> +	struct work_struct copy_page_work;
> +	int ret;
> +	unsigned long num_items;
> +	struct copy_item item_list[];
__counted_by

> +};
> +
> +static unsigned long copy_page_routine(char *vto, char *vfrom,
> +	unsigned long chunk_size)
> +{
> +	return copy_mc_to_kernel(vto, vfrom, chunk_size);
> +}
> +
> +static void copy_page_work_queue_thread(struct work_struct *work)
> +{
> +	struct copy_page_info *my_work = (struct copy_page_info *)work;

container_of()

> +	int i;
> +
> +	my_work->ret = 0;
> +	for (i = 0; i < my_work->num_items; ++i)
> +		my_work->ret |= !!copy_page_routine(my_work->item_list[i].to,
> +					my_work->item_list[i].from,
> +					my_work->item_list[i].chunk_size);
> +}
> +
> +static ssize_t mt_offloading_set(struct kobject *kobj, struct kobj_attribute *attr,
> +		const char *buf, size_t count)
> +{
> +	int ccode;
> +	int action;
> +
> +	ccode = kstrtoint(buf, 0, &action);
> +	if (ccode) {
> +		pr_debug("(%s:) error parsing input %s\n", __func__, buf);
> +		return ccode;
> +	}
> +
> +	/*
> +	 * action is 0: User wants to disable MT offloading.
> +	 * action is 1: User wants to enable MT offloading.
> +	 */
> +	switch (action) {
> +	case 0:
> +		mutex_lock(&migratecfg_mutex);
> +		if (is_dispatching == 1) {
> +			stop_offloading();
> +			is_dispatching = 0;
> +		} else
> +			pr_debug("MT migration offloading is already OFF\n");
> +		mutex_unlock(&migratecfg_mutex);
> +		break;
> +	case 1:
> +		mutex_lock(&migratecfg_mutex);
> +		if (is_dispatching == 0) {
> +			start_offloading(&cpu_migrator);
> +			is_dispatching = 1;
> +		} else
> +			pr_debug("MT migration offloading is already ON\n");
> +		mutex_unlock(&migratecfg_mutex);
> +		break;
> +	default:
> +		pr_debug("input should be zero or one, parsed as %d\n", action);
> +	}
> +	return sizeof(action);
> +}
> +
> +static ssize_t mt_offloading_show(struct kobject *kobj,
> +		struct kobj_attribute *attr, char *buf)
> +{
> +	return sysfs_emit(buf, "%d\n", is_dispatching);
> +}
> +
> +static ssize_t mt_threads_set(struct kobject *kobj, struct kobj_attribute *attr,
> +		const char *buf, size_t count)
> +{
> +	int ccode;
> +	unsigned int threads;
> +
> +	ccode = kstrtouint(buf, 0, &threads);
> +	if (ccode) {
> +		pr_debug("(%s:) error parsing input %s\n", __func__, buf);

I'm fairly sure you can use dynamic debug here to add the __func__ so no need
to do it by hand.

> +		return ccode;
> +	}
> +
> +	if (threads > 0 && threads <= MAX_NUM_COPY_THREADS) {
> +		mutex_lock(&migratecfg_mutex);
> +		limit_mt_num = threads;
> +		mutex_unlock(&migratecfg_mutex);
> +		pr_debug("MT threads set to %u\n", limit_mt_num);
> +	} else {

I'd flip the logic to test first for in range and exit if not. Then
no indent on the good path.

> +		pr_debug("Invalid thread count. Must be between 1 and %d\n", MAX_NUM_COPY_THREADS);
> +		return -EINVAL;
> +	}
> +
> +	return count;
> +}

> +int copy_page_lists_mt(struct list_head *dst_folios,
> +		struct list_head *src_folios, unsigned int nr_items)
> +{
> +	struct copy_page_info *work_items[MAX_NUM_COPY_THREADS] = {0};

{} or { NULL } perhaps given it's an array of pointers.

> +	unsigned int total_mt_num = limit_mt_num;
> +	struct folio *src, *src2, *dst, *dst2;
> +	int max_items_per_thread;
> +	int item_idx;
> +	int err = 0;
> +	int cpu;
> +	int i;
> +
> +	if (IS_ENABLED(CONFIG_HIGHMEM))
> +		return -EOPNOTSUPP;
> +
> +	/* Each threads get part of each page, if nr_items < totla_mt_num */
Each thread gets part of each page

total_mt_num  Though isn't the comment talking about when it's greater than or equal?


> +	if (nr_items < total_mt_num)
> +		max_items_per_thread = nr_items;
> +	else
> +		max_items_per_thread = (nr_items / total_mt_num) +
> +				((nr_items % total_mt_num) ? 1 : 0);
> +
> +
> +	for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +		work_items[cpu] = kzalloc(sizeof(struct copy_page_info) +
> +						sizeof(struct copy_item) *
> +							max_items_per_thread,

struct_size() looks appropriate here.

> +					  GFP_NOWAIT);
> +		if (!work_items[cpu]) {
> +			err = -ENOMEM;
> +			goto free_work_items;
> +		}
> +	}
> +
> +	if (nr_items < total_mt_num) {
> +		for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +			INIT_WORK((struct work_struct *)work_items[cpu],
Why not avoid having to know it is at start of structure by using
work_items[cpu]->copy_page_work instead.

> +					  copy_page_work_queue_thread);
> +			work_items[cpu]->num_items = max_items_per_thread;
> +		}
> +
> +		item_idx = 0;
> +		dst = list_first_entry(dst_folios, struct folio, lru);
> +		dst2 = list_next_entry(dst, lru);
> +		list_for_each_entry_safe(src, src2, src_folios, lru) {
> +			unsigned long chunk_size = PAGE_SIZE * folio_nr_pages(src) / total_mt_num;
> +			char *vfrom = page_address(&src->page);
> +			char *vto = page_address(&dst->page);
> +
> +			VM_WARN_ON(PAGE_SIZE * folio_nr_pages(src) % total_mt_num);
> +			VM_WARN_ON(folio_nr_pages(dst) != folio_nr_pages(src));
> +
> +			for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +				work_items[cpu]->item_list[item_idx].to =
> +					vto + chunk_size * cpu;
> +				work_items[cpu]->item_list[item_idx].from =
> +					vfrom + chunk_size * cpu;
> +				work_items[cpu]->item_list[item_idx].chunk_size =
> +					chunk_size;
> +			}
> +
> +			item_idx++;
> +			dst = dst2;
> +			dst2 = list_next_entry(dst, lru);
> +		}
> +
> +		for (cpu = 0; cpu < total_mt_num; ++cpu)
> +			queue_work(system_unbound_wq,
> +				   (struct work_struct *)work_items[cpu]);

As above. If you want the work struct, using the member that is the right type.

> +	} else {
> +		int num_xfer_per_thread = nr_items / total_mt_num;
> +		int per_cpu_item_idx;
> +
> +
> +		for (cpu = 0; cpu < total_mt_num; ++cpu) {
> +			INIT_WORK((struct work_struct *)work_items[cpu],

Same again.

> +					  copy_page_work_queue_thread);
> +
> +			work_items[cpu]->num_items = num_xfer_per_thread +
> +					(cpu < (nr_items % total_mt_num));
> +		}
> +
> +		cpu = 0;
> +		per_cpu_item_idx = 0;
> +		item_idx = 0;
> +		dst = list_first_entry(dst_folios, struct folio, lru);
> +		dst2 = list_next_entry(dst, lru);
> +		list_for_each_entry_safe(src, src2, src_folios, lru) {
> +			work_items[cpu]->item_list[per_cpu_item_idx].to =
> +				page_address(&dst->page);
> +			work_items[cpu]->item_list[per_cpu_item_idx].from =
> +				page_address(&src->page);
> +			work_items[cpu]->item_list[per_cpu_item_idx].chunk_size =
> +				PAGE_SIZE * folio_nr_pages(src);
> +
> +			VM_WARN_ON(folio_nr_pages(dst) !=
> +				   folio_nr_pages(src));
> +
> +			per_cpu_item_idx++;
> +			item_idx++;
> +			dst = dst2;
> +			dst2 = list_next_entry(dst, lru);
> +
> +			if (per_cpu_item_idx == work_items[cpu]->num_items) {
> +				queue_work(system_unbound_wq,
> +					(struct work_struct *)work_items[cpu]);
and one more.
> +				per_cpu_item_idx = 0;
> +				cpu++;
> +			}
> +		}
> +		if (item_idx != nr_items)
> +			pr_warn("%s: only %d out of %d pages are transferred\n",
> +				__func__, item_idx - 1, nr_items);
> +	}
> +
> +	/* Wait until it finishes  */
> +	for (i = 0; i < total_mt_num; ++i) {
> +		flush_work((struct work_struct *)work_items[i]);
> +		/* retry if any copy fails */
> +		if (work_items[i]->ret)
> +			err = -EAGAIN;
> +	}
> +
> +free_work_items:
> +	for (cpu = 0; cpu < total_mt_num; ++cpu)
> +		kfree(work_items[cpu]);
> +
> +	return err;
> +}
> +
> +static struct kobject *mt_kobj_ref;
> +static struct kobj_attribute mt_offloading_attribute = __ATTR(offloading, 0664,
> +		mt_offloading_show, mt_offloading_set);
> +static struct kobj_attribute mt_threads_attribute = __ATTR(threads, 0664,
> +		mt_threads_show, mt_threads_set);
> +
> +static int __init cpu_mt_module_init(void)
> +{
> +	int ret = 0;

Always set before use so don't init here.

> +
> +	mt_kobj_ref = kobject_create_and_add("cpu_mt", kernel_kobj);
> +	if (!mt_kobj_ref)
> +		return -ENOMEM;
> +
> +	ret = sysfs_create_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> +	if (ret)
> +		goto out_offloading;
> +
> +	ret = sysfs_create_file(mt_kobj_ref, &mt_threads_attribute.attr);
> +	if (ret)
> +		goto out_threads;
> +
> +	is_dispatching = 0;
> +
> +	return 0;
> +
> +out_threads:
> +	sysfs_remove_file(mt_kobj_ref, &mt_offloading_attribute.attr);
> +out_offloading:
> +	kobject_put(mt_kobj_ref);
> +	return ret;
> +}

> +module_init(cpu_mt_module_init);
> +module_exit(cpu_mt_module_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Zi Yan");
> +MODULE_DESCRIPTION("CPU_MT_COPY"); /* CPU Multithreaded Batch Migrator */

If a module description needs a comment after it I'd rewrite that description!