The driver will create a file of `/dev/hisi_l3c` on init, mmap
operations to it will allocate a memory region that is guaranteed to be
placed in L3 cache.
The driver also provides unmap() to deallocated the locked memory.
The driver also provides an ioctl interface for user to get cache lock
information, such as lock restrictions and locked sizes.
Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
---
.../userspace-api/ioctl/ioctl-number.rst | 1 +
MAINTAINERS | 6 +
drivers/soc/hisilicon/Kconfig | 11 +
drivers/soc/hisilicon/Makefile | 2 +
drivers/soc/hisilicon/hisi_soc_l3c.c | 357 ++++++++++++++++++
include/uapi/misc/hisi_l3c.h | 28 ++
6 files changed, 405 insertions(+)
create mode 100644 drivers/soc/hisilicon/hisi_soc_l3c.c
create mode 100644 include/uapi/misc/hisi_l3c.h
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 7232b3544cec..439c5bcbfa94 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -387,6 +387,7 @@ Code Seq# Include File Comments
<mailto:linux-hyperv@vger.kernel.org>
0xBA 00-0F uapi/linux/liveupdate.h Pasha Tatashin
<mailto:pasha.tatashin@soleen.com>
+0xBB all uapi/misc/hisi_soc_cache.h HiSilicon SoC cache driver
0xC0 00-0F linux/usb/iowarrior.h
0xCA 00-0F uapi/misc/cxl.h Dead since 6.15
0xCA 10-2F uapi/misc/ocxl.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 0efa8cc6775b..247df6e69c10 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11375,6 +11375,12 @@ F: Documentation/ABI/testing/sysfs-devices-platform-kunpeng_hccs
F: drivers/soc/hisilicon/kunpeng_hccs.c
F: drivers/soc/hisilicon/kunpeng_hccs.h
+HISILICON SOC L3C DRIVER
+M: Yushan Wang <wangyushan12@huawei.com>
+S: Maintained
+F: drivers/soc/hisilicon/hisi_soc_l3c.c
+F: include/uapi/misc/hisi_l3c.h
+
HISILICON LPC BUS DRIVER
M: Jay Fang <f.fangjian@huawei.com>
S: Maintained
diff --git a/drivers/soc/hisilicon/Kconfig b/drivers/soc/hisilicon/Kconfig
index 6d7c244d2e78..8f4202e2d8d9 100644
--- a/drivers/soc/hisilicon/Kconfig
+++ b/drivers/soc/hisilicon/Kconfig
@@ -21,4 +21,15 @@ config KUNPENG_HCCS
health status and port information of HCCS, or reducing system
power consumption on Kunpeng SoC.
+config HISI_SOC_L3C
+ bool "HiSilicon L3 Cache device driver"
+ depends on ACPI
+ depends on ARM64 || COMPILE_TEST
+ help
+ This driver provides the functions to lock L3 cache entries from
+ being evicted for better performance.
+
+ This driver can be built as a module. If so, the module will be
+ called hisi_soc_l3c.
+
endmenu
diff --git a/drivers/soc/hisilicon/Makefile b/drivers/soc/hisilicon/Makefile
index 226e747e70d6..16ff2c73c4a5 100644
--- a/drivers/soc/hisilicon/Makefile
+++ b/drivers/soc/hisilicon/Makefile
@@ -1,2 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_KUNPENG_HCCS) += kunpeng_hccs.o
+
+obj-$(CONFIG_HISI_SOC_L3C) += hisi_soc_l3c.o
diff --git a/drivers/soc/hisilicon/hisi_soc_l3c.c b/drivers/soc/hisilicon/hisi_soc_l3c.c
new file mode 100644
index 000000000000..b6f6d5bdd4e5
--- /dev/null
+++ b/drivers/soc/hisilicon/hisi_soc_l3c.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for HiSilicon L3 cache.
+ *
+ * Copyright (c) 2025 HiSilicon Technologies Co., Ltd.
+ * Author: Yushan Wang <wangyushan12@huawei.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cleanup.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/spinlock_types.h>
+#include <linux/types.h>
+
+#include <uapi/misc/hisi_l3c.h>
+
+#define to_hisi_l3c(p) container_of((p), struct hisi_l3c, comp)
+
+/**
+ * struct hisi_soc_comp - Struct of HiSilicon SoC cache components.
+ *
+ * @node: list node of hisi_soc_comp_list.
+ * @ops: possible operations a component may perform.
+ * @affinity_mask: cpus that associate with this component.
+ * @private: component specific data.
+ */
+struct hisi_soc_comp {
+ struct list_head node;
+ struct hisi_soc_comp_ops *ops;
+ cpumask_t affinity_mask;
+ void *private;
+};
+
+/**
+ * struct hisi_soc_comp_ops - Callbacks for SoC cache drivers to handle
+ * operation requests.
+ *
+ * @do_lock: lock certain region of L3 cache from being evicted.
+ * @poll_lock_done: check if the lock operation has succeeded.
+ * @do_unlock: unlock the locked region of L3 cache back to normal.
+ * @poll_unlock_done: check if the unlock operation has succeeded.
+ operation requests.
+ *
+ * Operations are decoupled into two phases so that framework does not have
+ * to wait for one operation to finish before calling the next when multiple
+ * hardwares onboard.
+ *
+ * Implementers must implement the functions in pairs. Implementation should
+ * return -EBUSY when:
+ * - insufficient resources are available to perform the operation.
+ * - previously raised operation is not finished.
+ * - new operations (do_lock(), do_unlock() etc.) to the same address
+ * before corresponding done functions being called.
+ */
+struct hisi_soc_comp_ops {
+ int (*do_lock)(struct hisi_soc_comp *comp, phys_addr_t addr, size_t size);
+ int (*poll_lock_done)(struct hisi_soc_comp *comp, phys_addr_t addr, size_t size);
+ int (*do_unlock)(struct hisi_soc_comp *comp, phys_addr_t addr);
+ int (*poll_unlock_done)(struct hisi_soc_comp *comp, phys_addr_t addr);
+};
+
+struct hisi_l3c_lock_region {
+ /* physical address of the arena allocated for aligned address */
+ unsigned long arena_start;
+ /* VMA region of locked memory for future release */
+ unsigned long vm_start;
+ unsigned long vm_end;
+ phys_addr_t addr;
+ size_t size;
+ /* Return value of cache lock call */
+ int status;
+ int cpu;
+};
+
+struct hisi_soc_comp_list {
+ struct list_head node;
+ /* protects list of HiSilicon SoC cache components */
+ spinlock_t lock;
+};
+
+static struct hisi_soc_comp_list l3c_devs;
+
+static int hisi_l3c_lock(int cpu, phys_addr_t addr, size_t size)
+{
+ struct hisi_soc_comp *comp;
+ int ret;
+
+ guard(spinlock)(&l3c_devs.lock);
+
+ /* When there is no instance onboard, no locked memory is available. */
+ if (list_empty(&l3c_devs.node))
+ return -ENOMEM;
+
+ /* Lock need to be performed on each channel of associated L3 cache. */
+ list_for_each_entry(comp, &l3c_devs.node, node) {
+ if (!cpumask_test_cpu(cpu, &comp->affinity_mask))
+ continue;
+ ret = comp->ops->do_lock(comp, addr, size);
+ if (ret)
+ return ret;
+ }
+
+ list_for_each_entry(comp, &l3c_devs.node, node) {
+ if (!cpumask_test_cpu(cpu, &comp->affinity_mask))
+ continue;
+ ret = comp->ops->poll_lock_done(comp, addr, size);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int hisi_l3c_unlock(int cpu, phys_addr_t addr)
+{
+ struct hisi_soc_comp *comp;
+ int ret;
+
+ guard(spinlock)(&l3c_devs.lock);
+
+ if (list_empty(&l3c_devs.node))
+ return -EINVAL;
+
+ /* Perform unlock on each channel of associated L3 cache. */
+ list_for_each_entry(comp, &l3c_devs.node, node) {
+ if (!cpumask_test_cpu(cpu, &comp->affinity_mask))
+ continue;
+ ret = comp->ops->do_unlock(comp, addr);
+ if (ret)
+ return ret;
+ }
+
+ list_for_each_entry(comp, &l3c_devs.node, node) {
+ if (!cpumask_test_cpu(cpu, &comp->affinity_mask))
+ continue;
+ ret = comp->ops->poll_unlock_done(comp, addr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void hisi_soc_comp_add(struct hisi_soc_comp *comp)
+{
+ guard(spinlock)(&l3c_devs.lock);
+ list_add_tail(&comp->node, &l3c_devs.node);
+}
+
+/* Null @comp means to delete all instances. */
+static int hisi_soc_comp_del(struct hisi_soc_comp *comp)
+{
+ struct hisi_soc_comp *entry, *tmp;
+
+ guard(spinlock)(&l3c_devs.lock);
+ list_for_each_entry_safe(entry, tmp, &l3c_devs.node, node) {
+ if (comp && comp != entry)
+ continue;
+
+ list_del(&entry->node);
+
+ /* Only continue to delete nodes when @comp is NULL */
+ if (comp)
+ break;
+ }
+
+ return 0;
+}
+
+static void hisi_l3c_vm_open(struct vm_area_struct *vma)
+{
+ struct hisi_l3c_lock_region *clr = vma->vm_private_data;
+
+ /*
+ * Only perform cache lock when the vma passed in is created in
+ * hisi_l3c_mmap.
+ */
+ if (clr->vm_start != vma->vm_start || clr->vm_end != vma->vm_end)
+ return;
+
+ clr->status = hisi_l3c_lock(clr->cpu, clr->addr, clr->size);
+}
+
+static void hisi_l3c_vm_close(struct vm_area_struct *vma)
+{
+ struct hisi_l3c_lock_region *clr = vma->vm_private_data;
+ int order = get_order(clr->size);
+
+ /*
+ * Only perform cache unlock when the vma passed in is created
+ * in hisi_l3c_mmap.
+ */
+ if (clr->vm_start != vma->vm_start || clr->vm_end != vma->vm_end)
+ return;
+
+ hisi_l3c_unlock(clr->cpu, clr->addr);
+
+ free_contig_range(PHYS_PFN(clr->addr), 1 << order);
+ kfree(clr);
+ vma->vm_private_data = NULL;
+}
+
+/* mremap operation is not supported for HiSilicon SoC cache. */
+static int hisi_l3c_vm_mremap(struct vm_area_struct *vma)
+{
+ struct hisi_l3c_lock_region *clr = vma->vm_private_data;
+
+ /*
+ * vma region size will be changed as requested by mremap despite the
+ * callback failure in this function. Thus, change the vma region
+ * stored in clr according to the parameters to verify if the pages
+ * should be freed when unmapping.
+ */
+ clr->vm_end = clr->vm_start + (vma->vm_end - vma->vm_start);
+ pr_err("mremap for HiSilicon SoC locked cache is not supported\n");
+
+ return -EOPNOTSUPP;
+}
+
+static int hisi_l3c_may_split(struct vm_area_struct *area, unsigned long addr)
+{
+ pr_err("HiSilicon SoC locked cache may not be split.\n");
+ return -EINVAL;
+}
+
+static const struct vm_operations_struct hisi_l3c_vm_ops = {
+ .open = hisi_l3c_vm_open,
+ .close = hisi_l3c_vm_close,
+ .may_split = hisi_l3c_may_split,
+ .mremap = hisi_l3c_vm_mremap,
+};
+
+static int hisi_l3c_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+ int order = get_order(size);
+ unsigned long addr;
+ struct page *pg;
+ int ret;
+
+ struct hisi_l3c_lock_region *clr __free(kfree) = kzalloc(sizeof(*clr), GFP_KERNEL);
+ if (!clr)
+ return -ENOMEM;
+
+ /* Continuous physical memory is required for L3 cache lock. */
+ pg = alloc_contig_pages(1 << order, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+ cpu_to_node(smp_processor_id()), NULL);
+ if (!pg)
+ return -ENOMEM;
+
+ addr = page_to_phys(pg);
+ *clr = (struct hisi_l3c_lock_region) {
+ .addr = addr,
+ .size = size,
+ .cpu = smp_processor_id(),
+ /* vma should not be moved, store here for validation */
+ .vm_start = vma->vm_start,
+ .vm_end = vma->vm_end,
+ };
+
+ vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND);
+ vma->vm_ops = &hisi_l3c_vm_ops;
+ vma->vm_private_data = clr;
+
+ hisi_l3c_vm_ops.open(vma);
+ if (clr->status) {
+ ret = clr->status;
+ goto out_page;
+ }
+
+ ret = remap_pfn_range(vma, vma->vm_start, PFN_DOWN(addr), size,
+ vma->vm_page_prot);
+ if (ret)
+ goto out_page;
+
+ /* Save clr from being freed when lock succeeds. */
+ vma->vm_private_data = no_free_ptr(clr);
+
+ return 0;
+
+out_page:
+ free_contig_range(PHYS_PFN(clr->addr), 1 << order);
+ return ret;
+}
+
+static int hisi_l3c_lock_restriction(unsigned long arg)
+{
+ void __user *uarg = (void __user *)arg;
+ int cpu = smp_processor_id();
+ struct hisi_soc_comp *comp;
+
+ if (list_empty(&l3c_devs.node))
+ return -ENODEV;
+
+ list_for_each_entry(comp, &l3c_devs.node, node) {
+ if (!cpumask_test_cpu(cpu, &comp->affinity_mask))
+ continue;
+
+ if (!comp->private)
+ return -ENOENT;
+
+ if (copy_to_user(uarg, comp->private, sizeof(struct hisi_l3c_lock_info)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ return -ENODEV;
+}
+
+static long hisi_l3c_ioctl(struct file *file, u32 cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case HISI_L3C_LOCK_INFO:
+ return hisi_l3c_lock_restriction(arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static const struct file_operations l3c_dev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = hisi_l3c_ioctl,
+ .mmap = hisi_l3c_mmap,
+};
+
+static struct miscdevice l3c_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "hisi_l3c",
+ .fops = &l3c_dev_fops,
+ .mode = 0600,
+};
+
+static int __init hisi_l3c_init(void)
+{
+ spin_lock_init(&l3c_devs.lock);
+ INIT_LIST_HEAD(&l3c_devs.node);
+
+ return misc_register(&l3c_miscdev);
+}
+module_init(hisi_l3c_init);
+
+static void __exit hisi_l3c_exit(void)
+{
+ misc_deregister(&l3c_miscdev);
+ hisi_soc_comp_del(NULL);
+}
+module_exit(hisi_l3c_exit);
+
+MODULE_DESCRIPTION("Hisilicon L3 Cache Driver");
+MODULE_AUTHOR("Yushan Wang <wangyushan12@huawei.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/misc/hisi_l3c.h b/include/uapi/misc/hisi_l3c.h
new file mode 100644
index 000000000000..6555be18aa1c
--- /dev/null
+++ b/include/uapi/misc/hisi_l3c.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later WITH Linux-syscall-note */
+/* Copyright (c) 2024 HiSilicon Technologies Co., Ltd. */
+#ifndef _UAPI_HISI_SOC_L3C_H
+#define _UAPI_HISI_SOC_L3C_H
+
+#include <linux/types.h>
+
+/* HISI_L3C_INFO: cache lock info for HiSilicon SoC */
+#define HISI_L3C_LOCK_INFO _IOW(0xBB, 1, unsigned long)
+
+/**
+ * struct hisi_l3c_info - User data for hisi cache operates.
+ * @lock_region_num: available locked memory region on a L3C instance
+ * @lock_size: available size to be locked of the L3C instance.
+ * @address_alignment: if the L3C lock requires locked region physical start
+ * address to be aligned with the memory region size.
+ * @max_lock_size: maximum locked memory size on a L3C instance.
+ * @min_lock_size: minimum locked memory size on a L3C instance.
+ */
+struct hisi_l3c_lock_info {
+ __u32 lock_region_num;
+ __u64 lock_size;
+ __u8 address_alignment;
+ __u64 max_lock_size;
+ __u64 min_lock_size;
+};
+
+#endif
--
2.33.0
Hi Yushan,
kernel test robot noticed the following build errors:
[auto build test ERROR on linus/master]
[also build test ERROR on v6.19-rc8]
[cannot apply to soc/for-next next-20260203]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yushan-Wang/soc-cache-L3-cache-driver-for-HiSilicon-SoC/20260204-004656
base: linus/master
patch link: https://lore.kernel.org/r/20260203161843.649417-2-wangyushan12%40huawei.com
patch subject: [PATCH 1/3] soc cache: L3 cache driver for HiSilicon SoC
config: loongarch-randconfig-r131-20260204 (https://download.01.org/0day-ci/archive/20260204/202602041006.7Hb46Sl8-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9b8addffa70cee5b2acc5454712d9cf78ce45710)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260204/202602041006.7Hb46Sl8-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602041006.7Hb46Sl8-lkp@intel.com/
All errors (new ones prefixed by >>):
>> drivers/soc/hisilicon/hisi_soc_l3c.c:251:7: error: call to undeclared function 'alloc_contig_pages'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
251 | pg = alloc_contig_pages(1 << order, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
| ^
>> drivers/soc/hisilicon/hisi_soc_l3c.c:251:5: error: incompatible integer to pointer conversion assigning to 'struct page *' from 'int' [-Wint-conversion]
251 | pg = alloc_contig_pages(1 << order, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
252 | cpu_to_node(smp_processor_id()), NULL);
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 errors generated.
vim +/alloc_contig_pages +251 drivers/soc/hisilicon/hisi_soc_l3c.c
237
238 static int hisi_l3c_mmap(struct file *file, struct vm_area_struct *vma)
239 {
240 unsigned long size = vma->vm_end - vma->vm_start;
241 int order = get_order(size);
242 unsigned long addr;
243 struct page *pg;
244 int ret;
245
246 struct hisi_l3c_lock_region *clr __free(kfree) = kzalloc(sizeof(*clr), GFP_KERNEL);
247 if (!clr)
248 return -ENOMEM;
249
250 /* Continuous physical memory is required for L3 cache lock. */
> 251 pg = alloc_contig_pages(1 << order, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
252 cpu_to_node(smp_processor_id()), NULL);
253 if (!pg)
254 return -ENOMEM;
255
256 addr = page_to_phys(pg);
257 *clr = (struct hisi_l3c_lock_region) {
258 .addr = addr,
259 .size = size,
260 .cpu = smp_processor_id(),
261 /* vma should not be moved, store here for validation */
262 .vm_start = vma->vm_start,
263 .vm_end = vma->vm_end,
264 };
265
266 vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND);
267 vma->vm_ops = &hisi_l3c_vm_ops;
268 vma->vm_private_data = clr;
269
270 hisi_l3c_vm_ops.open(vma);
271 if (clr->status) {
272 ret = clr->status;
273 goto out_page;
274 }
275
276 ret = remap_pfn_range(vma, vma->vm_start, PFN_DOWN(addr), size,
277 vma->vm_page_prot);
278 if (ret)
279 goto out_page;
280
281 /* Save clr from being freed when lock succeeds. */
282 vma->vm_private_data = no_free_ptr(clr);
283
284 return 0;
285
286 out_page:
287 free_contig_range(PHYS_PFN(clr->addr), 1 << order);
288 return ret;
289 }
290
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Yushan, thanks for your patch! On Tue, Feb 3, 2026 at 5:18 PM Yushan Wang <wangyushan12@huawei.com> wrote: > > The driver will create a file of `/dev/hisi_l3c` on init, mmap > operations to it will allocate a memory region that is guaranteed to be > placed in L3 cache. > > The driver also provides unmap() to deallocated the locked memory. > > The driver also provides an ioctl interface for user to get cache lock > information, such as lock restrictions and locked sizes. > > Signed-off-by: Yushan Wang <wangyushan12@huawei.com> The commit message does not say *why* you are doing this? > +config HISI_SOC_L3C > + bool "HiSilicon L3 Cache device driver" > + depends on ACPI > + depends on ARM64 || COMPILE_TEST > + help > + This driver provides the functions to lock L3 cache entries from > + being evicted for better performance. Here is the reason though. Things like this need to be CC to linux-mm@vger.kernel.org. I don't see why userspace would be so well informed as to make decisions about what should be locked in the L3 cache and not? I see the memory hierarchy as any other hardware: a resource that is allocated and arbitrated by the kernel. The MM subsytem knows which memory is most cache hot. Especially when you use DAMON DAMOS, which has the sole purpose of executing actions like that. Here is a good YouTube. https://www.youtube.com/watch?v=xKJO4kLTHOI Shouldn't the MM subsystem be in charge of determining, locking down and freeing up hot regions in L3 cache? This looks more like userspace is going to determine that but how exactly? By running DAMON? Then it's better to keep the whole mechanism in the kernel where it belongs and let the MM subsystem adapt locked L3 cache to the usage patterns. Yours, Linus Walleij
On Wed, 4 Feb 2026 01:10:01 +0100 Linus Walleij <linusw@kernel.org> wrote: > Hi Yushan, > > thanks for your patch! > > On Tue, Feb 3, 2026 at 5:18 PM Yushan Wang <wangyushan12@huawei.com> wrote: > > > > The driver will create a file of `/dev/hisi_l3c` on init, mmap > > operations to it will allocate a memory region that is guaranteed to be > > placed in L3 cache. > > > > The driver also provides unmap() to deallocated the locked memory. > > > > The driver also provides an ioctl interface for user to get cache lock > > information, such as lock restrictions and locked sizes. > > > > Signed-off-by: Yushan Wang <wangyushan12@huawei.com> > > The commit message does not say *why* you are doing this? > > > +config HISI_SOC_L3C > > + bool "HiSilicon L3 Cache device driver" > > + depends on ACPI > > + depends on ARM64 || COMPILE_TEST > > + help > > + This driver provides the functions to lock L3 cache entries from > > + being evicted for better performance. > > Here is the reason though. > > Things like this need to be CC to linux-mm@vger.kernel.org. > > I don't see why userspace would be so well informed as to make decisions > about what should be locked in the L3 cache and not? > > I see the memory hierarchy as any other hardware: a resource that is > allocated and arbitrated by the kernel. > > The MM subsytem knows which memory is most cache hot. > Especially when you use DAMON DAMOS, which has the sole > purpose of executing actions like that. Here is a good YouTube. > https://www.youtube.com/watch?v=xKJO4kLTHOI Hi Linus, This typically isn't about cache hot. It it were, the data would be in the cache without this. It's about ensuring something that would otherwise unlikely to be there is in the cache. Normally that's a latency critical region. In general the kernel has no chance of figuring out what those are ahead of time, only userspace can know (based on profiling etc) that is per workload. The first hit matters in these use cases and it's not something the prefetchers can help with. The only thing we could do if this was in kernel would be to have userspace pass some hints and then let the kernel actually kick off the process. That just boils down to using a different interface to do what this driver is doing (and that's the conversaion this series is trying to get going) It's a finite resource and you absolutely need userspace to be able to tell if it got what it asked for or not. Damon might be useful for that preanalysis though but it can't do anything for the infrequent extremely latency sensitive accesses. Normally this is fleet wide stuff based on intensive benchmarking of a few nodes. Same sort of approach as the original warehouse scale computing paper on tuning zswap capacity across a fleet. Its an extreme form of profile guided optimization (and not currently automatic I think?). If we are putting code in this locked region, the program has been carefully recompiled / linked to group the critical parts so that we can use the minimum number of these locked regions. Data is a little simpler. It's kind of similar to resctl but at a sub process granularity. > > Shouldn't the MM subsystem be in charge of determining, locking > down and freeing up hot regions in L3 cache? > > This looks more like userspace is going to determine that but > how exactly? By running DAMON? Then it's better to keep the > whole mechanism in the kernel where it belongs and let the > MM subsystem adapt locked L3 cache to the usage patterns. 
I haven't yet come up with any plausible scheme by which the MM subsystem could do this. I think what we need here Yushan, is more detail on end to end use cases for this. Some examples etc as clearer motivation. Jonathan > > Yours, > Linus Walleij >
On 2/4/2026 9:40 PM, Jonathan Cameron wrote: > On Wed, 4 Feb 2026 01:10:01 +0100 > Linus Walleij <linusw@kernel.org> wrote: > >> Shouldn't the MM subsystem be in charge of determining, locking >> down and freeing up hot regions in L3 cache? >> >> This looks more like userspace is going to determine that but >> how exactly? By running DAMON? Then it's better to keep the >> whole mechanism in the kernel where it belongs and let the >> MM subsystem adapt locked L3 cache to the usage patterns. > I haven't yet come up with any plausible scheme by which the MM > subsystem could do this. > > I think what we need here Yushan, is more detail on end to end > use cases for this. Some examples etc as clearer motivation. > Hi, Let me try to explain the use case here. The idea is similar to this article: https://www.cl.cam.ac.uk/~rnw24/papers/201708-sigcomm-diskcryptnet.pdf Suppose we have data on SSD that need to be transferred through network. We have technologies like DDIO and IO stash to make data flow through L3 cache instead of DDR to avoid the influence of DDR bandwidth. But if something is to be done to the data instead of merely copying, and cores needs to participate, we'd like to make data to climb a bit higher up through the memory hierarchy and stay there before data processing is done. That is, correct amount of data being fetched to L3 cache, and consumed just in time, then free L3 for next batch. It is more of a userspace defined pipeline that utilizes capability provided by kernel, where cache locks are allocated and freed quickly with batches. In above use case, C2C latency is chosen to avoid DDR latency, precisely which L3 cache to store the data is not required. (For this part maybe including steering tag as the hint to choose the correct L3 is a smarter way, like AMD SDCIAE). Memory management is, in many way, independent to architecture and vendors, we might not want to take hardware specific feature into account when kernel makes decisions of, say, swapping a page or not, but we can control the hardware resource to lean more on a process, like resctl. Thanks, Yushan
On Fri, Feb 6, 2026 at 11:08 AM wangyushan <wangyushan12@huawei.com> wrote: > Suppose we have data on SSD that need to be transferred through network. > We have technologies like DDIO and IO stash to make data flow through > L3 cache instead of DDR to avoid the influence of DDR bandwidth. [https://www.cl.cam.ac.uk/~rnw24/papers/201708-sigcomm-diskcryptnet.pdf] So as to decode, encrypt or run some AI training/inference stuff on the data, I get it. The paper immediately gives at hand a use case the Linux kernel (not userspace) could use: lock down the code and constants used by in-kernel cipher algorithms to reduce latency on encrypted disk or networks. [Added in Ard and Herbert who may be interested] Which means that if this could actually be used for these "hard kernels" in Linux the proper way to abstract this is to give the kernel a generic interface to request L3 cacheline lockdown no matter if that is employed by the kernel or userspace. > But if something is to be done to the data instead of merely copying, > and cores needs to participate, When you say this, is it "CPU cores" or others cores such as DSPs or GPU/NPUs you are thinking of, or any kind of data processing core (all of them)? This surely need to be abstracted in such a way that either of these can use it, Arnd mentions dma-buf which is a way devices think about data that the CPU cores doesn't necessarily (but may) touch, and resctrl could very well integrate into that I think. What I think is important is that the modeling in the kernel is consistent and that l3 cache lockdown is something any part of the kernel needing it can request. Yours, Linus Walleij
On Fri, Feb 6, 2026, at 11:07, wangyushan wrote:
>
> Let me try to explain the use case here.
>
> The idea is similar to this article:
> https://www.cl.cam.ac.uk/~rnw24/papers/201708-sigcomm-diskcryptnet.pdf
>
> Suppose we have data on SSD that need to be transferred through network.
> We have technologies like DDIO and IO stash to make data flow through
> L3 cache instead of DDR to avoid the influence of DDR bandwidth.
>
> But if something is to be done to the data instead of merely copying,
> and cores needs to participate, we'd like to make data to climb a bit
> higher up through the memory hierarchy and stay there before data
> processing is done. That is, correct amount of data being fetched to
> L3 cache, and consumed just in time, then free L3 for next batch.
> It is more of a userspace defined pipeline that utilizes capability
> provided by kernel, where cache locks are allocated and freed quickly
> with batches.
>
> In above use case, C2C latency is chosen to avoid DDR latency, precisely
> which L3 cache to store the data is not required. (For this part maybe
> including steering tag as the hint to choose the correct L3 is a smarter
> way, like AMD SDCIAE).
>
> Memory management is, in many way, independent to architecture and
> vendors, we might not want to take hardware specific feature into
> account when kernel makes decisions of, say, swapping a page or not,
> but we can control the hardware resource to lean more on a process,
> like resctl.
Ah, so if the main purpose here is to access the memory from
devices, I wonder if this should be structured as a dma-buf
driver. This would still allow you to mmap() a character
device, but in addition allow passing the file descriptor
to driver interfaces that take a dmabuf instead of a user
memory pointer.
Arnd
Hi Jonathan, thanks for stepping in, I'm trying to be healthy sceptical here... What you and others need to do is to tell me if I'm being too critical. But right now it feels like I need some more senior MM developers to tell me to be a good boy and let this hack patch slip before I shut up ;) On Wed, Feb 4, 2026 at 2:40 PM Jonathan Cameron <jonathan.cameron@huawei.com> wrote: > > The MM subsytem knows which memory is most cache hot. > > Especially when you use DAMON DAMOS, which has the sole > > purpose of executing actions like that. Here is a good YouTube. > > https://www.youtube.com/watch?v=xKJO4kLTHOI > > This typically isn't about cache hot. It it were, the data would > be in the cache without this. It's about ensuring something that would > otherwise unlikely to be there is in the cache. OK I get it. > Normally that's a latency critical region. In general the kernel > has no chance of figuring out what those are ahead of time, only > userspace can know (based on profiling etc) that is per workload. (...) > The only thing we could do if this was in kernel would be to > have userspace pass some hints and then let the kernel actually > kick off the process. (...) > and you absolutely need userspace to be able to tell if it > got what it asked for or not. (...) > Its an extreme form of profile guided optimization (and not > currently automatic I think?). If we are putting code in this > locked region, the program has been carefully recompiled / linked > to group the critical parts so that we can use the minimum number > of these locked regions. Data is a little simpler. OK so the argument is "only userspace knows what cache lines are performance critical, and therefore this info must be passed from userspace". Do I understand correctly? What I'm worried about here is that "an extreme form of profile guided optimization" is a bit handwavy. I would accept if it is based on simulation or simply human know-how, such as if a developer puts signal-processing algorithm kernels there because they know it is going to be the hard kernel of the process. But does the developer know if that hard kernel is importantest taken into account all other processes running on the system, and what happens if several processes say they have such hard kernels? Who will arbitrate? That is usually the kernels job. > I haven't yet come up with any plausible scheme by which the MM > subsystem could do this. I find it kind of worrying if userspace knows which lines are most performance-critical but the kernel MM subsystem does not. That strongly inidicates that if only userspace knows that, then madvise() is the way to go. The MM might need and use this information for other reasons than just locking down lines in the L3 cache. In my mind: Userspace madvise -> Linux MM -> arch cache-line lockdown So the MM needs to take the decision that this indication from userspace is something that should result in asking the arch to lock down these cache lines, as well as re-evaluate it if new processes start sending the same madise() calls and we run out in lock-downable cache lines. L3 lock-downs is a finite resource after all, and it needs to be arbitrated. Just OTOMH, maybe if several processes ask for this simultaneously and we run out of lockdownable cache lines, who wins? First come first served? The process with the highest nice value or realtime priority? Etc. I.e. the kernel MM needs to arbitrate any cache lockdown. 
Bypassing the whole MM like this patch does is a hack designed for one single process that the user "knows" is "importantest" and will be the only process asking for cache lines to be locked down. And this isn't abstract and it does not scale. We can't do that. That's the kind of resource management we expect from the kernel. MM might want to use that information for other things. > I think what we need here Yushan, is more detail on end to end > use cases for this. Some examples etc as clearer motivation. I agree. Yours, Linus Walleij
On Thu, 5 Feb 2026 10:12:33 +0100 Linus Walleij <linusw@kernel.org> wrote: > Hi Jonathan, > > thanks for stepping in, I'm trying to be healthy sceptical here... > > What you and others need to do is to tell me if I'm being too > critical. But right now it feels like I need some more senior > MM developers to tell me to be a good boy and let this > hack patch slip before I shut up ;) It's good to have these discussions as it makes us actually explain what they want to do much more clearly! wangyushan and I have both been taking about this for too long so it's easy to miss that it's not been explained properly. Note I was absolutely expecting a non trivial discussion on how to do this and in particular how generic it should be. +CC a various resctl / mpam related people. > > On Wed, Feb 4, 2026 at 2:40 PM Jonathan Cameron > <jonathan.cameron@huawei.com> wrote: > > > > The MM subsytem knows which memory is most cache hot. > > > Especially when you use DAMON DAMOS, which has the sole > > > purpose of executing actions like that. Here is a good YouTube. > > > https://www.youtube.com/watch?v=xKJO4kLTHOI > > > > This typically isn't about cache hot. It it were, the data would > > be in the cache without this. It's about ensuring something that would > > otherwise unlikely to be there is in the cache. > > OK I get it. > > > Normally that's a latency critical region. In general the kernel > > has no chance of figuring out what those are ahead of time, only > > userspace can know (based on profiling etc) that is per workload. > (...) > > The only thing we could do if this was in kernel would be to > > have userspace pass some hints and then let the kernel actually > > kick off the process. > (...) > > and you absolutely need userspace to be able to tell if it > > got what it asked for or not. > (...) > > Its an extreme form of profile guided optimization (and not > > currently automatic I think?). If we are putting code in this > > locked region, the program has been carefully recompiled / linked > > to group the critical parts so that we can use the minimum number > > of these locked regions. Data is a little simpler. > > OK so the argument is "only userspace knows what cache lines > are performance critical, and therefore this info must be passed > from userspace". Do I understand correctly? Yes. > > What I'm worried about here is that "an extreme form of profile > guided optimization" is a bit handwavy. I would accept if it is > based on simulation or simply human know-how, such as > if a developer puts signal-processing algorithm kernels > there because they know it is going to be the hard kernel > of the process. Those methods are part of what I'd consider profile guided optimization. I wasn't meaning to only including the automatic methods. For all the ways to tune this, you get lots of data from simulation or real hardware and use that to understand what makes sense to lock in cache. The human involved is often going to guide those simulations - but follow that with a lot of testing and data gathering. One existing user I'm aware did a lot of work to identify exactly what they needed to pin. It's an appliance type situation where they know exactly what the workloads are on that server. I'm not sure how much more we can share on that customer use case / case study beyond this vague description, so will leave it to Yushan to maybe provide more info. 
> > But does the developer know if that hard kernel is importantest > taken into account all other processes running on the system, > and what happens if several processes say they have > such hard kernels? Who will arbitrate? That is usually the > kernels job. Take the closest example to this which is resctl (mpam on arm). This actually has a feature that smells a bit like this. Pseudo-cache locking. https://docs.kernel.org/filesystems/resctrl.html#cache-pseudo-locking My understanding is that the semantics of that don't align perfectly with what we have here. Yushan can you add more on why we didn't try to fit into that scheme? Other than the obvious bit that more general upstream support for the arch definitions of MPAM is a work in progress and fitting vendor specific features on top will be tricky for a while at least. The hardware here is also independent of the MPAM support. Resctl puts the control on resource allocation into the hands of userspace (in that case via cgroups etc as it's process level controls). The cache lockdown is a weird because you have go through a dance of creating a temporary setup, demand fetching the lines into cache and then rely on various operations not occuring that might push them out again. Resctl provides many footguns and is (I believe) used by administrators who are very careful in how they use it. Note that there are some guards in this new code to only allow locking a portion of the l3. We also rely somewhat on the uarch and cache design to ensure it is safe to do this type of locking (other than reducing perf of other tasks). I'm dancing around uarch details here that I would need to go seek agreement to share more on. > > > I haven't yet come up with any plausible scheme by which the MM > > subsystem could do this. > > I find it kind of worrying if userspace knows which lines are most > performance-critical but the kernel MM subsystem does not. > > That strongly inidicates that if only userspace knows that, then > madvise() is the way to go. The MM might need and use this > information for other reasons than just locking down lines in > the L3 cache. I agree that something like madvise() may well be more suitable. We do need paths to know how many regions are left etc though so it will need a few other bits of interface. I'm also not sure what appetite there will be for an madvise() for something that today we have no idea if anyone else actually has hardware for. If people do, then please shout and we can look at how something like this can be generalized. > > In my mind: > > Userspace madvise -> Linux MM -> arch cache-line lockdown > > So the MM needs to take the decision that this indication from > userspace is something that should result in asking the arch > to lock down these cache lines, as well as re-evaluate it if > new processes start sending the same madise() calls and we > run out in lock-downable cache lines. > > L3 lock-downs is a finite resource after all, and it needs to be > arbitrated. Just OTOMH, maybe if several processes ask for this > simultaneously and we run out of lockdownable cache lines, > who wins? First come first served? The process with the highest > nice value or realtime priority? Etc. My current thinking is first come first served with a path to clearly tell an application it didn't get what it wanted. Scheduling, priority etc being involved would all interfere with the strong guarantees lock down provides. 
That's kind of why we ended up with a device type model as it's common to have finite resources and just say no if they have run out (accelerator queues etc). It's up to the userspace code to know what to do if they can't get what they asked for. > > I.e. the kernel MM needs to arbitrate any cache lockdown. > > Bypassing the whole MM like this patch does is a hack designed > for one single process that the user "knows" is "importantest" > and will be the only process asking for cache lines to be locked > down. > > And this isn't abstract and it does not scale. We can't do that. > > That's the kind of resource management we expect from the > kernel. I'm with you in many ways on this, but there are other things for which we absolutely do allocate from a finite resource and don't let the kernel make decisions - typically because there is no right way to arbitrate. If we can invent a scheme for arbitration for this then great, right now I can't envision anything other than 1st come 1st served being appropriate. Maybe there are other use cases where other schemes work (e.g. if we ever figure out how to use this as a form of live optimization) > > MM might want to use that information for other things. Absolutely, though I'm doubtful about trying to design a generic way of conveying latency criticality without knowing more of those use cases from the start. Thanks, Jonathan > > > I think what we need here Yushan, is more detail on end to end > > use cases for this. Some examples etc as clearer motivation. > > I agree. > > Yours, > Linus Walleij >
On 2/5/2026 6:18 PM, Jonathan Cameron wrote: > On Thu, 5 Feb 2026 10:12:33 +0100 > Linus Walleij <linusw@kernel.org> wrote: > >> But does the developer know if that hard kernel is importantest >> taken into account all other processes running on the system, >> and what happens if several processes say they have >> such hard kernels? Who will arbitrate? That is usually the >> kernels job. > > Take the closest example to this which is resctl (mpam on arm). > This actually has a feature that smells a bit like this. > Pseudo-cache locking. > > https://docs.kernel.org/filesystems/resctrl.html#cache-pseudo-locking > > My understanding is that the semantics of that don't align perfectly > with what we have here. Yushan can you add more on why we didn't > try to fit into that scheme? Other than the obvious bit that more > general upstream support for the arch definitions of MPAM is a work in > progress and fitting vendor specific features on top will be tricky > for a while at least. The hardware here is also independent of the > MPAM support. Intel cache pseudo lock requires help of IA32_PQR_ASSOC MSR, according to [1], that register can save necessary information for processes acquired cache pseudo locks, but Arm64 does not have the equivalent register. [1]: https://www.intel.com/content/www/us/en/developer/articles/technical/cache-allocation-technology-usage-models.html > > Resctl puts the control on resource allocation into the hands of > userspace (in that case via cgroups etc as it's process level controls). > The cache lockdown is a weird because you have go through a dance of > creating a temporary setup, demand fetching the lines into cache and > then rely on various operations not occuring that might push them out > again. > > Resctl provides many footguns and is (I believe) used by administrators > who are very careful in how they use it. Note that there are some guards > in this new code to only allow locking a portion of the l3. We also rely > somewhat on the uarch and cache design to ensure it is safe to do this > type of locking (other than reducing perf of other tasks). > I'm dancing around uarch details here that I would need to go seek > agreement to share more on. > >> >>> I haven't yet come up with any plausible scheme by which the MM >>> subsystem could do this. >> >> I find it kind of worrying if userspace knows which lines are most >> performance-critical but the kernel MM subsystem does not. >> >> That strongly inidicates that if only userspace knows that, then >> madvise() is the way to go. The MM might need and use this >> information for other reasons than just locking down lines in >> the L3 cache. > > I agree that something like madvise() may well be more suitable. > We do need paths to know how many regions are left etc though so > it will need a few other bits of interface. > > I'm also not sure what appetite there will be for an madvise() > for something that today we have no idea if anyone else actually > has hardware for. If people do, then please shout and we can > look at how something like this can be generalized. Currently madvise() "only operates on whole pages", maybe madvise() will not be happy with the semantic change of page / cacheline. Cache size available for lock may be far less than the size madvise() can handle. Though madvise() can always speculatively call cache lock once appropriate and get back to original track if refused, but that's a hack that need more deep discussion. 
I think resctl is more suitable for this, as this serves the same purpose as MPAM etc, to save QoS of a task, and the way to achieve it, by tweaking hardware capability. Thanks, Yushan
Hi Yushan, On 2/6/26 09:54, wangyushan wrote: > > On 2/5/2026 6:18 PM, Jonathan Cameron wrote: >> On Thu, 5 Feb 2026 10:12:33 +0100 >> Linus Walleij <linusw@kernel.org> wrote: >> >>> But does the developer know if that hard kernel is importantest >>> taken into account all other processes running on the system, >>> and what happens if several processes say they have >>> such hard kernels? Who will arbitrate? That is usually the >>> kernels job. >> >> Take the closest example to this which is resctl (mpam on arm). >> This actually has a feature that smells a bit like this. >> Pseudo-cache locking. >> >> https://docs.kernel.org/filesystems/resctrl.html#cache-pseudo-locking >> >> My understanding is that the semantics of that don't align perfectly >> with what we have here. Yushan can you add more on why we didn't >> try to fit into that scheme? Other than the obvious bit that more >> general upstream support for the arch definitions of MPAM is a work in >> progress and fitting vendor specific features on top will be tricky >> for a while at least. The hardware here is also independent of the >> MPAM support. > > Intel cache pseudo lock requires help of IA32_PQR_ASSOC MSR, according > to [1], that register can save necessary information for processes acquired > cache pseudo locks, but Arm64 does not have the equivalent register. If you have MPAM, the per exception level MPAMx_ELy registers are somewhat equivalent. They tell you which partid and pmg identifiers the CPU is using and IA32_PQR_ASSOC tells you the closid and rmid which are much the same thing. Is there a difference that stops being equivalent in this scenario? > > [1]: https://www.intel.com/content/www/us/en/developer/articles/technical/cache-allocation-technology-usage-models.html > [...] Thanks, Ben
On 2/5/26 10:18, Jonathan Cameron wrote: > On Thu, 5 Feb 2026 10:12:33 +0100 > Linus Walleij <linusw@kernel.org> wrote: > >> Hi Jonathan, >> >> thanks for stepping in, I'm trying to be healthy sceptical here... >> >> What you and others need to do is to tell me if I'm being too >> critical. But right now it feels like I need some more senior >> MM developers to tell me to be a good boy and let this >> hack patch slip before I shut up ;) > > It's good to have these discussions as it makes us actually > explain what they want to do much more clearly! > wangyushan and I have both been taking about this for too long so > it's easy to miss that it's not been explained properly. > > Note I was absolutely expecting a non trivial discussion on how to do > this and in particular how generic it should be. > > +CC a various resctl / mpam related people. [...] > >> >> But does the developer know if that hard kernel is importantest >> taken into account all other processes running on the system, >> and what happens if several processes say they have >> such hard kernels? Who will arbitrate? That is usually the >> kernels job. > > Take the closest example to this which is resctl (mpam on arm). > This actually has a feature that smells a bit like this. > Pseudo-cache locking. > > https://docs.kernel.org/filesystems/resctrl.html#cache-pseudo-locking > > My understanding is that the semantics of that don't align perfectly > with what we have here. Yushan can you add more on why we didn't > try to fit into that scheme? Other than the obvious bit that more > general upstream support for the arch definitions of MPAM is a work in > progress and fitting vendor specific features on top will be tricky > for a while at least. The hardware here is also independent of the > MPAM support. > > Resctl puts the control on resource allocation into the hands of > userspace (in that case via cgroups etc as it's process level controls). > The cache lockdown is a weird because you have go through a dance of > creating a temporary setup, demand fetching the lines into cache and > then rely on various operations not occuring that might push them out > again. > > Resctl provides many footguns and is (I believe) used by administrators > who are very careful in how they use it. Note that there are some guards > in this new code to only allow locking a portion of the l3. We also rely > somewhat on the uarch and cache design to ensure it is safe to do this > type of locking (other than reducing perf of other tasks). > I'm dancing around uarch details here that I would need to go seek > agreement to share more on. > Just wondering about the compatiblity of cache lockdown and resctrl/mpam. If this is done outside resctrl then how would this interact with the cache portion bitmaps used in resctrl/mpam? For instance, how would a user know whether or not a resctrl/mpam cache portion is unusable because it has been locked? Thanks, Ben
On Thu, Feb 5, 2026 at 11:18 AM Jonathan Cameron <jonathan.cameron@huawei.com> wrote: > Take the closest example to this which is resctl (mpam on arm). > This actually has a feature that smells a bit like this. > Pseudo-cache locking. > > https://docs.kernel.org/filesystems/resctrl.html#cache-pseudo-locking That was very interesting. And more than a little bit complex. IIUC MPAM is mostly about requesting bandwidth to/from the memory. But maybe cache lockdown can build on top? > I'm also not sure what appetite there will be for an madvise() > for something that today we have no idea if anyone else actually > has hardware for. If people do, then please shout and we can > look at how something like this can be generalized. Cache lockdown is an ages old concept, I think others can do it too, you are just the first to try to support it upstream. Personally I'm all for this, as long as we can come up with something generic for others to use as well. No custom device + ioctl stuff. There are adjacent stuff that vendors are doing is about prefetch and which I mentioned briefly: Fujitus prefetch: https://lore.kernel.org/linux-arm-kernel/20220607120530.2447112-1-tarumizu.kohei@fujitsu.com/ AmpereOne prefetch: https://lore.kernel.org/linux-arm-kernel/20231122092855.4440-1-shijie@os.amperecomputing.com/ Maybe that is more related to MPAM actually. What it has in common with cache lockdown is "significatly indicate memore areas of special interest". But notice Will Deacons reply: https://lore.kernel.org/linux-arm-kernel/ZV3omRGtVS9l-tKk@FVFF77S0Q05N/ "We tend to shy away from micro-architecture specific optimisations in the arm64 kernel as they're pretty unmaintainable, hard to test properly, generally lead to bloat and add additional obstacles to updating our library routines." > My current thinking is first come first served with a path to > clearly tell an application it didn't get what it wanted. > Scheduling, priority etc being involved would all interfere > with the strong guarantees lock down provides. That sounds more like mdemand() than madvise() doesn't it ;) But surely an all-or-nothing ABI can be specified, and maybe a please-if-you-can ABI as well. > > MM might want to use that information for other things. > > Absolutely, though I'm doubtful about trying to design a generic > way of conveying latency criticality without knowing more of those > use cases from the start. Well, abstracting is about boiling the world down to a few facts that can be used for making general decisions. But for one I suppose if someone locks down some cache lines in L3 and then not actually use them much at long intervals because of misc, I suppose it's not very nice if the kernel decide to swap out the page with these cache lines in it, because that would have adverse impact on the performace once it hits for example? Or did someone think about that already? Is that avoided in the current patch set? (Maybe a stupid question...) Likewise I see that this code is keeping track of which CPU the l3 cache line were locked from, but I don't see anything in this code blocking task migration for whoever called this ABI or am I wrong? What happens if the scheduler moves the process to another CPU? Or is it implicit that this is nailed to the current CPU? Then surely that need to be enforced? I just get the overall feeling that this was just tested on a scenario such as: 1. Boot 2. Run a process calling this code, hey it works 3. Terminate process No sleeping and swapping under memory pressure etc happing. 
Designing for the generic case and in a central part of the kernel (inside MM not in drivers/soc...) would avoid such snags I think. Yours, Linus Walleij
Fixed linux-mm address that got added a few emails back. On Wed, 4 Feb 2026 13:40:20 +0000 Jonathan Cameron <jonathan.cameron@huawei.com> wrote: > On Wed, 4 Feb 2026 01:10:01 +0100 > Linus Walleij <linusw@kernel.org> wrote: > > > Hi Yushan, > > > > thanks for your patch! > > > > On Tue, Feb 3, 2026 at 5:18 PM Yushan Wang <wangyushan12@huawei.com> wrote: > > > > > > The driver will create a file of `/dev/hisi_l3c` on init, mmap > > > operations to it will allocate a memory region that is guaranteed to be > > > placed in L3 cache. > > > > > > The driver also provides unmap() to deallocated the locked memory. > > > > > > The driver also provides an ioctl interface for user to get cache lock > > > information, such as lock restrictions and locked sizes. > > > > > > Signed-off-by: Yushan Wang <wangyushan12@huawei.com> > > > > The commit message does not say *why* you are doing this? > > > > > +config HISI_SOC_L3C > > > + bool "HiSilicon L3 Cache device driver" > > > + depends on ACPI > > > + depends on ARM64 || COMPILE_TEST > > > + help > > > + This driver provides the functions to lock L3 cache entries from > > > + being evicted for better performance. > > > > Here is the reason though. > > > > Things like this need to be CC to linux-mm@vger.kernel.org. > > > > I don't see why userspace would be so well informed as to make decisions > > about what should be locked in the L3 cache and not? > > > > I see the memory hierarchy as any other hardware: a resource that is > > allocated and arbitrated by the kernel. > > > > The MM subsytem knows which memory is most cache hot. > > Especially when you use DAMON DAMOS, which has the sole > > purpose of executing actions like that. Here is a good YouTube. > > https://www.youtube.com/watch?v=xKJO4kLTHOI > Hi Linus, > > This typically isn't about cache hot. It it were, the data would > be in the cache without this. It's about ensuring something that would > otherwise unlikely to be there is in the cache. > > Normally that's a latency critical region. In general the kernel > has no chance of figuring out what those are ahead of time, only > userspace can know (based on profiling etc) that is per workload. > The first hit matters in these use cases and it's not something > the prefetchers can help with. > > The only thing we could do if this was in kernel would be to > have userspace pass some hints and then let the kernel actually > kick off the process. That just boils down to using a different > interface to do what this driver is doing (and that's the conversaion > this series is trying to get going) It's a finite resource > and you absolutely need userspace to be able to tell if it > got what it asked for or not. > > Damon might be useful for that preanalysis though but it can't do > anything for the infrequent extremely latency sensitive accesses. > Normally this is fleet wide stuff based on intensive benchmarking > of a few nodes. Same sort of approach as the original warehouse > scale computing paper on tuning zswap capacity across a fleet. > Its an extreme form of profile guided optimization (and not > currently automatic I think?). If we are putting code in this > locked region, the program has been carefully recompiled / linked > to group the critical parts so that we can use the minimum number > of these locked regions. Data is a little simpler. > > It's kind of similar to resctl but at a sub process granularity. > > > > > Shouldn't the MM subsystem be in charge of determining, locking > > down and freeing up hot regions in L3 cache? 
> > > > This looks more like userspace is going to determine that but > > how exactly? By running DAMON? Then it's better to keep the > > whole mechanism in the kernel where it belongs and let the > > MM subsystem adapt locked L3 cache to the usage patterns. > > I haven't yet come up with any plausible scheme by which the MM > subsystem could do this. > > I think what we need here Yushan, is more detail on end to end > use cases for this. Some examples etc as clearer motivation. > > Jonathan > > > > > Yours, > > Linus Walleij > > >
On Wed, 4 Feb 2026 13:44:47 +0000
Jonathan Cameron <jonathan.cameron@huawei.com> wrote:

> Fixed the linux-mm address that got added a few emails back.
>
> On Wed, 4 Feb 2026 13:40:20 +0000
> Jonathan Cameron <jonathan.cameron@huawei.com> wrote:
>
> > On Wed, 4 Feb 2026 01:10:01 +0100
> > Linus Walleij <linusw@kernel.org> wrote:
> >
> > > Hi Yushan,
> > >
> > > thanks for your patch!
> > >
> > > On Tue, Feb 3, 2026 at 5:18 PM Yushan Wang <wangyushan12@huawei.com> wrote:
> > > >
> > > > The driver will create a file of `/dev/hisi_l3c` on init, mmap
> > > > operations to it will allocate a memory region that is guaranteed to be
> > > > placed in L3 cache.
> > > >
> > > > The driver also provides unmap() to deallocated the locked memory.
> > > >
> > > > The driver also provides an ioctl interface for user to get cache lock
> > > > information, such as lock restrictions and locked sizes.
> > > >
> > > > Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
> > >
> > > The commit message does not say *why* you are doing this?
> > >
> > > > +config HISI_SOC_L3C
> > > > +	bool "HiSilicon L3 Cache device driver"
> > > > +	depends on ACPI
> > > > +	depends on ARM64 || COMPILE_TEST
> > > > +	help
> > > > +	  This driver provides the functions to lock L3 cache entries from
> > > > +	  being evicted for better performance.
> > >
> > > Here is the reason though.
> > >
> > > Things like this need to be CC to linux-mm@vger.kernel.org.
> > >
> > > I don't see why userspace would be so well informed as to make decisions
> > > about what should be locked in the L3 cache and not?
> > >
> > > I see the memory hierarchy as any other hardware: a resource that is
> > > allocated and arbitrated by the kernel.
> > >
> > > The MM subsytem knows which memory is most cache hot.
> > > Especially when you use DAMON DAMOS, which has the sole
> > > purpose of executing actions like that. Here is a good YouTube.
> > > https://www.youtube.com/watch?v=xKJO4kLTHOI

Thank you for Cc-ing me, Linus.

> > Hi Linus,
> >
> > This typically isn't about cache hot. If it were, the data would
> > be in the cache without this. It's about ensuring something that would
> > otherwise be unlikely to be there is in the cache.
> >
> > Normally that's a latency-critical region. In general the kernel
> > has no chance of figuring out what those are ahead of time; only
> > userspace can know (based on profiling etc.), and that is per workload.
> > The first hit matters in these use cases and it's not something
> > the prefetchers can help with.
> >
> > The only thing we could do if this was in kernel would be to
> > have userspace pass some hints and then let the kernel actually
> > kick off the process. That just boils down to using a different
> > interface to do what this driver is doing (and that's the conversation
> > this series is trying to get going). It's a finite resource
> > and you absolutely need userspace to be able to tell if it
> > got what it asked for or not.

And thank you for clarifying, Jonathan.

> > DAMON might be useful for that preanalysis, though, but it can't do
> > anything for the infrequent, extremely latency-sensitive accesses.

I also have no good idea for how DAMON could help in this scenario.

If I have to offer a brainstorming idea off the top of my humble head,
though: maybe we can ask DAMON to monitor address ranges that are assumed
to hold the latency-sensitive data, and further ask DAMOS to find
sub-regions of the area that are getting colder than desired, and make
accesses to the cache lines of those sub-regions so that they stay in the
cache for "most cases".

It is just a brainstorming idea off the top of my head and probably won't
work for your case, since...

It won't work if there is no good way to know or guarantee the address
ranges of the latency-sensitive data.

It won't work for the extremely latency-sensitive case, as DAMON is just
best effort.

It won't work with DAMON of today, because DAMOS doesn't support that kind
of cache-granularity access-generation action.

So, it doesn't sound like a good idea. Nonetheless, if you have any
questions about DAMON in the future, please feel free to reach out :)

Thanks,
SJ

[...]
On 2/4/2026 8:10 AM, Linus Walleij wrote:
> Hi Yushan,
>
> thanks for your patch!

Thanks for the review!

> On Tue, Feb 3, 2026 at 5:18 PM Yushan Wang <wangyushan12@huawei.com> wrote:
>> The driver will create a file of `/dev/hisi_l3c` on init, mmap
>> operations to it will allocate a memory region that is guaranteed to be
>> placed in L3 cache.
>>
>> The driver also provides unmap() to deallocated the locked memory.
>>
>> The driver also provides an ioctl interface for user to get cache lock
>> information, such as lock restrictions and locked sizes.
>>
>> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
> The commit message does not say *why* you are doing this?
>> +config HISI_SOC_L3C
>> +	bool "HiSilicon L3 Cache device driver"
>> +	depends on ACPI
>> +	depends on ARM64 || COMPILE_TEST
>> +	help
>> +	  This driver provides the functions to lock L3 cache entries from
>> +	  being evicted for better performance.
> Here is the reason though.

Sorry, I will include this in the commit message.

> Things like this need to be CC to linux-mm@vger.kernel.org.
>
> I don't see why userspace would be so well informed as to make decisions
> about what should be locked in the L3 cache and not?

This question is really: should it be the kernel or the user-space
application that decides whether a cache lock should be applied?

Maybe the ideal situation is that this capability is kept in kernel space
as a vendor-specific optimization option. Lacking knowledge of memory
interleaving etc., the best move for an application might be to allocate
as much locked cache as possible.

> I see the memory hierarchy as any other hardware: a resource that is
> allocated and arbitrated by the kernel.
>
> The MM subsytem knows which memory is most cache hot.
> Especially when you use DAMON DAMOS, which has the sole
> purpose of executing actions like that. Here is a good YouTube.
> https://www.youtube.com/watch?v=xKJO4kLTHOI
>
> Shouldn't the MM subsystem be in charge of determining, locking
> down and freeing up hot regions in L3 cache?

Thanks for the link, I will see if there's any chance this can cooperate
with DAMON. Gaps still exist here because DAMON operates on pages while
the cache works with cachelines, though the cache lock here does support
lock sizes larger than a page.

> This looks more like userspace is going to determine that but
> how exactly? By running DAMON? Then it's better to keep the
> whole mechanism in the kernel where it belongs and let the
> MM subsystem adapt locked L3 cache to the usage patterns.

Currently the patchset simply trusts that the user knows what they are
doing, which might not be good enough. I will try to see if this could
work with DAMON or maybe madvise() :)

> Yours,
> Linus Walleij

Thanks,
Yushan
On Wed, Feb 4, 2026 at 10:53 AM wangyushan <wangyushan12@huawei.com> wrote:

> > I don't see why userspace would be so well informed as to make decisions
> > about what should be locked in the L3 cache and not?
>
> This question is really: should it be the kernel or the user-space
> application that decides whether a cache lock should be applied?
>
> Maybe the ideal situation is that this capability is kept in kernel space
> as a vendor-specific optimization option. Lacking knowledge of memory
> interleaving etc., the best move for an application might be to allocate
> as much locked cache as possible.

If it is a vendor-specific optimization that has no generic applicability
outside of this specific system, dependent on a specific userspace that
only exists on this system, what is the value for the generic kernel to
carry and maintain this code? In that case maybe the code should be
maintained outside of the mainline kernel tree.

What we want to see as maintainers are things that are reusable across
several systems. Integrating this with DAMOS in a generic way is what will
help the next silicon that comes down the road. I have already seen
similar things from Fujitsu (IIRC).

We need this mechanism to be kernel-driven and generic, not custom and
system-specific, least of all driven from userspace by sysfs.

Yours,
Linus Walleij
On Tue, Feb 3, 2026, at 17:18, Yushan Wang wrote:
> The driver will create a file of `/dev/hisi_l3c` on init, mmap
> operations to it will allocate a memory region that is guaranteed to be
> placed in L3 cache.
>
> The driver also provides unmap() to deallocated the locked memory.
>
> The driver also provides an ioctl interface for user to get cache lock
> information, such as lock restrictions and locked sizes.
>
> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
Hi Yushan,
Thanks for your submission. Since we are in the last week of
the merge window, this is not going to be linux-7.0 material,
but I'll have a quick look for now.
> .../userspace-api/ioctl/ioctl-number.rst | 1 +
> MAINTAINERS | 6 +
> drivers/soc/hisilicon/Kconfig | 11 +
> drivers/soc/hisilicon/Makefile | 2 +
> drivers/soc/hisilicon/hisi_soc_l3c.c | 357 ++++++++++++++++++
> include/uapi/misc/hisi_l3c.h | 28 ++
I don't think this should be in drivers/soc/, since I want
to reserve that for internal drivers without a user visible
interface other than the soc_device information. (yes, there
are a few historic counterexamples)
I also don't think this should be a hisilicon-specific interface,
if possible. The functionality is not that unusual in the end.
We had similar concepts using the numactl system calls in the
past, but I don't think we should do that here because you may
need the numa interfaces for other purposes as well, and it
may be confusing to existing callers.
Having a generic madvise()-based interface would be great;
I am just not sure if hardware support is common enough for that.
> +	/* Continuous physical memory is required for L3 cache lock. */
> +	pg = alloc_contig_pages(1 << order, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
> +				cpu_to_node(smp_processor_id()), NULL);
Since this is a user allocation, should that be GFP_USER instead
of GFP_KERNEL?
> +/* HISI_L3C_INFO: cache lock info for HiSilicon SoC */
> +#define HISI_L3C_LOCK_INFO _IOW(0xBB, 1, unsigned long)
The specification here looks wrong, please see
Documentation/driver-api/ioctl.rst
I think for your implementation it should be
#define HISI_L3C_LOCK_INFO _IOR(0xBB, 1, hisi_l3c_lock_info)
> +struct hisi_l3c_lock_info {
> + __u32 lock_region_num;
> + __u64 lock_size;
> + __u8 address_alignment;
> + __u64 max_lock_size;
> + __u64 min_lock_size;
> +};
You are leaking kernel data because of the padding in this structure,
please rearrange the members to avoid padding.
It may be better to use a different interface instead of ioctl(),
possibly exporting global data in sysfs.
Arnd
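Putting Arnd's two points together, a hedged sketch of what the reworked uapi header could look like: members ordered from largest to smallest so the structure has no implicit padding, and the ioctl declared as a read of the struct. Field names are taken from the struct quoted above; the explicit reserved field and the exact ordering are assumptions, not the author's actual fix.

	/* Sketch of a padding-free layout for include/uapi/misc/hisi_l3c.h. */
	#include <linux/ioctl.h>
	#include <linux/types.h>

	struct hisi_l3c_lock_info {
		__u64 lock_size;		/* currently locked size */
		__u64 max_lock_size;		/* largest single lock allowed */
		__u64 min_lock_size;		/* smallest single lock allowed */
		__u32 lock_region_num;		/* number of lock regions */
		__u8  address_alignment;	/* required address alignment */
		__u8  reserved[3];		/* explicit padding, must be zero */
	};

	/* Userspace reads the info structure, so _IOR with the struct type. */
	#define HISI_L3C_LOCK_INFO	_IOR(0xBB, 1, struct hisi_l3c_lock_info)

Whether this stays an ioctl or moves to sysfs, as suggested later in the thread, is a separate question; the reordering only addresses the padding leak.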
On 03/02/2026 18:19, Arnd Bergmann wrote:
> On Tue, Feb 3, 2026, at 17:18, Yushan Wang wrote:
>> The driver will create a file of `/dev/hisi_l3c` on init, mmap
>> operations to it will allocate a memory region that is guaranteed to be
>> placed in L3 cache.
>>
>> The driver also provides unmap() to deallocated the locked memory.
>>
>> The driver also provides an ioctl interface for user to get cache lock
>> information, such as lock restrictions and locked sizes.
>>
>> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
>
> Hi Yushan,
>
> Thanks for your submission. Since we are in the last week of
> the merge window, this is not going to be linux-7.0 material,
> but I'll have a quick look for now.

To be clear - this is a v3 but with removed previous history...

Previous version:
https://lore.kernel.org/all/20251217102357.1730573-2-wangyushan12@huawei.com/

Or even v4?

https://lore.kernel.org/all/20250122065803.3363926-2-wangyushan12@huawei.com/

Yushan, please start versioning your patches correctly. Use b4 or git
format-patch -vx

Otherwise, please explain to us how we can compare it with `b4 diff`
against the previous version?

Sending something AGAIN as v1, ignoring the entire previous submission, is
a clear no-go. Like you are trying till it succeeds. Negative review?
Let's try from v1 this time...

This is not correct and it should not be my task to find your previous
discussions and decipher this v1.

Best regards,
Krzysztof
On 2/5/2026 5:37 PM, Krzysztof Kozlowski wrote:
> On 03/02/2026 18:19, Arnd Bergmann wrote:
>> On Tue, Feb 3, 2026, at 17:18, Yushan Wang wrote:
>>> The driver will create a file of `/dev/hisi_l3c` on init, mmap
>>> operations to it will allocate a memory region that is guaranteed to be
>>> placed in L3 cache.
>>>
>>> The driver also provides unmap() to deallocated the locked memory.
>>>
>>> The driver also provides an ioctl interface for user to get cache lock
>>> information, such as lock restrictions and locked sizes.
>>>
>>> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
>>
>> Hi Yushan,
>>
>> Thanks for your submission. Since we are in the last week of
>> the merge window, this is not going to be linux-7.0 material,
>> but I'll have a quick look for now.
>
>
> To be clear - this is a v3 but with removed previous history...
>
> Previous version:
> https://lore.kernel.org/all/20251217102357.1730573-2-wangyushan12@huawei.com/
>
> Or even v4?
>
> https://lore.kernel.org/all/20250122065803.3363926-2-wangyushan12@huawei.com/
>
> Yushan, please start versioning your patches correctly. Use b4 or git
> format-patch -vx

Hi Krzysztof,

Sorry about the confusing versions, the complete history is as below:

Link to v1: https://lore.kernel.org/all/20250107132907.3521574-1-wangyushan12@huawei.com

Link to v2: https://lore.kernel.org/all/20250122065803.3363926-1-wangyushan12@huawei.com/

Link to RFC v1: https://lore.kernel.org/all/20251125080542.3721829-1-wangyushan12@huawei.com/

Link to RFC v2: https://lore.kernel.org/all/20251217102357.1730573-1-wangyushan12@huawei.com/

Link to v1 again (this message): https://lore.kernel.org/all/20260203161843.649417-1-wangyushan12@huawei.com/

>
> Otherwise, please explain to us how we can compare it with `b4 diff`
> against the previous version?
>
> Sending something AGAIN as v1, ignoring the entire previous submission, is
> a clear no-go. Like you are trying till it succeeds. Negative review?
> Let's try from v1 this time...
>
> This is not correct and it should not be my task to find your previous
> discussions and decipher this v1.

I did spin two versions (the actual v1 and v2), but the threads were quiet.
Then I made a major refactor and sent it as an RFC; the thread went quiet
again, but some compile-check issues popped up. I spun two RFC versions for
the compile issues and dropped the RFC tag in this version since no strong
objection showed up.

I apologize for breaking the rules and for any inconvenience caused.
As there was little discussion on the previous patches, is it OK if we
start here as v1? Anyway, I will include the whole history in the cover
letter to prevent more confusion.

>
>
> Best regards,
> Krzysztof
>

Thanks,
Yushan
On 05/02/2026 12:19, wangyushan wrote:
>
> On 2/5/2026 5:37 PM, Krzysztof Kozlowski wrote:
>> On 03/02/2026 18:19, Arnd Bergmann wrote:
>>> On Tue, Feb 3, 2026, at 17:18, Yushan Wang wrote:
>>>> The driver will create a file of `/dev/hisi_l3c` on init, mmap
>>>> operations to it will allocate a memory region that is guaranteed to be
>>>> placed in L3 cache.
>>>>
>>>> The driver also provides unmap() to deallocated the locked memory.
>>>>
>>>> The driver also provides an ioctl interface for user to get cache lock
>>>> information, such as lock restrictions and locked sizes.
>>>>
>>>> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
>>>
>>> Hi Yushan,
>>>
>>> Thanks for your submission. Since we are in the last week of
>>> the merge window, this is not going to be linux-7.0 material,
>>> but I'll have a quick look for now.
>>
>>
>> To be clear - this is a v3 but with removed previous history...
>>
>> Previous version:
>> https://lore.kernel.org/all/20251217102357.1730573-2-wangyushan12@huawei.com/
>>
>> Or even v4?
>>
>> https://lore.kernel.org/all/20250122065803.3363926-2-wangyushan12@huawei.com/
>>
>> Yushan, please start versioning your patches correctly. Use b4 or git
>> format-patch -vx
>
> Hi Krzysztof,
>
> Sorry about the confusing versions, the complete history is as below:
>
> Link to v1: https://lore.kernel.org/all/20250107132907.3521574-1-wangyushan12@huawei.com
>
> Link to v2: https://lore.kernel.org/all/20250122065803.3363926-1-wangyushan12@huawei.com/
>
> Link to RFC v1: https://lore.kernel.org/all/20251125080542.3721829-1-wangyushan12@huawei.com/
>
> Link to RFC v2: https://lore.kernel.org/all/20251217102357.1730573-1-wangyushan12@huawei.com/
>
> Link to v1 again (this message): https://lore.kernel.org/all/20260203161843.649417-1-wangyushan12@huawei.com/
>
>>
>> Otherwise, please explain to us how we can compare it with `b4 diff`
>> against the previous version?
>>
>> Sending something AGAIN as v1, ignoring the entire previous submission, is
>> a clear no-go. Like you are trying till it succeeds. Negative review?
>> Let's try from v1 this time...
>>
>> This is not correct and it should not be my task to find your previous
>> discussions and decipher this v1.
>
> I did spin two versions (the actual v1 and v2), but the threads were quiet.
> Then I made a major refactor and sent it as an RFC; the thread went quiet
> again, but some compile-check issues popped up. I spun two RFC versions for
> the compile issues and dropped the RFC tag in this version since no strong
> objection showed up.
>
> I apologize for breaking the rules and for any inconvenience caused.
> As there was little discussion on the previous patches, is it OK if we
> start here as v1?

No, it is not okay. Your patchset continues, and the entire previous
feedback and history is important. Otherwise, why would I review this if I
can just as well ignore it and wait for you to send another v1 next year?

Best regards,
Krzysztof
On 2/5/2026 7:23 PM, Krzysztof Kozlowski wrote:
> On 05/02/2026 12:19, wangyushan wrote:
>>
>> On 2/5/2026 5:37 PM, Krzysztof Kozlowski wrote:
>>> On 03/02/2026 18:19, Arnd Bergmann wrote:
>>>> On Tue, Feb 3, 2026, at 17:18, Yushan Wang wrote:
>>>>> The driver will create a file of `/dev/hisi_l3c` on init, mmap
>>>>> operations to it will allocate a memory region that is guaranteed to be
>>>>> placed in L3 cache.
>>>>>
>>>>> The driver also provides unmap() to deallocated the locked memory.
>>>>>
>>>>> The driver also provides an ioctl interface for user to get cache lock
>>>>> information, such as lock restrictions and locked sizes.
>>>>>
>>>>> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
>>>>
>>>> Hi Yushan,
>>>>
>>>> Thanks for your submission. Since we are in the last week of
>>>> the merge window, this is not going to be linux-7.0 material,
>>>> but I'll have a quick look for now.
>>>
>>>
>>> To be clear - this is a v3 but with removed previous history...
>>>
>>> Previous version:
>>> https://lore.kernel.org/all/20251217102357.1730573-2-wangyushan12@huawei.com/
>>>
>>> Or even v4?
>>>
>>> https://lore.kernel.org/all/20250122065803.3363926-2-wangyushan12@huawei.com/
>>>
>>> Yushan, please start versioning your patches correctly. Use b4 or git
>>> format-patch -vx
>>
>> Hi Krzysztof,
>>
>> Sorry about the confusing versions, the complete history is as below:
>>
>> Link to v1: https://lore.kernel.org/all/20250107132907.3521574-1-wangyushan12@huawei.com
>>
>> Link to v2: https://lore.kernel.org/all/20250122065803.3363926-1-wangyushan12@huawei.com/
>>
>> Link to RFC v1: https://lore.kernel.org/all/20251125080542.3721829-1-wangyushan12@huawei.com/
>>
>> Link to RFC v2: https://lore.kernel.org/all/20251217102357.1730573-1-wangyushan12@huawei.com/
>>
>> Link to v1 again (this message): https://lore.kernel.org/all/20260203161843.649417-1-wangyushan12@huawei.com/
>>
>>>
>>> Otherwise, please explain to us how we can compare it with `b4 diff`
>>> against the previous version?
>>>
>>> Sending something AGAIN as v1, ignoring the entire previous submission, is
>>> a clear no-go. Like you are trying till it succeeds. Negative review?
>>> Let's try from v1 this time...
>>>
>>> This is not correct and it should not be my task to find your previous
>>> discussions and decipher this v1.
>>
>> I did spin two versions (the actual v1 and v2), but the threads were quiet.
>> Then I made a major refactor and sent it as an RFC; the thread went quiet
>> again, but some compile-check issues popped up. I spun two RFC versions for
>> the compile issues and dropped the RFC tag in this version since no strong
>> objection showed up.
>>
>> I apologize for breaking the rules and for any inconvenience caused.
>> As there was little discussion on the previous patches, is it OK if we
>> start here as v1?
>
> No, it is not okay. Your patchset continues, and the entire previous
> feedback and history is important. Otherwise, why would I review this if I
> can just as well ignore it and wait for you to send another v1 next year?

Sorry, I will correct the version numbers in the next versions, with the
whole history and an explanation of the mess.

Sincere apologies, I won't hide the versions again.

Yushan
On 2/4/2026 1:19 AM, Arnd Bergmann wrote:
> On Tue, Feb 3, 2026, at 17:18, Yushan Wang wrote:
>> The driver will create a file of `/dev/hisi_l3c` on init, mmap
>> operations to it will allocate a memory region that is guaranteed to be
>> placed in L3 cache.
>>
>> The driver also provides unmap() to deallocated the locked memory.
>>
>> The driver also provides an ioctl interface for user to get cache lock
>> information, such as lock restrictions and locked sizes.
>>
>> Signed-off-by: Yushan Wang <wangyushan12@huawei.com>
> Hi Yushan,
>
> Thanks for your submission. Since we are in the last week of
> the merge window, this is not going to be linux-7.0 material,
> but I'll have a quick look for now.
Many thanks for the review!
>> .../userspace-api/ioctl/ioctl-number.rst | 1 +
>> MAINTAINERS | 6 +
>> drivers/soc/hisilicon/Kconfig | 11 +
>> drivers/soc/hisilicon/Makefile | 2 +
>> drivers/soc/hisilicon/hisi_soc_l3c.c | 357 ++++++++++++++++++
>> include/uapi/misc/hisi_l3c.h | 28 ++
> I don't think this should be in drivers/soc/, since I want
> to reserve that for internal drivers without a user visible
> interface other than the soc_device information. (yes, there
> are a few historic counterexamples)
>
> I also don't think this should be a hisilicon-specific interface,
> if possible. The functionality is not that unusual in the end.
I hesitated about the directory as well; exporting a user-facing
interface directly from a driver may not be good practice, and I
am hoping for advice :)
The driver itself doesn't provide much more than a platform-specific
register configuration. What do you think about a kernel-space interface
defined somewhere else and implemented here?
> We had similar concepts using the numactl system calls in the
> past, but I don't think we should do that here because you may
> need the numa interfaces for other purposes as well, and it
> may be confusing to existing callers.
>
> Having a generic madvise()-based interface would be great;
> I am just not sure if hardware support is common enough for that.
As above, cache locking could be an optimization option for numactl or
madvise(). Maybe a more generic kernel-space interface that other
infrastructure can call into is better, though that of course needs extra
support from those infrastructures. I can try to propose such an interface,
and maybe an example caller, in the next version.
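As a purely hypothetical illustration of the shape such a generic hint could take: MADV_L3C_LOCK below does not exist in any kernel, and the name and value are invented here only to make the idea being discussed concrete; it is not a proposal from the patch author.

	/*
	 * Hypothetical only: MADV_L3C_LOCK is not a real madvise() advice
	 * value; it just illustrates a generic, madvise()-style interface
	 * for asking that a range be kept locked in the last-level cache.
	 */
	#include <stddef.h>
	#include <sys/mman.h>

	#ifndef MADV_L3C_LOCK
	#define MADV_L3C_LOCK	100	/* made-up value for illustration */
	#endif

	static void *alloc_latency_critical(size_t len)
	{
		void *buf;

		/* A plain anonymous mapping; mmap() returns page-aligned memory. */
		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return NULL;

		/*
		 * Hint: ask the kernel to lock this range into L3 if the
		 * platform supports it. On kernels without such support the
		 * call simply fails (e.g. EINVAL) and the caller falls back
		 * to the plain mapping.
		 */
		(void)madvise(buf, len, MADV_L3C_LOCK);

		return buf;
	}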
>> +	/* Continuous physical memory is required for L3 cache lock. */
>> +	pg = alloc_contig_pages(1 << order, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
>> +				cpu_to_node(smp_processor_id()), NULL);
> Since this is a user allocation, should that be GFP_USER instead
> of GFP_KERNEL?
Yes, it should be GFP_USER in this case, since the memory is used by userspace.
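A minimal sketch of what the agreed change might look like, keeping the rest of the call as posted in the patch; the error-handling lines are illustrative and not part of the original hunk.

	/* Continuous physical memory is required for L3 cache lock. */
	pg = alloc_contig_pages(1 << order,
				GFP_USER | __GFP_NOWARN | __GFP_ZERO,
				cpu_to_node(smp_processor_id()), NULL);
	if (!pg)
		return -ENOMEM;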
>> +/* HISI_L3C_INFO: cache lock info for HiSilicon SoC */
>> +#define HISI_L3C_LOCK_INFO _IOW(0xBB, 1, unsigned long)
> The specification here looks wrong, please see
> Documentation/driver-api/ioctl.rst
>
> I think for your implementation it should be
>
> #define HISI_L3C_LOCK_INFO _IOR(0xBB, 1, hisi_l3c_lock_info)
Sorry, I will correct that in the next version.
>> +struct hisi_l3c_lock_info {
>> + __u32 lock_region_num;
>> + __u64 lock_size;
>> + __u8 address_alignment;
>> + __u64 max_lock_size;
>> + __u64 min_lock_size;
>> +};
> You are leaking kernel data because of the padding in this structure,
> please rearrange the members to avoid padding.
>
> It may be better to use a different interface instead of ioctl(),
> possibly exporting global data in sysfs.
Yes, information through sysfs should do the job.
I will fix that in the next version.
> Arnd
Thanks,
Yushan
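For reference, a minimal sketch of what the sysfs alternative could look like. The attribute name and the driver-private struct hisi_soc_l3c with a max_lock_size field are assumptions made for illustration; they are not taken from the posted driver.

	/* Hypothetical sysfs export of one lock restriction; names are illustrative. */
	#include <linux/device.h>
	#include <linux/sysfs.h>

	static ssize_t max_lock_size_show(struct device *dev,
					  struct device_attribute *attr, char *buf)
	{
		struct hisi_soc_l3c *l3c = dev_get_drvdata(dev);

		return sysfs_emit(buf, "%llu\n", l3c->max_lock_size);
	}
	static DEVICE_ATTR_RO(max_lock_size);

	static struct attribute *hisi_l3c_attrs[] = {
		&dev_attr_max_lock_size.attr,
		NULL,
	};
	ATTRIBUTE_GROUPS(hisi_l3c);

The resulting group could be hooked up through the miscdevice's .groups field, so the read-only attributes appear alongside the character device without needing the ioctl at all.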