[v3] selinux: speed up avc_search_node() with large number of avc nodes

[PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Hongru Zhang 4 months, 2 weeks ago

From: Hongru Zhang <zhanghongru@xiaomi.com>

On mobile device high-load situations, permission check can happen
more than 90,000/s (8 core system). With default 512 cache nodes
configuration, avc cache miss happens more often and occasionally
leads to long time (>2ms) irqs off on both big and little cores,
which decreases system real-time capability.

An actual call stack is as follows:
 => avc_compute_av
 => avc_perm_nonode
 => avc_has_perm_noaudit
 => selinux_capable
 => security_capable
 => capable
 => __sched_setscheduler
 => do_sched_setscheduler
 => __arm64_sys_sched_setscheduler
 => invoke_syscall
 => el0_svc_common
 => do_el0_svc
 => el0_svc
 => el0t_64_sync_handler
 => el0t_64_sync

Although we can expand avc nodes through /sys/fs/selinux/cache_threshold
to mitigate long time irqs off, hash conflicts make the bucket average
length longer because of the fixed size of cache slots, leading to
avc_search_node latency increase.

Make avc cache slot size also configurable, and with fine tuning, we can
mitigate long time irqs off with slightly avc_search_node performance
regression.

Theoretically, the main overhead is memory consumption.

avc_search_node avg latency test results (about 100,000,000 times) on
Qcom SM8750, 6.6.30-android15-8:

Case 1:
+---------+---------------------+------------------------+
|         | no-patch (512/512)  | with-patch (512/512)   |
+---------+---------------------+------------------------+
| latency |        85 ns        |         87 ns          |
+---------+---------------------+------------------------+

Case 2:
+---------+---------------------+------------------------+
|         | no-patch (8192/512) | with-patch (8192/8192) |
+---------+---------------------+------------------------+
| latency |        277 ns       |         106 ns         |
+---------+---------------------+------------------------+

Case 1 shows 512 nodes configuration has ~2% performance regression
with patch.
Case 2 shows 8192 nodes configuration has ~61% latency benifit with
patch.

Signed-off-by: Hongru Zhang <zhanghongru@xiaomi.com>
---
 .../admin-guide/kernel-parameters.txt         |  4 ++
 security/selinux/avc.c                        | 68 +++++++++++++------
 2 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 747a55abf494..70dc6d659117 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6620,6 +6620,10 @@
 			1 -- enable.
 			Default value is 1.
 
+	selinux_avc_cache_slots= [SELINUX] Set the avc cache slot size.
+			Format: <int> (must be >0, power of 2)
+			Default: 512
+
 	serialnumber	[BUGS=X86-32]
 
 	sev=option[,option...] [X86-64]
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 430b0e23ee00..7a7f88012865 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -34,7 +34,7 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/avc.h>
 
-#define AVC_CACHE_SLOTS			512
+static int avc_cache_slots __ro_after_init = 512;
 #define AVC_DEF_CACHE_THRESHOLD		512
 #define AVC_CACHE_RECLAIM		16
 
@@ -68,9 +68,13 @@ struct avc_xperms_node {
 	struct list_head xpd_head; /* list head of extended_perms_decision */
 };
 
+struct avc_slot {
+	struct hlist_head	slot;		/* head for avc_node->list */
+	spinlock_t		slot_lock;	/* lock for writes */
+};
+
 struct avc_cache {
-	struct hlist_head	slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
-	spinlock_t		slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
+	struct avc_slot		*slots;
 	atomic_t		lru_hint;	/* LRU hint for reclaim scan */
 	atomic_t		active_nodes;
 	u32			latest_notif;	/* latest revocation notification */
@@ -93,14 +97,34 @@ struct selinux_avc {
 
 static struct selinux_avc selinux_avc;
 
+static int __init set_selinux_avc_cache_slots(char *str)
+{
+	int val;
+
+	if ((kstrtoint(str, 0, &val)) || !is_power_of_2(val)) {
+		pr_warn("Unable to set selinux_avc_cache_slots, use default value\n");
+		return 1;
+	}
+
+	avc_cache_slots = val;
+
+	return 1;
+}
+__setup("selinux_avc_cache_slots=", set_selinux_avc_cache_slots);
+
 void selinux_avc_init(void)
 {
 	int i;
 
+	selinux_avc.avc_cache.slots =
+		kmalloc_array(avc_cache_slots, sizeof(struct avc_slot), GFP_KERNEL);
+	if (!selinux_avc.avc_cache.slots)
+		panic("SELinux: No memory to alloc avc cache slots\n");
+
 	selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
-	for (i = 0; i < AVC_CACHE_SLOTS; i++) {
-		INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]);
-		spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]);
+	for (i = 0; i < avc_cache_slots; i++) {
+		INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i].slot);
+		spin_lock_init(&selinux_avc.avc_cache.slots[i].slot_lock);
 	}
 	atomic_set(&selinux_avc.avc_cache.active_nodes, 0);
 	atomic_set(&selinux_avc.avc_cache.lru_hint, 0);
@@ -124,7 +148,7 @@ static struct kmem_cache *avc_xperms_cachep __ro_after_init;
 
 static inline u32 avc_hash(u32 ssid, u32 tsid, u16 tclass)
 {
-	return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1);
+	return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (avc_cache_slots - 1);
 }
 
 /**
@@ -150,8 +174,8 @@ int avc_get_hash_stats(char *page)
 
 	slots_used = 0;
 	max_chain_len = 0;
-	for (i = 0; i < AVC_CACHE_SLOTS; i++) {
-		head = &selinux_avc.avc_cache.slots[i];
+	for (i = 0; i < avc_cache_slots; i++) {
+		head = &selinux_avc.avc_cache.slots[i].slot;
 		if (!hlist_empty(head)) {
 			slots_used++;
 			chain_len = 0;
@@ -167,7 +191,7 @@ int avc_get_hash_stats(char *page)
 	return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
 			 "longest chain: %d\n",
 			 atomic_read(&selinux_avc.avc_cache.active_nodes),
-			 slots_used, AVC_CACHE_SLOTS, max_chain_len);
+			 slots_used, avc_cache_slots, max_chain_len);
 }
 
 /*
@@ -463,11 +487,11 @@ static inline int avc_reclaim_node(void)
 	struct hlist_head *head;
 	spinlock_t *lock;
 
-	for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
+	for (try = 0, ecx = 0; try < avc_cache_slots; try++) {
 		hvalue = atomic_inc_return(&selinux_avc.avc_cache.lru_hint) &
-			(AVC_CACHE_SLOTS - 1);
-		head = &selinux_avc.avc_cache.slots[hvalue];
-		lock = &selinux_avc.avc_cache.slots_lock[hvalue];
+			(avc_cache_slots - 1);
+		head = &selinux_avc.avc_cache.slots[hvalue].slot;
+		lock = &selinux_avc.avc_cache.slots[hvalue].slot_lock;
 
 		if (!spin_trylock_irqsave(lock, flags))
 			continue;
@@ -524,7 +548,7 @@ static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass)
 	struct hlist_head *head;
 
 	hvalue = avc_hash(ssid, tsid, tclass);
-	head = &selinux_avc.avc_cache.slots[hvalue];
+	head = &selinux_avc.avc_cache.slots[hvalue].slot;
 	hlist_for_each_entry_rcu(node, head, list) {
 		if (ssid == node->ae.ssid &&
 		    tclass == node->ae.tclass &&
@@ -625,8 +649,8 @@ static void avc_insert(u32 ssid, u32 tsid, u16 tclass,
 	}
 
 	hvalue = avc_hash(ssid, tsid, tclass);
-	head = &selinux_avc.avc_cache.slots[hvalue];
-	lock = &selinux_avc.avc_cache.slots_lock[hvalue];
+	head = &selinux_avc.avc_cache.slots[hvalue].slot;
+	lock = &selinux_avc.avc_cache.slots[hvalue].slot_lock;
 	spin_lock_irqsave(lock, flag);
 	hlist_for_each_entry(pos, head, list) {
 		if (pos->ae.ssid == ssid &&
@@ -846,8 +870,8 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm,
 	/* Lock the target slot */
 	hvalue = avc_hash(ssid, tsid, tclass);
 
-	head = &selinux_avc.avc_cache.slots[hvalue];
-	lock = &selinux_avc.avc_cache.slots_lock[hvalue];
+	head = &selinux_avc.avc_cache.slots[hvalue].slot;
+	lock = &selinux_avc.avc_cache.slots[hvalue].slot_lock;
 
 	spin_lock_irqsave(lock, flag);
 
@@ -929,9 +953,9 @@ static void avc_flush(void)
 	unsigned long flag;
 	int i;
 
-	for (i = 0; i < AVC_CACHE_SLOTS; i++) {
-		head = &selinux_avc.avc_cache.slots[i];
-		lock = &selinux_avc.avc_cache.slots_lock[i];
+	for (i = 0; i < avc_cache_slots; i++) {
+		head = &selinux_avc.avc_cache.slots[i].slot;
+		lock = &selinux_avc.avc_cache.slots[i].slot_lock;
 
 		spin_lock_irqsave(lock, flag);
 		/*
-- 
2.43.0

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Paul Moore 3 months, 3 weeks ago

On Sep 26, 2025 Hongru Zhang <zhanghongru06@gmail.com> wrote:
> 
> On mobile device high-load situations, permission check can happen
> more than 90,000/s (8 core system). With default 512 cache nodes
> configuration, avc cache miss happens more often and occasionally
> leads to long time (>2ms) irqs off on both big and little cores,
> which decreases system real-time capability.
> 
> An actual call stack is as follows:
>  => avc_compute_av
>  => avc_perm_nonode
>  => avc_has_perm_noaudit
>  => selinux_capable
>  => security_capable
>  => capable
>  => __sched_setscheduler
>  => do_sched_setscheduler
>  => __arm64_sys_sched_setscheduler
>  => invoke_syscall
>  => el0_svc_common
>  => do_el0_svc
>  => el0_svc
>  => el0t_64_sync_handler
>  => el0t_64_sync
> 
> Although we can expand avc nodes through /sys/fs/selinux/cache_threshold
> to mitigate long time irqs off, hash conflicts make the bucket average
> length longer because of the fixed size of cache slots, leading to
> avc_search_node latency increase.
> 
> Make avc cache slot size also configurable, and with fine tuning, we can
> mitigate long time irqs off with slightly avc_search_node performance
> regression.
> 
> Theoretically, the main overhead is memory consumption.
> 
> avc_search_node avg latency test results (about 100,000,000 times) on
> Qcom SM8750, 6.6.30-android15-8:
> 
> Case 1:
> +---------+---------------------+------------------------+
> |         | no-patch (512/512)  | with-patch (512/512)   |
> +---------+---------------------+------------------------+
> | latency |        85 ns        |         87 ns          |
> +---------+---------------------+------------------------+
> 
> Case 2:
> +---------+---------------------+------------------------+
> |         | no-patch (8192/512) | with-patch (8192/8192) |
> +---------+---------------------+------------------------+
> | latency |        277 ns       |         106 ns         |
> +---------+---------------------+------------------------+
> 
> Case 1 shows 512 nodes configuration has ~2% performance regression
> with patch.
> Case 2 shows 8192 nodes configuration has ~61% latency benifit with
> patch.
> 
> Signed-off-by: Hongru Zhang <zhanghongru@xiaomi.com>
> Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |  4 ++
>  security/selinux/avc.c                        | 68 +++++++++++++------
>  2 files changed, 50 insertions(+), 22 deletions(-)

I would expect the number of active AVC nodes, and AVC churn in general,
to be very policy dependent; some policies and use cases simply result in
more AVC nodes than others.  With that in mind, I'm wondering if instead
of using a kernel command line parameter to specify the number of AVC
buckets, we should instead include an AVC size "hint" in the policy that
we can use to size the AVC when loading a new policy.

Thoughts?

I think it would be important to consider it strictly as a "hint" as
that would make life easier, e.g. if the previous policy hinted at a
larger AVC we may not want to bother with reducing the number of buckets.
I would suggest starting with an implementation that uses the hint as a
power of two for the number of AVC slots/buckets, with a value of '0'
indicating a default value (512 slots, e.g. '2^9').

--
paul-moore.com

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Stephen Smalley 3 months, 3 weeks ago

On Thu, Oct 16, 2025 at 5:18 PM Paul Moore <paul@paul-moore.com> wrote:
>
> On Sep 26, 2025 Hongru Zhang <zhanghongru06@gmail.com> wrote:
> >
> > On mobile device high-load situations, permission check can happen
> > more than 90,000/s (8 core system). With default 512 cache nodes
> > configuration, avc cache miss happens more often and occasionally
> > leads to long time (>2ms) irqs off on both big and little cores,
> > which decreases system real-time capability.
> >
> > An actual call stack is as follows:
> >  => avc_compute_av
> >  => avc_perm_nonode
> >  => avc_has_perm_noaudit
> >  => selinux_capable
> >  => security_capable
> >  => capable
> >  => __sched_setscheduler
> >  => do_sched_setscheduler
> >  => __arm64_sys_sched_setscheduler
> >  => invoke_syscall
> >  => el0_svc_common
> >  => do_el0_svc
> >  => el0_svc
> >  => el0t_64_sync_handler
> >  => el0t_64_sync
> >
> > Although we can expand avc nodes through /sys/fs/selinux/cache_threshold
> > to mitigate long time irqs off, hash conflicts make the bucket average
> > length longer because of the fixed size of cache slots, leading to
> > avc_search_node latency increase.
> >
> > Make avc cache slot size also configurable, and with fine tuning, we can
> > mitigate long time irqs off with slightly avc_search_node performance
> > regression.
> >
> > Theoretically, the main overhead is memory consumption.
> >
> > avc_search_node avg latency test results (about 100,000,000 times) on
> > Qcom SM8750, 6.6.30-android15-8:
> >
> > Case 1:
> > +---------+---------------------+------------------------+
> > |         | no-patch (512/512)  | with-patch (512/512)   |
> > +---------+---------------------+------------------------+
> > | latency |        85 ns        |         87 ns          |
> > +---------+---------------------+------------------------+
> >
> > Case 2:
> > +---------+---------------------+------------------------+
> > |         | no-patch (8192/512) | with-patch (8192/8192) |
> > +---------+---------------------+------------------------+
> > | latency |        277 ns       |         106 ns         |
> > +---------+---------------------+------------------------+
> >
> > Case 1 shows 512 nodes configuration has ~2% performance regression
> > with patch.
> > Case 2 shows 8192 nodes configuration has ~61% latency benifit with
> > patch.
> >
> > Signed-off-by: Hongru Zhang <zhanghongru@xiaomi.com>
> > Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
> > ---
> >  .../admin-guide/kernel-parameters.txt         |  4 ++
> >  security/selinux/avc.c                        | 68 +++++++++++++------
> >  2 files changed, 50 insertions(+), 22 deletions(-)
>
> I would expect the number of active AVC nodes, and AVC churn in general,
> to be very policy dependent; some policies and use cases simply result in
> more AVC nodes than others.  With that in mind, I'm wondering if instead
> of using a kernel command line parameter to specify the number of AVC
> buckets, we should instead include an AVC size "hint" in the policy that
> we can use to size the AVC when loading a new policy.
>
> Thoughts?
>
> I think it would be important to consider it strictly as a "hint" as
> that would make life easier, e.g. if the previous policy hinted at a
> larger AVC we may not want to bother with reducing the number of buckets.
> I would suggest starting with an implementation that uses the hint as a
> power of two for the number of AVC slots/buckets, with a value of '0'
> indicating a default value (512 slots, e.g. '2^9').

So, aside from Hongru's points about this requiring a change to the
binary policy format and compiler and introducing possible
atomicity/locking issues in the AVC code when accessing the number of
buckets, I am also uncertain that this is something that is fully
determinable from policy alone. A small AVC might be fine even with a
large policy depending on the actual workload that is running on the
system in question. Hence, I was fine with this being a kernel
parameter. If we did want to introduce dynamism here despite these
considerations, then two possibilities come to mind:
1. Allow it to be set/modified via a new /sys/fs/selinux/avc node in
the same manner as other existing tunables in selinuxfs. This avoids
the need to modify the policy format / compiler and allows tuning on
an end system based on its workload.
and/or
2. Compute the AVC number of buckets based on the policy avtab size
which is already available from existing binary policies. This
likewise avoids the need to modify the policy format / compiler while
automatically tuning the number of buckets based on the size of the
policy.

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Paul Moore 3 months, 3 weeks ago

On Fri, Oct 17, 2025 at 7:59 AM Stephen Smalley
<stephen.smalley.work@gmail.com> wrote:
> On Thu, Oct 16, 2025 at 5:18 PM Paul Moore <paul@paul-moore.com> wrote:
> > On Sep 26, 2025 Hongru Zhang <zhanghongru06@gmail.com> wrote:

...

> > I would expect the number of active AVC nodes, and AVC churn in general,
> > to be very policy dependent; some policies and use cases simply result in
> > more AVC nodes than others.  With that in mind, I'm wondering if instead
> > of using a kernel command line parameter to specify the number of AVC
> > buckets, we should instead include an AVC size "hint" in the policy that
> > we can use to size the AVC when loading a new policy.
> >
> > Thoughts?
> >
> > I think it would be important to consider it strictly as a "hint" as
> > that would make life easier, e.g. if the previous policy hinted at a
> > larger AVC we may not want to bother with reducing the number of buckets.
> > I would suggest starting with an implementation that uses the hint as a
> > power of two for the number of AVC slots/buckets, with a value of '0'
> > indicating a default value (512 slots, e.g. '2^9').
>
> So, aside from Hongru's points about this requiring a change to the
> binary policy format and compiler and introducing possible
> atomicity/locking issues in the AVC code when accessing the number of
> buckets ...

I know you've heard me say this before, but for the sake of those who
haven't, "because it's a lot of work" isn't something that I consider
to be a valid excuse.  It's fine, and good (!), to explain the work
needed to successfully make a change, but I have an almost allergic
reaction to those who use the amount of work needed as an argument
against doing The Right Thing.

> I am also uncertain that this is something that is fully
> determinable from policy alone.

Agreed, but if we are going to make this changeable, I'd rather see it
as something that could be changed without requiring a reboot.  Not
wanting to add yet another selinuxfs node, and seeing *some*
relationship between AVC size and policy, adding a AVC size hint to
the policy seems reasonable.

However, as I mentioned in my reply to Hongru, we may be able to solve
the immediate problem with a Kconfig tunable.

-- 
paul-moore.com

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Hongru Zhang 3 months, 3 weeks ago

> On Sep 26, 2025 Hongru Zhang <zhanghongru06@gmail.com> wrote:
> > 
> > On mobile device high-load situations, permission check can happen
> > more than 90,000/s (8 core system). With default 512 cache nodes
> > configuration, avc cache miss happens more often and occasionally
> > leads to long time (>2ms) irqs off on both big and little cores,
> > which decreases system real-time capability.
> > 
> > An actual call stack is as follows:
> >  => avc_compute_av
> >  => avc_perm_nonode
> >  => avc_has_perm_noaudit
> >  => selinux_capable
> >  => security_capable
> >  => capable
> >  => __sched_setscheduler
> >  => do_sched_setscheduler
> >  => __arm64_sys_sched_setscheduler
> >  => invoke_syscall
> >  => el0_svc_common
> >  => do_el0_svc
> >  => el0_svc
> >  => el0t_64_sync_handler
> >  => el0t_64_sync
> > 
> > Although we can expand avc nodes through /sys/fs/selinux/cache_threshold
> > to mitigate long time irqs off, hash conflicts make the bucket average
> > length longer because of the fixed size of cache slots, leading to
> > avc_search_node latency increase.
> > 
> > Make avc cache slot size also configurable, and with fine tuning, we can
> > mitigate long time irqs off with slightly avc_search_node performance
> > regression.
> > 
> > Theoretically, the main overhead is memory consumption.
> > 
> > avc_search_node avg latency test results (about 100,000,000 times) on
> > Qcom SM8750, 6.6.30-android15-8:
> > 
> > Case 1:
> > +---------+---------------------+------------------------+
> > |         | no-patch (512/512)  | with-patch (512/512)   |
> > +---------+---------------------+------------------------+
> > | latency |        85 ns        |         87 ns          |
> > +---------+---------------------+------------------------+
> > 
> > Case 2:
> > +---------+---------------------+------------------------+
> > |         | no-patch (8192/512) | with-patch (8192/8192) |
> > +---------+---------------------+------------------------+
> > | latency |        277 ns       |         106 ns         |
> > +---------+---------------------+------------------------+
> > 
> > Case 1 shows 512 nodes configuration has ~2% performance regression
> > with patch.
> > Case 2 shows 8192 nodes configuration has ~61% latency benifit with
> > patch.
> > 
> > Signed-off-by: Hongru Zhang <zhanghongru@xiaomi.com>
> > Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>
> > ---
> >  .../admin-guide/kernel-parameters.txt         |  4 ++
> >  security/selinux/avc.c                        | 68 +++++++++++++------
> >  2 files changed, 50 insertions(+), 22 deletions(-)
> 
> I would expect the number of active AVC nodes, and AVC churn in general,
> to be very policy dependent; some policies and use cases simply result in
> more AVC nodes than others.  With that in mind, I'm wondering if instead
> of using a kernel command line parameter to specify the number of AVC
> buckets, we should instead include an AVC size "hint" in the policy that
> we can use to size the AVC when loading a new policy.
> 
> Thoughts?

I previously considered supporting dynamic adjustment of slot size during
runtime, but this seems to introduce code complexity and overhead. Every
time avc_lookup() or avc_insert() happens, we would need to check if the
table exists. Adjusting slot size and accessing a specific slot might
occur simultaneously, potentially requiring additional lock protection.

When increasing slot size, we could directly copy the contents from the
old table. When decreasing slot size, nodes exceeding the new slot size
would need to be re-hashed and attached to appropriate positions.

On my Android device, policies are fixed before system image release and
don't change or load dynamically during system running. Using kernel
parameters for adjustment ensures no additional locks or checks are needed
during runtime table access, maintaining simplicity and efficiency of the
lookup code.

If this is necessary, I would appreciate some assistance and need time
to think and develop it. I'm not a senior developer in this field and
lack sufficient knowledge in this area.

To support this, I think I need to:
1. Update the relevant specifications to add descriptions for this new
    field;
2. Modify the compiler to support this new field while ensuring
    compatibility in the generated policy file;
3. Modify the kernel to parse this field and adjust the slot size
    accordingly based on its value.

I'm unsure if this is correct and comprehensive. I would be very grateful
for any guidance.

> I think it would be important to consider it strictly as a "hint" as
> that would make life easier, e.g. if the previous policy hinted at a
> larger AVC we may not want to bother with reducing the number of buckets.
> I would suggest starting with an implementation that uses the hint as a
> power of two for the number of AVC slots/buckets, with a value of '0'
> indicating a default value (512 slots, e.g. '2^9').

Would this approach lead to slot size monotonically increasing, always 
remaining at the maximum size? This could potentially cause some memory
waste, though the total amount might not be very large.

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Paul Moore 3 months, 3 weeks ago

On Fri, Oct 17, 2025 at 4:10 AM Hongru Zhang <zhanghongru06@gmail.com> wrote:
> > On Sep 26, 2025 Hongru Zhang <zhanghongru06@gmail.com> wrote:

...

> > I would expect the number of active AVC nodes, and AVC churn in general,
> > to be very policy dependent; some policies and use cases simply result in
> > more AVC nodes than others.  With that in mind, I'm wondering if instead
> > of using a kernel command line parameter to specify the number of AVC
> > buckets, we should instead include an AVC size "hint" in the policy that
> > we can use to size the AVC when loading a new policy.
> >
> > Thoughts?
>
> I previously considered supporting dynamic adjustment of slot size during
> runtime, but this seems to introduce code complexity and overhead. Every
> time avc_lookup() or avc_insert() happens, we would need to check if the
> table exists. Adjusting slot size and accessing a specific slot might
> occur simultaneously, potentially requiring additional lock protection.

I would imagine that a very simple implementation would simply convert
the selinux_avc variable from an instance of selinux_avc to a RCU
protected selinux_avc pointer.  As the AVC already uses RCU, I think
the number of changes should be relatively minimal:

* Ensure we wrap selinux_avc derefs with rcu_dereference().  This
should be the only real change needed for lookups and insertions as
every search through the AVC will start with deref'ing the selinux_avc
pointer.

* Update avc_init() to allocate the cache slots with a default value,
fail if unable to allocate the cache memory.  If we ensure that the
selinux_avc pointer will always be valid, we can avoid having to check
it.

* Policy (re)loads which would change the number of AVC cache slots
would allocate and initialize a new selinux_avc then swap the global
selinux_avc pointer under spinlock.  The old AVC cache could then be
free'd according to RCU rules.  I haven't thought about it too much,
but I suspect we could do away with flushing the old AVC in these
cases, even if we can't, flushing the old AVC is easy enough.

> When increasing slot size, we could directly copy the contents from the
> old table. When decreasing slot size, nodes exceeding the new slot size
> would need to be re-hashed and attached to appropriate positions.

Changing the number of cache slots should happen infrequently enough
that I see no need to migrate the old entries to the new cache
instance.  It's a cache, it will fill back up naturally.

> On my Android device, policies are fixed before system image release and
> don't change or load dynamically during system running. Using kernel
> parameters for adjustment ensures no additional locks or checks are needed
> during runtime table access, maintaining simplicity and efficiency of the
> lookup code.

If your system does not update its policy over the course of a single
boot, and presumably doesn't drastically change its behavior during
that time, there is another, simpler option that we should consider:
setting AVC_CACHE_SLOTS at compile time based on a Kconfig tunable.
The code change would essentially be one line:

 #define AVC_CACHE_SLOTS   (2 << CONFIG_SECURITY_SELINUX_AVC_HASH_BITS)

... with a corresponding entry in security/selinux/Kconfig.  That
should be a very easy change, and if you set the default value such
that AVC_CACHE_SLOTS remains at 512, there should be no impact on
existing systems.

-- 
paul-moore.com

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Hongru Zhang 3 months, 2 weeks ago

> I would imagine that a very simple implementation would simply convert
> the selinux_avc variable from an instance of selinux_avc to a RCU
> protected selinux_avc pointer.  As the AVC already uses RCU, I think
> the number of changes should be relatively minimal:
> 
> * Ensure we wrap selinux_avc derefs with rcu_dereference().  This
> should be the only real change needed for lookups and insertions as
> every search through the AVC will start with deref'ing the selinux_avc
> pointer.
> 
> * Update avc_init() to allocate the cache slots with a default value,
> fail if unable to allocate the cache memory.  If we ensure that the
> selinux_avc pointer will always be valid, we can avoid having to check
> it.
> 
> * Policy (re)loads which would change the number of AVC cache slots
> would allocate and initialize a new selinux_avc then swap the global
> selinux_avc pointer under spinlock.  The old AVC cache could then be
> free'd according to RCU rules.  I haven't thought about it too much,
> but I suspect we could do away with flushing the old AVC in these
> cases, even if we can't, flushing the old AVC is easy enough.
> 
> > When increasing slot size, we could directly copy the contents from the
> > old table. When decreasing slot size, nodes exceeding the new slot size
> > would need to be re-hashed and attached to appropriate positions.
> 
> Changing the number of cache slots should happen infrequently enough
> that I see no need to migrate the old entries to the new cache
> instance.  It's a cache, it will fill back up naturally.
> 
> > On my Android device, policies are fixed before system image release and
> > don't change or load dynamically during system running. Using kernel
> > parameters for adjustment ensures no additional locks or checks are neede=
> d
> > during runtime table access, maintaining simplicity and efficiency of the
> > lookup code.
> 
> If your system does not update its policy over the course of a single
> boot, and presumably doesn't drastically change its behavior during
> that time, there is another, simpler option that we should consider:
> setting AVC_CACHE_SLOTS at compile time based on a Kconfig tunable.
> The code change would essentially be one line:
> 
>  #define AVC_CACHE_SLOTS   (2 << CONFIG_SECURITY_SELINUX_AVC_HASH_BITS)
> 
> ... with a corresponding entry in security/selinux/Kconfig.  That
> should be a very easy change, and if you set the default value such
> that AVC_CACHE_SLOTS remains at 512, there should be no impact on
> existing systems.
> 

Alright，I will add a CONFIG_SECURITY_SELINUX_AVC_HASH_BITS in
security/selinux/Kconfig, the range is between 9 and 14 (512 : 16384),
with a default value of 9. And then I will send a new patchset version.

I will try to submit the final version in Q1 2026 based on the discussion
(Because I have some planned Q4 work that hasn't been completed yet).

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Paul Moore 3 months, 2 weeks ago

On Tue, Oct 21, 2025 at 8:38 AM Hongru Zhang <zhanghongru06@gmail.com> wrote:
> > I would imagine that a very simple implementation would simply convert
> > the selinux_avc variable from an instance of selinux_avc to a RCU
> > protected selinux_avc pointer.  As the AVC already uses RCU, I think
> > the number of changes should be relatively minimal:
> >
> > * Ensure we wrap selinux_avc derefs with rcu_dereference().  This
> > should be the only real change needed for lookups and insertions as
> > every search through the AVC will start with deref'ing the selinux_avc
> > pointer.
> >
> > * Update avc_init() to allocate the cache slots with a default value,
> > fail if unable to allocate the cache memory.  If we ensure that the
> > selinux_avc pointer will always be valid, we can avoid having to check
> > it.
> >
> > * Policy (re)loads which would change the number of AVC cache slots
> > would allocate and initialize a new selinux_avc then swap the global
> > selinux_avc pointer under spinlock.  The old AVC cache could then be
> > free'd according to RCU rules.  I haven't thought about it too much,
> > but I suspect we could do away with flushing the old AVC in these
> > cases, even if we can't, flushing the old AVC is easy enough.
> >
> > > When increasing slot size, we could directly copy the contents from the
> > > old table. When decreasing slot size, nodes exceeding the new slot size
> > > would need to be re-hashed and attached to appropriate positions.
> >
> > Changing the number of cache slots should happen infrequently enough
> > that I see no need to migrate the old entries to the new cache
> > instance.  It's a cache, it will fill back up naturally.
> >
> > > On my Android device, policies are fixed before system image release and
> > > don't change or load dynamically during system running. Using kernel
> > > parameters for adjustment ensures no additional locks or checks are neede=
> > d
> > > during runtime table access, maintaining simplicity and efficiency of the
> > > lookup code.
> >
> > If your system does not update its policy over the course of a single
> > boot, and presumably doesn't drastically change its behavior during
> > that time, there is another, simpler option that we should consider:
> > setting AVC_CACHE_SLOTS at compile time based on a Kconfig tunable.
> > The code change would essentially be one line:
> >
> >  #define AVC_CACHE_SLOTS   (2 << CONFIG_SECURITY_SELINUX_AVC_HASH_BITS)
> >
> > ... with a corresponding entry in security/selinux/Kconfig.  That
> > should be a very easy change, and if you set the default value such
> > that AVC_CACHE_SLOTS remains at 512, there should be no impact on
> > existing systems.
>
> Alright，I will add a CONFIG_SECURITY_SELINUX_AVC_HASH_BITS in
> security/selinux/Kconfig, the range is between 9 and 14 (512 : 16384),
> with a default value of 9. And then I will send a new patchset version.

That seems reasonable.  I'm sure you've seen it already, but you'll
likely need to modify AVC_DEF_CACHE_THRESHOLD as well ... or honestly
just remove it in favor of AVC_CACHE_SLOTS, it's only used to set an
initial value for the cache threshold.

> I will try to submit the final version in Q1 2026 based on the discussion
> (Because I have some planned Q4 work that hasn't been completed yet).

No worries, thank you!

-- 
paul-moore.com

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Hongru Zhang 3 months, 2 weeks ago

> That seems reasonable.  I'm sure you've seen it already, but you'll
> likely need to modify AVC_DEF_CACHE_THRESHOLD as well ... or honestly
> just remove it in favor of AVC_CACHE_SLOTS, it's only used to set an
> initial value for the cache threshold.

How about this?

#define AVC_DEF_CACHE_THRESHOLD AVC_CACHE_SLOTS

We can preserve the original semantics this way.

> 
> > I will try to submit the final version in Q1 2026 based on the discussion
> > (Because I have some planned Q4 work that hasn't been completed yet).
> 
> No worries, thank you!

Re: [PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot

Posted by Stephen Smalley 4 months, 2 weeks ago

On Fri, Sep 26, 2025 at 2:23 AM Hongru Zhang <zhanghongru06@gmail.com> wrote:
>
> From: Hongru Zhang <zhanghongru@xiaomi.com>
>
> On mobile device high-load situations, permission check can happen
> more than 90,000/s (8 core system). With default 512 cache nodes
> configuration, avc cache miss happens more often and occasionally
> leads to long time (>2ms) irqs off on both big and little cores,
> which decreases system real-time capability.
>
> An actual call stack is as follows:
>  => avc_compute_av
>  => avc_perm_nonode
>  => avc_has_perm_noaudit
>  => selinux_capable
>  => security_capable
>  => capable
>  => __sched_setscheduler
>  => do_sched_setscheduler
>  => __arm64_sys_sched_setscheduler
>  => invoke_syscall
>  => el0_svc_common
>  => do_el0_svc
>  => el0_svc
>  => el0t_64_sync_handler
>  => el0t_64_sync
>
> Although we can expand avc nodes through /sys/fs/selinux/cache_threshold
> to mitigate long time irqs off, hash conflicts make the bucket average
> length longer because of the fixed size of cache slots, leading to
> avc_search_node latency increase.
>
> Make avc cache slot size also configurable, and with fine tuning, we can
> mitigate long time irqs off with slightly avc_search_node performance
> regression.
>
> Theoretically, the main overhead is memory consumption.
>
> avc_search_node avg latency test results (about 100,000,000 times) on
> Qcom SM8750, 6.6.30-android15-8:
>
> Case 1:
> +---------+---------------------+------------------------+
> |         | no-patch (512/512)  | with-patch (512/512)   |
> +---------+---------------------+------------------------+
> | latency |        85 ns        |         87 ns          |
> +---------+---------------------+------------------------+
>
> Case 2:
> +---------+---------------------+------------------------+
> |         | no-patch (8192/512) | with-patch (8192/8192) |
> +---------+---------------------+------------------------+
> | latency |        277 ns       |         106 ns         |
> +---------+---------------------+------------------------+
>
> Case 1 shows 512 nodes configuration has ~2% performance regression
> with patch.
> Case 2 shows 8192 nodes configuration has ~61% latency benifit with
> patch.
>
> Signed-off-by: Hongru Zhang <zhanghongru@xiaomi.com>

Reviewed-by: Stephen Smalley <stephen.smalley.work@gmail.com>

> ---
>  .../admin-guide/kernel-parameters.txt         |  4 ++
>  security/selinux/avc.c                        | 68 +++++++++++++------
>  2 files changed, 50 insertions(+), 22 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 747a55abf494..70dc6d659117 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -6620,6 +6620,10 @@
>                         1 -- enable.
>                         Default value is 1.
>
> +       selinux_avc_cache_slots= [SELINUX] Set the avc cache slot size.
> +                       Format: <int> (must be >0, power of 2)
> +                       Default: 512
> +
>         serialnumber    [BUGS=X86-32]
>
>         sev=option[,option...] [X86-64]
> diff --git a/security/selinux/avc.c b/security/selinux/avc.c
> index 430b0e23ee00..7a7f88012865 100644
> --- a/security/selinux/avc.c
> +++ b/security/selinux/avc.c
> @@ -34,7 +34,7 @@
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/avc.h>
>
> -#define AVC_CACHE_SLOTS                        512
> +static int avc_cache_slots __ro_after_init = 512;
>  #define AVC_DEF_CACHE_THRESHOLD                512
>  #define AVC_CACHE_RECLAIM              16
>
> @@ -68,9 +68,13 @@ struct avc_xperms_node {
>         struct list_head xpd_head; /* list head of extended_perms_decision */
>  };
>
> +struct avc_slot {
> +       struct hlist_head       slot;           /* head for avc_node->list */
> +       spinlock_t              slot_lock;      /* lock for writes */
> +};
> +
>  struct avc_cache {
> -       struct hlist_head       slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
> -       spinlock_t              slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
> +       struct avc_slot         *slots;
>         atomic_t                lru_hint;       /* LRU hint for reclaim scan */
>         atomic_t                active_nodes;
>         u32                     latest_notif;   /* latest revocation notification */
> @@ -93,14 +97,34 @@ struct selinux_avc {
>
>  static struct selinux_avc selinux_avc;
>
> +static int __init set_selinux_avc_cache_slots(char *str)
> +{
> +       int val;
> +
> +       if ((kstrtoint(str, 0, &val)) || !is_power_of_2(val)) {
> +               pr_warn("Unable to set selinux_avc_cache_slots, use default value\n");
> +               return 1;
> +       }
> +
> +       avc_cache_slots = val;
> +
> +       return 1;
> +}
> +__setup("selinux_avc_cache_slots=", set_selinux_avc_cache_slots);
> +
>  void selinux_avc_init(void)
>  {
>         int i;
>
> +       selinux_avc.avc_cache.slots =
> +               kmalloc_array(avc_cache_slots, sizeof(struct avc_slot), GFP_KERNEL);
> +       if (!selinux_avc.avc_cache.slots)
> +               panic("SELinux: No memory to alloc avc cache slots\n");
> +
>         selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
> -       for (i = 0; i < AVC_CACHE_SLOTS; i++) {
> -               INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]);
> -               spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]);
> +       for (i = 0; i < avc_cache_slots; i++) {
> +               INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i].slot);
> +               spin_lock_init(&selinux_avc.avc_cache.slots[i].slot_lock);
>         }
>         atomic_set(&selinux_avc.avc_cache.active_nodes, 0);
>         atomic_set(&selinux_avc.avc_cache.lru_hint, 0);
> @@ -124,7 +148,7 @@ static struct kmem_cache *avc_xperms_cachep __ro_after_init;
>
>  static inline u32 avc_hash(u32 ssid, u32 tsid, u16 tclass)
>  {
> -       return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1);
> +       return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (avc_cache_slots - 1);
>  }
>
>  /**
> @@ -150,8 +174,8 @@ int avc_get_hash_stats(char *page)
>
>         slots_used = 0;
>         max_chain_len = 0;
> -       for (i = 0; i < AVC_CACHE_SLOTS; i++) {
> -               head = &selinux_avc.avc_cache.slots[i];
> +       for (i = 0; i < avc_cache_slots; i++) {
> +               head = &selinux_avc.avc_cache.slots[i].slot;
>                 if (!hlist_empty(head)) {
>                         slots_used++;
>                         chain_len = 0;
> @@ -167,7 +191,7 @@ int avc_get_hash_stats(char *page)
>         return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
>                          "longest chain: %d\n",
>                          atomic_read(&selinux_avc.avc_cache.active_nodes),
> -                        slots_used, AVC_CACHE_SLOTS, max_chain_len);
> +                        slots_used, avc_cache_slots, max_chain_len);
>  }
>
>  /*
> @@ -463,11 +487,11 @@ static inline int avc_reclaim_node(void)
>         struct hlist_head *head;
>         spinlock_t *lock;
>
> -       for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
> +       for (try = 0, ecx = 0; try < avc_cache_slots; try++) {
>                 hvalue = atomic_inc_return(&selinux_avc.avc_cache.lru_hint) &
> -                       (AVC_CACHE_SLOTS - 1);
> -               head = &selinux_avc.avc_cache.slots[hvalue];
> -               lock = &selinux_avc.avc_cache.slots_lock[hvalue];
> +                       (avc_cache_slots - 1);
> +               head = &selinux_avc.avc_cache.slots[hvalue].slot;
> +               lock = &selinux_avc.avc_cache.slots[hvalue].slot_lock;
>
>                 if (!spin_trylock_irqsave(lock, flags))
>                         continue;
> @@ -524,7 +548,7 @@ static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass)
>         struct hlist_head *head;
>
>         hvalue = avc_hash(ssid, tsid, tclass);
> -       head = &selinux_avc.avc_cache.slots[hvalue];
> +       head = &selinux_avc.avc_cache.slots[hvalue].slot;
>         hlist_for_each_entry_rcu(node, head, list) {
>                 if (ssid == node->ae.ssid &&
>                     tclass == node->ae.tclass &&
> @@ -625,8 +649,8 @@ static void avc_insert(u32 ssid, u32 tsid, u16 tclass,
>         }
>
>         hvalue = avc_hash(ssid, tsid, tclass);
> -       head = &selinux_avc.avc_cache.slots[hvalue];
> -       lock = &selinux_avc.avc_cache.slots_lock[hvalue];
> +       head = &selinux_avc.avc_cache.slots[hvalue].slot;
> +       lock = &selinux_avc.avc_cache.slots[hvalue].slot_lock;
>         spin_lock_irqsave(lock, flag);
>         hlist_for_each_entry(pos, head, list) {
>                 if (pos->ae.ssid == ssid &&
> @@ -846,8 +870,8 @@ static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm,
>         /* Lock the target slot */
>         hvalue = avc_hash(ssid, tsid, tclass);
>
> -       head = &selinux_avc.avc_cache.slots[hvalue];
> -       lock = &selinux_avc.avc_cache.slots_lock[hvalue];
> +       head = &selinux_avc.avc_cache.slots[hvalue].slot;
> +       lock = &selinux_avc.avc_cache.slots[hvalue].slot_lock;
>
>         spin_lock_irqsave(lock, flag);
>
> @@ -929,9 +953,9 @@ static void avc_flush(void)
>         unsigned long flag;
>         int i;
>
> -       for (i = 0; i < AVC_CACHE_SLOTS; i++) {
> -               head = &selinux_avc.avc_cache.slots[i];
> -               lock = &selinux_avc.avc_cache.slots_lock[i];
> +       for (i = 0; i < avc_cache_slots; i++) {
> +               head = &selinux_avc.avc_cache.slots[i].slot;
> +               lock = &selinux_avc.avc_cache.slots[i].slot_lock;
>
>                 spin_lock_irqsave(lock, flag);
>                 /*
> --
> 2.43.0
>

[PATCH v3 1/2] selinux: Make avc cache slot size configurable during boot
[PATCH v3 2/2] selinux: improve bucket distribution uniformity of avc_hash()