From nobody Thu Apr  2 20:28:03 2026
Received: from out-177.mta0.migadu.com (out-177.mta0.migadu.com
 [91.218.175.177])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 134B21F5842
	for <linux-kernel@vger.kernel.org>; Wed, 25 Feb 2026 12:15:21 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=91.218.175.177
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1772021723; cv=none;
 b=IkUsERflJ4Vj/4OZmhrLbe6DKfh9ZyzhGY4KKus/r9vYiYA5t/A2UTLFZkMHU+DvvGEXUrR9aABsphFtA2rVJF1y7BwqLjVqZKMf9FyVfZXaU+wjuPntz5yuaLgZa0QKDu+xcxngeaoY1fJCam8xpDDbERwaQowmXkrif3VTurI=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1772021723; c=relaxed/simple;
	bh=SoIASlQHQkl2l2grY9dFi8gICeiVCKREbAsjLqwKoCw=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=AYDRArKwJWZminGV3/SbzvR0QDeCoTJhG8A6tjhSTPd6L2q3LZPoSkmJUKxLD9BpWxnVwqAIxXf3uCmjY2i4NDbYCusORRvS/L0ckbbjWsOJynvlFFXXMhM61hHOne+VWHYw7gAEKTKK6v7/F7Pscq1jWcf2EyolpcsRLHp8sRY=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.dev;
 spf=pass smtp.mailfrom=linux.dev;
 dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev
 header.b=m3eC2bCW; arc=none smtp.client-ip=91.218.175.177
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.dev
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=linux.dev
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev
 header.b="m3eC2bCW"
X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and
 include these headers.
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1;
	t=1772021719;
	h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
	 to:to:cc:cc:mime-version:mime-version:
	 content-transfer-encoding:content-transfer-encoding:
	 in-reply-to:in-reply-to:references:references;
	bh=CSBZzSEqUJvM4KWzWVEwXGx1jyBYabVLcqc6HiuSlLA=;
	b=m3eC2bCWO6AO1U3jzWKMT3OOQzW83GzyuMmktszlvw+E43I1R/1KIGM73H9DC2K02Si/jE
	muAL3KY0eeTOvMKYX1V6z10ZmqjOdV96yjD9lpR4Aq8YVXiZuS5f9jZnHd0XNPzn3umM3x
	XV2GaBSkQhnjDadJFjqp9Viq+fAss5E=
From: Jiayuan Chen <jiayuan.chen@linux.dev>
To: bpf@vger.kernel.org
Cc: jiayuan.chen@linux.dev,
	Jiayuan Chen <jiayuan.chen@shopee.com>,
	syzbot+2b3391f44313b3983e91@syzkaller.appspotmail.com,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"David S. Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>,
	Jesper Dangaard Brouer <hawk@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	Stanislav Fomichev <sdf@fomichev.me>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Eduard Zingerman <eddyz87@gmail.com>,
	Song Liu <song@kernel.org>,
	Yonghong Song <yonghong.song@linux.dev>,
	KP Singh <kpsingh@kernel.org>,
	Hao Luo <haoluo@google.com>,
	Jiri Olsa <jolsa@kernel.org>,
	Clark Williams <clrkwllms@kernel.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Thomas Gleixner <tglx@kernel.org>,
	netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-rt-devel@lists.linux.dev
Subject: [PATCH bpf v4 1/2] bpf: cpumap: fix race in bq_flush_to_queue on
 PREEMPT_RT
Date: Wed, 25 Feb 2026 20:14:55 +0800
Message-ID: <20260225121459.183121-2-jiayuan.chen@linux.dev>
In-Reply-To: <20260225121459.183121-1-jiayuan.chen@linux.dev>
References: <20260225121459.183121-1-jiayuan.chen@linux.dev>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Migadu-Flow: FLOW_OUT
Content-Type: text/plain; charset="utf-8"

From: Jiayuan Chen <jiayuan.chen@shopee.com>

On PREEMPT_RT kernels, the per-CPU xdp_bulk_queue (bq) can be accessed
concurrently by multiple preemptible tasks on the same CPU.

The original code assumes bq_enqueue() and __cpu_map_flush() run
atomically with respect to each other on the same CPU, relying on
local_bh_disable() to prevent preemption. However, on PREEMPT_RT,
local_bh_disable() only calls migrate_disable() (when
PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable
preemption, which allows CFS scheduling to preempt a task during
bq_flush_to_queue(), enabling another task on the same CPU to enter
bq_enqueue() and operate on the same per-CPU bq concurrently.

This leads to several races:

1. Double __list_del_clearprev(): after bq->count is reset in
   bq_flush_to_queue(), a preempting task can call bq_enqueue() ->
   bq_flush_to_queue() on the same bq when bq->count reaches
   CPU_MAP_BULK_SIZE. Both tasks then call __list_del_clearprev()
   on the same bq->flush_node; the second call dereferences the
   prev pointer that was already set to NULL by the first.

2. bq->count and bq->q[] races: concurrent bq_enqueue() can corrupt
   the packet queue while bq_flush_to_queue() is processing it.

The race between task A (__cpu_map_flush -> bq_flush_to_queue) and
task B (bq_enqueue -> bq_flush_to_queue) on the same CPU:

  Task A (xdp_do_flush)          Task B (cpu_map_enqueue)
  ----------------------         ------------------------
  bq_flush_to_queue(bq)
    spin_lock(&q->producer_lock)
    /* flush bq->q[] to ptr_ring */
    bq->count =3D 0
    spin_unlock(&q->producer_lock)
    <-- CFS preempts Task A -->
                                   bq_enqueue(rcpu, xdpf)
                                     bq->q[bq->count++] =3D xdpf
                                     /* ... more enqueues until full ... */
                                     bq_flush_to_queue(bq)
                                       spin_lock(&q->producer_lock)
                                       /* flush to ptr_ring */
                                       spin_unlock(&q->producer_lock)
                                       __list_del_clearprev(flush_node)
                                         /* sets flush_node.prev =3D NULL */
    <-- Task A resumes -->
    __list_del_clearprev(flush_node)
      flush_node.prev->next =3D ...
      /* prev is NULL -> kernel oops */

Fix this by adding a local_lock_t to xdp_bulk_queue and acquiring it
in bq_enqueue() and __cpu_map_flush(). These paths already run under
local_bh_disable(), so use local_lock_nested_bh() which on non-RT is
a pure annotation with no overhead, and on PREEMPT_RT provides a
per-CPU sleeping lock that serializes access to the bq.

To reproduce, insert an mdelay(100) between bq->count =3D 0 and
__list_del_clearprev() in bq_flush_to_queue(), then run the reproducer
provided by syzkaller.

Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMP=
T_RT")
Reported-by: syzbot+2b3391f44313b3983e91@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/69369331.a70a0220.38f243.009d.GAE@googl=
e.com/T/
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
 kernel/bpf/cpumap.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 04171fbc39cb..32b43cb9061b 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
+#include <linux/local_lock.h>
 #include <linux/completion.h>
 #include <trace/events/xdp.h>
 #include <linux/btf_ids.h>
@@ -52,6 +53,7 @@ struct xdp_bulk_queue {
 	struct list_head flush_node;
 	struct bpf_cpu_map_entry *obj;
 	unsigned int count;
+	local_lock_t bq_lock;
 };
=20
 /* Struct for every remote "destination" CPU in map */
@@ -451,6 +453,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_c=
pumap_val *value,
 	for_each_possible_cpu(i) {
 		bq =3D per_cpu_ptr(rcpu->bulkq, i);
 		bq->obj =3D rcpu;
+		local_lock_init(&bq->bq_lock);
 	}
=20
 	/* Alloc queue */
@@ -722,6 +725,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
 	struct ptr_ring *q;
 	int i;
=20
+	lockdep_assert_held(&bq->bq_lock);
+
 	if (unlikely(!bq->count))
 		return;
=20
@@ -749,11 +754,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *=
bq)
 }
=20
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+ * Thus, safe percpu variable access. PREEMPT_RT relies on
+ * local_lock_nested_bh() to serialise access to the per-CPU bq.
  */
 static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *x=
dpf)
 {
-	struct xdp_bulk_queue *bq =3D this_cpu_ptr(rcpu->bulkq);
+	struct xdp_bulk_queue *bq;
+
+	local_lock_nested_bh(&rcpu->bulkq->bq_lock);
+	bq =3D this_cpu_ptr(rcpu->bulkq);
=20
 	if (unlikely(bq->count =3D=3D CPU_MAP_BULK_SIZE))
 		bq_flush_to_queue(bq);
@@ -774,6 +783,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, =
struct xdp_frame *xdpf)
=20
 		list_add(&bq->flush_node, flush_list);
 	}
+
+	local_unlock_nested_bh(&rcpu->bulkq->bq_lock);
 }
=20
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -810,7 +821,9 @@ void __cpu_map_flush(struct list_head *flush_list)
 	struct xdp_bulk_queue *bq, *tmp;
=20
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		local_lock_nested_bh(&bq->obj->bulkq->bq_lock);
 		bq_flush_to_queue(bq);
+		local_unlock_nested_bh(&bq->obj->bulkq->bq_lock);
=20
 		/* If already running, costs spin_lock_irqsave + smb_mb */
 		wake_up_process(bq->obj->kthread);
--=20
2.43.0
From nobody Thu Apr  2 20:28:03 2026
Received: from out-173.mta0.migadu.com (out-173.mta0.migadu.com
 [91.218.175.173])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5D7D43AE6FC
	for <linux-kernel@vger.kernel.org>; Wed, 25 Feb 2026 12:15:30 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org;
 arc=none smtp.client-ip=91.218.175.173
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1772021738; cv=none;
 b=CMFC/rnQBPJu3HZC6dmiPgRLqmAZ/wLKnH3Fcnpehay4SAkGZRrc8hXWu7Hn6f1DalRYsY/5uJBc2aWJXvpyju0/H24D5rOv40gbqmH+MRgZoHV3OIYkfrauedg3wd8IiToDoBWYNxoxFEtXLWp73pQNUvk7pNZWVwZjKYZHFjI=
ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1772021738; c=relaxed/simple;
	bh=g7RtNiAx6OTRSX929EyC0r/iXrDYL5Y7WW6UrVu4DCU=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version;
 b=N/93IJlqtcC0rRMv8PQgt93sinK0szTTY+1+5Vy75IM+FNj9W38PsOQTX0a6RNBf0ZEIb6Ve3s22GchsQiDcjhF4Zn3/CqahRYgNU+gssQmijmBD2JKpuuQR/q71cXt3daXoUQ0x0EzPubFljHsCmYDwLoqlzE1duO218iGIhoo=
ARC-Authentication-Results: i=1; smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.dev;
 spf=pass smtp.mailfrom=linux.dev;
 dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev
 header.b=ZpUpXtdD; arc=none smtp.client-ip=91.218.175.173
Authentication-Results: smtp.subspace.kernel.org;
 dmarc=pass (p=none dis=none) header.from=linux.dev
Authentication-Results: smtp.subspace.kernel.org;
 spf=pass smtp.mailfrom=linux.dev
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (1024-bit key) header.d=linux.dev header.i=@linux.dev
 header.b="ZpUpXtdD"
X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and
 include these headers.
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1;
	t=1772021728;
	h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
	 to:to:cc:cc:mime-version:mime-version:
	 content-transfer-encoding:content-transfer-encoding:
	 in-reply-to:in-reply-to:references:references;
	bh=gSfPXHAV066qAmlgrbJqoJU7Uy3WFkTtHw/fiXihJJg=;
	b=ZpUpXtdD1uTGYL1toz6g5W+8P7vgAd8fXc9mFjK1zXzkP/am2EJWQxjMnpm6HhMnPdSEaQ
	MAo3u4YCg6Is3ybEREujvEFsk0+gneS3n7zZ5PxVR13rBXoynOZpjVthmvjSZQuC6im/Dg
	MSAF21LDjnwmruw4kb5M77VonflVuqs=
From: Jiayuan Chen <jiayuan.chen@linux.dev>
To: bpf@vger.kernel.org
Cc: jiayuan.chen@linux.dev,
	Jiayuan Chen <jiayuan.chen@shopee.com>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"David S. Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>,
	Jesper Dangaard Brouer <hawk@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	Stanislav Fomichev <sdf@fomichev.me>,
	Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	Eduard Zingerman <eddyz87@gmail.com>,
	Song Liu <song@kernel.org>,
	Yonghong Song <yonghong.song@linux.dev>,
	KP Singh <kpsingh@kernel.org>,
	Hao Luo <haoluo@google.com>,
	Jiri Olsa <jolsa@kernel.org>,
	Clark Williams <clrkwllms@kernel.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Thomas Gleixner <tglx@kernel.org>,
	netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-rt-devel@lists.linux.dev
Subject: [PATCH bpf v4 2/2] bpf: devmap: fix race in bq_xmit_all on PREEMPT_RT
Date: Wed, 25 Feb 2026 20:14:56 +0800
Message-ID: <20260225121459.183121-3-jiayuan.chen@linux.dev>
In-Reply-To: <20260225121459.183121-1-jiayuan.chen@linux.dev>
References: <20260225121459.183121-1-jiayuan.chen@linux.dev>
Precedence: bulk
X-Mailing-List: linux-kernel@vger.kernel.org
List-Id: <linux-kernel.vger.kernel.org>
List-Subscribe: <mailto:linux-kernel+subscribe@vger.kernel.org>
List-Unsubscribe: <mailto:linux-kernel+unsubscribe@vger.kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Migadu-Flow: FLOW_OUT
Content-Type: text/plain; charset="utf-8"

From: Jiayuan Chen <jiayuan.chen@shopee.com>

On PREEMPT_RT kernels, the per-CPU xdp_dev_bulk_queue (bq) can be
accessed concurrently by multiple preemptible tasks on the same CPU.

The original code assumes bq_enqueue() and __dev_flush() run atomically
with respect to each other on the same CPU, relying on
local_bh_disable() to prevent preemption. However, on PREEMPT_RT,
local_bh_disable() only calls migrate_disable() (when
PREEMPT_RT_NEEDS_BH_LOCK is not set) and does not disable
preemption, which allows CFS scheduling to preempt a task during
bq_xmit_all(), enabling another task on the same CPU to enter
bq_enqueue() and operate on the same per-CPU bq concurrently.

This leads to several races:

1. Double-free / use-after-free on bq->q[]: bq_xmit_all() snapshots
   cnt =3D bq->count, then iterates bq->q[0..cnt-1] to transmit frames.
   If preempted after the snapshot, a second task can call bq_enqueue()
   -> bq_xmit_all() on the same bq, transmitting (and freeing) the
   same frames. When the first task resumes, it operates on stale
   pointers in bq->q[], causing use-after-free.

2. bq->count and bq->q[] corruption: concurrent bq_enqueue() modifying
   bq->count and bq->q[] while bq_xmit_all() is reading them.

3. dev_rx/xdp_prog teardown race: __dev_flush() clears bq->dev_rx and
   bq->xdp_prog after bq_xmit_all(). If preempted between
   bq_xmit_all() return and bq->dev_rx =3D NULL, a preempting
   bq_enqueue() sees dev_rx still set (non-NULL), skips adding bq to
   the flush_list, and enqueues a frame. When __dev_flush() resumes,
   it clears dev_rx and removes bq from the flush_list, orphaning the
   newly enqueued frame.

4. __list_del_clearprev() on flush_node: similar to the cpumap race,
   both tasks can call __list_del_clearprev() on the same flush_node;
   the second dereferences the prev pointer already set to NULL.

The race between task A (__dev_flush -> bq_xmit_all) and task B
(bq_enqueue -> bq_xmit_all) on the same CPU:

  Task A (xdp_do_flush)          Task B (ndo_xdp_xmit redirect)
  ----------------------         --------------------------------
  __dev_flush(flush_list)
    bq_xmit_all(bq, XDP_XMIT_FLUSH)
      cnt =3D bq->count  /* e.g. 16 */
      /* start iterating bq->q[] */
    <-- CFS preempts Task A -->
                                   bq_enqueue(dev, xdpf)
                                     bq->count =3D=3D DEV_MAP_BULK_SIZE
                                     bq_xmit_all(bq, 0)
                                       cnt =3D bq->count  /* same 16! */
                                       ndo_xdp_xmit(bq->q[])
                                       /* frames freed by driver */
                                       bq->count =3D 0
    <-- Task A resumes -->
      ndo_xdp_xmit(bq->q[])
      /* use-after-free: frames already freed! */

Fix this by adding a local_lock_t to xdp_dev_bulk_queue and acquiring
it in bq_enqueue() and __dev_flush(). These paths already run under
local_bh_disable(), so use local_lock_nested_bh() which on non-RT is
a pure annotation with no overhead, and on PREEMPT_RT provides a
per-CPU sleeping lock that serializes access to the bq.

Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMP=
T_RT")
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
 kernel/bpf/devmap.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2625601de76e..10cf0731f91d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -45,6 +45,7 @@
  * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
+#include <linux/local_lock.h>
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
@@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue {
 	struct net_device *dev_rx;
 	struct bpf_prog *xdp_prog;
 	unsigned int count;
+	local_lock_t bq_lock;
 };
=20
 struct bpf_dtab_netdev {
@@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, =
u32 flags)
 	int to_send =3D cnt;
 	int i;
=20
+	lockdep_assert_held(&bq->bq_lock);
+
 	if (unlikely(!cnt))
 		return;
=20
@@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list)
 	struct xdp_dev_bulk_queue *bq, *tmp;
=20
 	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+		local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
 		bq_xmit_all(bq, XDP_XMIT_FLUSH);
 		bq->dev_rx =3D NULL;
 		bq->xdp_prog =3D NULL;
 		__list_del_clearprev(&bq->flush_node);
+		local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock);
 	}
 }
=20
@@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *ma=
p, u32 key)
=20
 /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
  * variable access, and map elements stick around. See comment above
- * xdp_do_flush() in filter.c.
+ * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh()
+ * to serialise access to the per-CPU bq.
  */
 static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
 		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
 {
-	struct xdp_dev_bulk_queue *bq =3D this_cpu_ptr(dev->xdp_bulkq);
+	struct xdp_dev_bulk_queue *bq;
+
+	local_lock_nested_bh(&dev->xdp_bulkq->bq_lock);
+	bq =3D this_cpu_ptr(dev->xdp_bulkq);
=20
 	if (unlikely(bq->count =3D=3D DEV_MAP_BULK_SIZE))
 		bq_xmit_all(bq, 0);
@@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct x=
dp_frame *xdpf,
 	}
=20
 	bq->q[bq->count++] =3D xdpf;
+
+	local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock);
 }
=20
 static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *=
xdpf,
@@ -1115,8 +1127,13 @@ static int dev_map_notification(struct notifier_bloc=
k *notifier,
 		if (!netdev->xdp_bulkq)
 			return NOTIFY_BAD;
=20
-		for_each_possible_cpu(cpu)
-			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev =3D netdev;
+		for_each_possible_cpu(cpu) {
+			struct xdp_dev_bulk_queue *bq;
+
+			bq =3D per_cpu_ptr(netdev->xdp_bulkq, cpu);
+			bq->dev =3D netdev;
+			local_lock_init(&bq->bq_lock);
+		}
 		break;
 	case NETDEV_UNREGISTER:
 		/* This rcu_read_lock/unlock pair is needed because
--=20
2.43.0