From nobody Sat Oct 4 06:37:16 2025
From: Maarten Lankhorst
To: Lucas De Marchi, 'Thomas Hellström', Rodrigo Vivi, David Airlie,
	Simona Vetter, Maarten Lankhorst, Maxime Ripard, Natalie Vock,
	Tejun Heo, Johannes Weiner, 'Michal Koutný', Michal Hocko,
	Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, 'Liam R. Howlett',
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Thomas Zimmermann
Cc: Michal Hocko, intel-xe@lists.freedesktop.org,
	dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org, linux-mm@kvack.org
Subject: [RFC 1/3] page_counter: Allow for pinning some amount of memory
Date: Tue, 19 Aug 2025 13:49:34 +0200
Message-ID: <20250819114932.597600-6-dev@lankhorst.se>
In-Reply-To: <20250819114932.597600-5-dev@lankhorst.se>
References: <20250819114932.597600-5-dev@lankhorst.se>

Add a 'pinned' member to struct page_counter and use it to implement
pinning accounting.

Memory to be pinned must already be accounted as normal usage, and only
memory up to the 'min' limit is allowed to be pinned. This limit is
chosen because cgroups already guarantee that memory up to that limit
will not be evicted.

Pinned memory affects the min and low calculations, so alter those
slightly.

Signed-off-by: Maarten Lankhorst
---
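Not part of the patch, but as an illustration of the intended calling
convention: a minimal sketch of a consumer of the new interface, assuming a
hypothetical controller that has already charged nr_pages to the counter with
page_counter_try_charge(); the my_*() helpers are made-up names.

	#include <linux/page_counter.h>

	/* Pinning only accounts memory that is already charged. */
	static int my_pin_pages(struct page_counter *counter, unsigned long nr_pages)
	{
		/*
		 * Fails once any counter in the hierarchy would exceed its
		 * 'min' protection; the caller decides how to back off.
		 */
		if (!page_counter_try_pin(counter, nr_pages))
			return -EAGAIN;

		return 0;
	}

	static void my_unpin_pages(struct page_counter *counter, unsigned long nr_pages)
	{
		/* Must balance the earlier page_counter_try_pin() exactly. */
		page_counter_unpin(counter, nr_pages);
	}
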
 include/linux/page_counter.h |  8 +++
 mm/page_counter.c            | 98 +++++++++++++++++++++++++++++++++---
 2 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index d649b6bbbc871..5836c6dfb3c76 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -13,6 +13,7 @@ struct page_counter {
 	 * v2. The memcg->memory.usage is a hot member of struct mem_cgroup.
 	 */
 	atomic_long_t usage;
+	atomic_long_t pinned;
 	unsigned long failcnt; /* v1-only field */
 
 	CACHELINE_PADDING(_pad1_);
@@ -68,11 +69,18 @@ static inline unsigned long page_counter_read(struct page_counter *counter)
 	return atomic_long_read(&counter->usage);
 }
 
+static inline unsigned long page_counter_pinned(struct page_counter *counter)
+{
+	return atomic_long_read(&counter->pinned);
+}
+
 void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
 bool page_counter_try_charge(struct page_counter *counter,
 			     unsigned long nr_pages,
 			     struct page_counter **fail);
+bool page_counter_try_pin(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_unpin(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 661e0f2a5127a..d29d0ed01ec18 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -184,6 +184,82 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
 		page_counter_cancel(c, nr_pages);
 }
 
+static void page_counter_unpin_one(struct page_counter *counter, unsigned long nr_pages)
+{
+	long new;
+
+	new = atomic_long_sub_return(nr_pages, &counter->pinned);
+	/* More unpins than pins? */
+	if (WARN_ONCE(new < 0, "page_counter pinned underflow: %ld nr_pages=%lu\n",
+		      new, nr_pages))
+		atomic_long_set(&counter->pinned, 0);
+}
+
+/**
+ * page_counter_try_pin - try to hierarchically pin pages
+ * @counter: counter
+ * @nr_pages: number of pages to pin
+ *
+ * Returns %true on success, or %false if any counter would go above
+ * the 'min' limit. The failing cgroup is not returned, as pinned memory
+ * cannot be evicted.
+ */
+bool page_counter_try_pin(struct page_counter *counter,
+			  unsigned long nr_pages)
+{
+	struct page_counter *c, *fail;
+	bool track_failcnt = counter->track_failcnt;
+
+	for (c = counter; c; c = c->parent) {
+		long new;
+		/*
+		 * Pin speculatively to avoid an expensive CAS. If
+		 * a bigger charge fails, it might falsely lock out a
+		 * racing smaller charge and send it into reclaim
+		 * early, but the error is limited to the difference
+		 * between the two sizes, which is less than 2M/4M in
+		 * case of a THP locking out a regular page charge.
+		 *
+		 * The atomic_long_add_return() implies a full memory
+		 * barrier between incrementing the count and reading
+		 * the limit. When racing with page_counter_set_max(),
+		 * we either see the new limit or the setter sees the
+		 * counter has changed and retries.
+		 */
+		new = atomic_long_add_return(nr_pages, &c->pinned);
+		if (new > READ_ONCE(c->min)) {
+			atomic_long_sub(nr_pages, &c->pinned);
+			/*
+			 * This is racy, but we can live with some
+			 * inaccuracy in the failcnt which is only used
+			 * to report stats.
+			 */
+			if (track_failcnt)
+				data_race(c->failcnt++);
+			fail = c;
+			goto failed;
+		}
+	}
+	return true;
+
+failed:
+	for (c = counter; c != fail; c = c->parent)
+		page_counter_unpin_one(c, nr_pages);
+
+	return false;
+}
+
+/**
+ * page_counter_unpin - hierarchically unpin pages
+ * @counter: counter
+ * @nr_pages: number of pages to unpin
+ */
+void page_counter_unpin(struct page_counter *counter, unsigned long nr_pages)
+{
+	for (struct page_counter *c = counter; c; c = c->parent)
+		page_counter_unpin_one(c, nr_pages);
+}
+
 /**
  * page_counter_set_max - set the maximum number of pages allowed
  * @counter: counter
@@ -425,7 +501,7 @@ void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
 				       bool recursive_protection)
 {
-	unsigned long usage, parent_usage;
+	unsigned long usage, parent_usage, pinned, min, low;
 	struct page_counter *parent = counter->parent;
 
 	/*
@@ -442,23 +518,31 @@ void page_counter_calculate_protection(struct page_counter *root,
 	if (!usage)
 		return;
 
+	pinned = page_counter_pinned(counter);
+
+	/* For calculation purposes, pinned memory is subtracted from min/low */
+	min = READ_ONCE(counter->min);
+	if (pinned > min)
+		min = 0;
+	low = READ_ONCE(counter->low);
+	if (pinned > low)
+		low = 0;
+
 	if (parent == root) {
-		counter->emin = READ_ONCE(counter->min);
-		counter->elow = READ_ONCE(counter->low);
+		counter->emin = min;
+		counter->elow = low;
 		return;
 	}
 
 	parent_usage = page_counter_read(parent);
 
 	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
-			READ_ONCE(counter->min),
-			READ_ONCE(parent->emin),
+			min, READ_ONCE(parent->emin),
 			atomic_long_read(&parent->children_min_usage),
 			recursive_protection));
 
 	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
-			READ_ONCE(counter->low),
-			READ_ONCE(parent->elow),
+			low, READ_ONCE(parent->elow),
 			atomic_long_read(&parent->children_low_usage),
 			recursive_protection));
}
-- 
2.50.0
From nobody Sat Oct 4 06:37:16 2025
From: Maarten Lankhorst
To: Lucas De Marchi, 'Thomas Hellström', Rodrigo Vivi, David Airlie,
	Simona Vetter, Maarten Lankhorst, Maxime Ripard, Natalie Vock,
	Tejun Heo, Johannes Weiner, 'Michal Koutný', Michal Hocko,
	Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, 'Liam R. Howlett',
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Thomas Zimmermann
Cc: Michal Hocko, intel-xe@lists.freedesktop.org,
	dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org, linux-mm@kvack.org
Subject: [RFC 2/3] cgroup/dmem: Implement pinning device memory
Date: Tue, 19 Aug 2025 13:49:35 +0200
Message-ID: <20250819114932.597600-7-dev@lankhorst.se>
In-Reply-To: <20250819114932.597600-5-dev@lankhorst.se>
References: <20250819114932.597600-5-dev@lankhorst.se>

Add functions to pin and unpin memory, and adjust the calculations in
dmem_cgroup_state_evict_valuable().

Signed-off-by: Maarten Lankhorst
---
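Not part of the patch: a sketch of the intended driver-side ordering, under
the assumption that memory is charged with dmem_cgroup_try_charge() before it
is pinned and unpinned before it is uncharged; my_region and the my_*()
helpers are made-up names.

	#include <linux/cgroup_dmem.h>

	static int my_alloc_pinned(struct dmem_cgroup_region *my_region, u64 size,
				   struct dmem_cgroup_pool_state **ret_pool)
	{
		struct dmem_cgroup_pool_state *pool, *limit_pool = NULL;
		int ret;

		/* Charge first; pinning only accounts already-charged memory. */
		ret = dmem_cgroup_try_charge(my_region, size, &pool, &limit_pool);
		if (ret) {
			/* A real driver would evict from limit_pool and retry. */
			if (limit_pool)
				dmem_cgroup_pool_state_put(limit_pool);
			return ret;
		}

		/* Fails with -EAGAIN once the pool's 'min' protection is exhausted. */
		ret = dmem_cgroup_try_pin(pool, size);
		if (ret) {
			dmem_cgroup_uncharge(pool, size);
			return ret;
		}

		*ret_pool = pool;
		return 0;
	}

	static void my_free_pinned(struct dmem_cgroup_pool_state *pool, u64 size)
	{
		/* Unpin with the same pool and size, then drop the charge. */
		dmem_cgroup_unpin(pool, size);
		dmem_cgroup_uncharge(pool, size);
	}
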
 include/linux/cgroup_dmem.h |  2 ++
 kernel/cgroup/dmem.c        | 57 +++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/include/linux/cgroup_dmem.h b/include/linux/cgroup_dmem.h
index dd4869f1d736e..a981bb692ba22 100644
--- a/include/linux/cgroup_dmem.h
+++ b/include/linux/cgroup_dmem.h
@@ -21,6 +21,8 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
 			   struct dmem_cgroup_pool_state **ret_pool,
 			   struct dmem_cgroup_pool_state **ret_limit_pool);
 void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size);
+int dmem_cgroup_try_pin(struct dmem_cgroup_pool_state *pool, u64 size);
+void dmem_cgroup_unpin(struct dmem_cgroup_pool_state *pool, u64 size);
 bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
 				      struct dmem_cgroup_pool_state *test_pool,
 				      bool ignore_low, bool *ret_hit_low);
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 10b63433f0573..ec8b1ffec78de 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -147,6 +147,11 @@ static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
 	return pool ? page_counter_read(&pool->cnt) : 0;
 }
 
+static u64 get_resource_pinned(struct dmem_cgroup_pool_state *pool)
+{
+	return pool ? page_counter_pinned(&pool->cnt) : 0;
+}
+
 static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
 {
 	set_resource_min(rpool, 0);
@@ -270,7 +275,7 @@ bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
 {
 	struct dmem_cgroup_pool_state *pool = test_pool;
 	struct page_counter *ctest;
-	u64 used, min, low;
+	u64 used, min, low, pinned;
 
 	/* Can always evict from current pool, despite limits */
 	if (limit_pool == test_pool)
@@ -296,16 +301,18 @@ bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
 
 	ctest = &test_pool->cnt;
 
+	/* Protection is calculated without pinned memory */
 	dmem_cgroup_calculate_protection(limit_pool, test_pool);
 
 	used = page_counter_read(ctest);
-	min = READ_ONCE(ctest->emin);
+	pinned = page_counter_pinned(ctest);
+	min = READ_ONCE(ctest->emin) + pinned;
 
 	if (used <= min)
 		return false;
 
 	if (!ignore_low) {
-		low = READ_ONCE(ctest->elow);
+		low = READ_ONCE(ctest->elow) + pinned;
 		if (used > low)
 			return true;
 
@@ -641,6 +648,41 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
 }
 EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
 
+/**
+ * dmem_cgroup_unpin() - Unpin from a pool.
+ * @pool: Pool to unpin.
+ * @size: Size to unpin.
+ *
+ * Undoes the effects of dmem_cgroup_try_pin().
+ * Must be called with the same @pool and @size as the
+ * matching dmem_cgroup_try_pin() call.
+ */
+void dmem_cgroup_unpin(struct dmem_cgroup_pool_state *pool, u64 size)
+{
+	if (pool)
+		page_counter_unpin(&pool->cnt, size);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_unpin);
+
+/**
+ * dmem_cgroup_try_pin() - Try pinning an existing allocation to a region.
+ * @pool: dmem region to pin
+ * @size: Size (in bytes) to pin.
+ *
+ * This function pins @size bytes of already-charged memory in @pool.
+ *
+ * If the function succeeds, the memory is successfully accounted as being pinned.
+ * The memory may not be uncharged before unpin is called.
+ *
+ * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
+ */
+int dmem_cgroup_try_pin(struct dmem_cgroup_pool_state *pool, u64 size)
+{
+	return page_counter_try_pin(&pool->cnt, size) ? 0 : -EAGAIN;
+
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_try_pin);
+
 static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
 {
 	struct dmem_cgroup_region *region;
@@ -756,6 +798,11 @@ static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
 	return dmemcg_limit_show(sf, v, get_resource_current);
 }
 
+static int dmem_cgroup_region_pinned_show(struct seq_file *sf, void *v)
+{
+	return dmemcg_limit_show(sf, v, get_resource_pinned);
+}
+
 static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
 {
 	return dmemcg_limit_show(sf, v, get_resource_min);
@@ -799,6 +846,10 @@ static struct cftype files[] = {
 		.name = "current",
 		.seq_show = dmem_cgroup_region_current_show,
 	},
+	{
+		.name = "pinned",
+		.seq_show = dmem_cgroup_region_pinned_show,
+	},
 	{
 		.name = "min",
 		.write = dmem_cgroup_region_min_write,
-- 
2.50.0
From nobody Sat Oct 4 06:37:16 2025
From: Maarten Lankhorst
To: Lucas De Marchi, 'Thomas Hellström', Rodrigo Vivi, David Airlie,
	Simona Vetter, Maarten Lankhorst, Maxime Ripard, Natalie Vock,
	Tejun Heo, Johannes Weiner, 'Michal Koutný', Michal Hocko,
	Roman Gushchin, Shakeel Butt, Muchun Song, Andrew Morton,
	David Hildenbrand, Lorenzo Stoakes, 'Liam R. Howlett',
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Thomas Zimmermann
Cc: Michal Hocko, intel-xe@lists.freedesktop.org,
	dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org, linux-mm@kvack.org
Subject: [RFC 3/3] drm/xe: Add DRM_XE_GEM_CREATE_FLAG_PINNED flag and implementation
Date: Tue, 19 Aug 2025 13:49:36 +0200
Message-ID: <20250819114932.597600-8-dev@lankhorst.se>
In-Reply-To: <20250819114932.597600-5-dev@lankhorst.se>
References: <20250819114932.597600-5-dev@lankhorst.se>

Add an option to pin memory through cgroup accounting.

A BO created with this flag is pinned for its entire lifetime. This
allows buffers to be pinned for dma-buf export without requiring the
pinning to be done at the dma-buf layer for all devices.

For now only VRAM pinning is implemented. Dave Airlie has a series to
implement memcg accounting for the GPU, but it is not ready yet.

Signed-off-by: Maarten Lankhorst
---
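Not part of the patch: a rough userspace sketch of creating a pinned VRAM BO
with the new flag, assuming libdrm's drmIoctl() and a single VRAM placement
bit obtained from the usual memory-region query; fd and vram_placement_bit
are placeholders.

	#include <errno.h>
	#include <string.h>
	#include <xf86drm.h>
	#include <drm/xe_drm.h>

	static int create_pinned_vram_bo(int fd, __u64 size,
					 __u32 vram_placement_bit, __u32 *handle)
	{
		struct drm_xe_gem_create create;

		memset(&create, 0, sizeof(create));
		create.size = size;
		create.placement = vram_placement_bit;	/* exactly one placement */
		create.vm_id = 0;			/* no VM-private BO allowed */
		create.cpu_caching = DRM_XE_GEM_CPU_CACHING_WC;
		create.flags = DRM_XE_GEM_CREATE_FLAG_PINNED;

		if (drmIoctl(fd, DRM_IOCTL_XE_GEM_CREATE, &create))
			return -errno;	/* e.g. -ENOSPC when the dmem 'min' limit is exhausted */

		*handle = create.handle;
		return 0;
	}
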
 drivers/gpu/drm/xe/xe_bo.c      | 66 ++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_dma_buf.c | 10 ++++-
 include/uapi/drm/xe_drm.h       | 10 ++++-
 3 files changed, 82 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 6fea39842e1e6..4095e6bd04ea9 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -5,6 +5,7 @@
 
 #include "xe_bo.h"
 
+#include
 #include
 #include
 
@@ -208,7 +209,8 @@ static bool force_contiguous(u32 bo_flags)
 	 * must be contiguous, also only contiguous BOs support xe_bo_vmap.
 	 */
 	return bo_flags & XE_BO_FLAG_NEEDS_CPU_ACCESS &&
-	       bo_flags & XE_BO_FLAG_PINNED;
+	       bo_flags & XE_BO_FLAG_PINNED &&
+	       !(bo_flags & XE_BO_FLAG_USER);
 }
 
 static void add_vram(struct xe_device *xe, struct xe_bo *bo,
@@ -1697,6 +1699,16 @@ static void xe_gem_object_free(struct drm_gem_object *obj)
 	ttm_bo_put(container_of(obj, struct ttm_buffer_object, base));
 }
 
+static void xe_bo_unpin_user(struct xe_bo *bo)
+{
+	xe_bo_unpin_external(bo);
+
+	if (bo->flags & XE_BO_FLAG_SYSTEM)
+		WARN_ON(1);
+	else
+		dmem_cgroup_unpin(bo->ttm.resource->css, xe_bo_size(bo));
+}
+
 static void xe_gem_object_close(struct drm_gem_object *obj,
 				struct drm_file *file_priv)
 {
@@ -1708,6 +1720,10 @@ static void xe_gem_object_close(struct drm_gem_object *obj,
 		xe_bo_lock(bo, false);
 		ttm_bo_set_bulk_move(&bo->ttm, NULL);
 		xe_bo_unlock(bo);
+	} else if (bo->flags & XE_BO_FLAG_PINNED) {
+		xe_bo_lock(bo, false);
+		xe_bo_unpin_user(bo);
+		xe_bo_unlock(bo);
 	}
 }
 
@@ -2128,8 +2144,27 @@ struct xe_bo *xe_bo_create_user(struct xe_device *xe, struct xe_tile *tile,
 	struct xe_bo *bo = __xe_bo_create_locked(xe, tile, vm, size, 0, ~0ULL,
 						 cpu_caching, ttm_bo_type_device,
 						 flags | XE_BO_FLAG_USER, 0);
-	if (!IS_ERR(bo))
+	if (!IS_ERR(bo)) {
+		int ret = 0;
+
+		if (bo->flags & XE_BO_FLAG_PINNED) {
+			if (bo->flags & XE_BO_FLAG_SYSTEM) {
+				ret = -ENOSYS; // TODO
+			} else {
+				ret = dmem_cgroup_try_pin(bo->ttm.resource->css, size);
+			}
+			if (!ret)
+				ret = xe_bo_pin_external(bo);
+			else if (ret == -EAGAIN)
+				ret = -ENOSPC;
+		}
+
 		xe_bo_unlock_vm_held(bo);
+		if (ret) {
+			xe_bo_put(bo);
+			return ERR_PTR(ret);
+		}
+	}
 
 	return bo;
 }
@@ -2745,6 +2780,28 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
 			 args->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB))
 		return -EINVAL;
 
+	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_GEM_CREATE_FLAG_PINNED)) {
+		bool pinned_flag = true;
+		/* Only allow a single placement for pinning */
+		if (XE_IOCTL_DBG(xe, pinned_flag && hweight32(args->placement) != 1))
+			return -EINVAL;
+
+		/* Meant for exporting, do not allow a VM-local BO */
+		if (XE_IOCTL_DBG(xe, pinned_flag && args->vm_id))
+			return -EINVAL;
+
+		/* Similarly, force fail at creation time for now. We may relax this requirement later */
+		if (XE_IOCTL_DBG(xe, pinned_flag && args->flags & DRM_XE_GEM_CREATE_FLAG_DEFER_BACKING))
+			return -EINVAL;
+
+		/* Require the appropriate cgroups to be enabled. */
+		if (XE_IOCTL_DBG(xe, pinned_flag && !IS_ENABLED(CONFIG_CGROUP_DMEM) && bo_flags & XE_BO_FLAG_VRAM_MASK) ||
+		    XE_IOCTL_DBG(xe, pinned_flag && !IS_ENABLED(CONFIG_MEMCG) && bo_flags & XE_BO_FLAG_SYSTEM))
+			return -EINVAL;
+
+		bo_flags |= XE_BO_FLAG_PINNED;
+	}
+
 	if (args->vm_id) {
 		vm = xe_vm_lookup(xef, args->vm_id);
 		if (XE_IOCTL_DBG(xe, !vm))
@@ -2790,6 +2847,11 @@ int xe_gem_create_ioctl(struct drm_device *dev, void *data,
 		__xe_bo_unset_bulk_move(bo);
 		xe_vm_unlock(vm);
 	}
+	if (bo->flags & XE_BO_FLAG_PINNED) {
+		xe_bo_lock(bo, false);
+		xe_bo_unpin_user(bo);
+		xe_bo_unlock(bo);
+	}
 out_put:
 	xe_bo_put(bo);
 out_vm:
diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c
index 346f857f38374..6719f4552ad37 100644
--- a/drivers/gpu/drm/xe/xe_dma_buf.c
+++ b/drivers/gpu/drm/xe/xe_dma_buf.c
@@ -53,6 +53,11 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach)
 	struct xe_device *xe = xe_bo_device(bo);
 	int ret;
 
+	if (bo->flags & XE_BO_FLAG_PINNED) {
+		ttm_bo_pin(&bo->ttm);
+		return 0;
+	}
+
 	/*
 	 * For now only support pinning in TT memory, for two reasons:
 	 * 1) Avoid pinning in a placement not accessible to some importers.
@@ -83,7 +88,10 @@ static void xe_dma_buf_unpin(struct dma_buf_attachment *attach)
 	struct drm_gem_object *obj = attach->dmabuf->priv;
 	struct xe_bo *bo = gem_to_xe_bo(obj);
 
-	xe_bo_unpin_external(bo);
+	if (bo->flags & XE_BO_FLAG_PINNED)
+		ttm_bo_unpin(&bo->ttm);
+	else
+		xe_bo_unpin_external(bo);
 }
 
 static struct sg_table *xe_dma_buf_map(struct dma_buf_attachment *attach,
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index c721e130c1d2d..3184fa38ce17e 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -765,12 +765,15 @@ struct drm_xe_device_query {
  *   until the object is either bound to a virtual memory region via
  *   VM_BIND or accessed by the CPU. As a result, no backing memory is
  *   reserved at the time of GEM object creation.
- * - %DRM_XE_GEM_CREATE_FLAG_SCANOUT
+ * - %DRM_XE_GEM_CREATE_FLAG_SCANOUT - GEM object will be used as a
+ *   display framebuffer.
  * - %DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM - When using VRAM as a
  *   possible placement, ensure that the corresponding VRAM allocation
  *   will always use the CPU accessible part of VRAM. This is important
  *   for small-bar systems (on full-bar systems this gets turned into a
  *   noop).
+ * - %DRM_XE_GEM_CREATE_FLAG_PINNED - Pin the backing memory permanently
+ *   on allocation, if within cgroup limits.
  *   Note1: System memory can be used as an extra placement if the kernel
  *   should spill the allocation to system memory, if space can't be made
  *   available in the CPU accessible part of VRAM (giving the same
@@ -781,6 +784,10 @@ struct drm_xe_device_query {
  *   need to use VRAM for display surfaces, therefore the kernel requires
  *   setting this flag for such objects, otherwise an error is thrown on
  *   small-bar systems.
+ *   Note3: %DRM_XE_GEM_CREATE_FLAG_PINNED requires the BO to have a
+ *   single placement and no vm_id, requires (device) memory cgroups to
+ *   be enabled, and is incompatible with the %DEFER_BACKING and
+ *   %NEEDS_VISIBLE_VRAM flags.
  *
  * @cpu_caching supports the following values:
  * - %DRM_XE_GEM_CPU_CACHING_WB - Allocate the pages with write-back
@@ -827,6 +834,7 @@ struct drm_xe_gem_create {
 #define DRM_XE_GEM_CREATE_FLAG_DEFER_BACKING		(1 << 0)
 #define DRM_XE_GEM_CREATE_FLAG_SCANOUT			(1 << 1)
 #define DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM	(1 << 2)
+#define DRM_XE_GEM_CREATE_FLAG_PINNED			(1 << 3)
 	/**
 	 * @flags: Flags, currently a mask of memory instances of where BO can
 	 * be placed
-- 
2.50.0