From nobody Tue Jun 16 06:30:00 2026 Received: from mailgw.kylinos.cn (mailgw.kylinos.cn [124.126.103.232]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A63EA2DECDF; Fri, 17 Apr 2026 03:38:40 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=124.126.103.232 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1776397122; cv=none; b=MHDwKRGwu22OJGsg00REkWD8OQCyDmjrMFnKhSXG+8RQM2nNT+V5edjCD164JpAbeq2b1OdfDrnSWpXKdKNEKVS2KtBdZh4BxaaFuUlnl/etiENtu+q4993NX3ICqH8QqEEemW2oTeBlePVQv++g2Mswj5EL0KdHZ9ZuJAxIGYE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1776397122; c=relaxed/simple; bh=PKpIjj53GQeEkxG8kyR4DFiTfEL3Lj5E1Ag3FIbhFSQ=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=iP9+nL1kj+Y2l4tAM6BZnkm8jZ2A/tQH9aOsk8MAR2RzIgLPsL1zC5vWAneQTQu9X22tAONxr19Z3y7GX6Sesy7E5vSh3MtRXJXK3jqYl/s+7Ifq49RdTcNHBDsEJjmgDuMUeYexzss18x5i+mH6jdn1fpqYxvc56phYf2/Vs6E= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=kylinos.cn; spf=pass smtp.mailfrom=kylinos.cn; arc=none smtp.client-ip=124.126.103.232 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=kylinos.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=kylinos.cn X-UUID: ea1b2c103a0e11f1aa26b74ffac11d73-20260417 X-CTIC-Tags: HR_CC_COUNT, HR_CC_DOMAIN_COUNT, HR_CC_NAME, HR_CC_NO_NAME, HR_CTE_8B HR_CTT_MISS, HR_DATE_H, HR_DATE_WKD, HR_DATE_ZONE, HR_FROM_NAME HR_SJ_DIGIT_LEN, HR_SJ_LANG, HR_SJ_LEN, HR_SJ_LETTER, HR_SJ_NOR_SYM HR_SJ_PHRASE, HR_SJ_PHRASE_LEN, HR_SJ_WS, HR_TO_COUNT, HR_TO_DOMAIN_COUNT HR_TO_NO_NAME, IP_TRUSTED, SRC_TRUSTED, DN_TRUSTED, SA_TRUSTED SA_EXISTED, SN_TRUSTED, SN_EXISTED, SPF_NOPASS, DKIM_NOPASS DMARC_NOPASS, CIE_BAD, CIE_GOOD, CIE_GOOD_SPF, GTI_FG_BS GTI_RG_INFO, GTI_C_BU, 
AMN_GOOD, ABX_MISS_RDNS X-CID-P-RULE: Release_Ham X-CID-O-INFO: VERSION:1.3.12,REQID:eb60d489-14b4-43b8-94a4-d28e5699a37a,IP:10, URL:0,TC:0,Content:-25,EDM:-25,RT:0,SF:-5,FILE:0,BULK:0,RULE:Release_Ham,A CTION:release,TS:-45 X-CID-INFO: VERSION:1.3.12,REQID:eb60d489-14b4-43b8-94a4-d28e5699a37a,IP:10,UR L:0,TC:0,Content:-25,EDM:-25,RT:0,SF:-5,FILE:0,BULK:0,RULE:NOTI_GNA5D1EA,A CTION:release,TS:-45 X-CID-META: VersionHash:e7bac3a,CLOUDID:a25a31f5f62182f223ab176ae21435a2,BulkI D:26041711383601XWS6LU,BulkQuantity:0,Recheck:0,SF:17|19|38|66|78|81|82|10 2|127|898,TC:nil,Content:0|15|50,EDM:2,IP:-2,URL:0,File:nil,RT:nil,Bulk:ni l,QS:nil,BEC:nil,COL:0,OSI:0,OSA:0,AV:0,LES:1,SPR:NO,DKR:0,DKP:0,BRR:0,BRE :0,ARC:0 X-CID-BVR: 2,SSN|SDN X-CID-BAS: 2,SSN|SDN,0,_ X-CID-FACTOR: TF_CID_SPAM_SNR,TF_CID_SPAM_FAS,TF_CID_SPAM_FSD X-CID-RHF: D41D8CD98F00B204E9800998ECF8427E X-UUID: ea1b2c103a0e11f1aa26b74ffac11d73-20260417 X-User: zhangguopeng@kylinos.cn Received: from yan.. [(183.242.174.22)] by mailgw.kylinos.cn (envelope-from ) (Generic MTA with TLSv1.3 TLS_AES_256_GCM_SHA384 256/256) with ESMTP id 1882427054; Fri, 17 Apr 2026 11:38:35 +0800 From: Guopeng Zhang To: longman@redhat.com, tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com, void@manifault.com, arighi@nvidia.com, changwoo@igalia.com, shuah@kernel.org, chenridong@huaweicloud.com Cc: cgroups@vger.kernel.org, sched-ext@lists.linux.dev, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Guopeng Zhang Subject: [PATCH 1/2] cgroup/cpuset: record DL BW alloc CPU for attach rollback Date: Fri, 17 Apr 2026 11:37:41 +0800 Message-ID: <20260417033742.40793-2-zhangguopeng@kylinos.cn> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20260417033742.40793-1-zhangguopeng@kylinos.cn> References: <20260417033742.40793-1-zhangguopeng@kylinos.cn> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: 
text/plain; charset="utf-8" cpuset_can_attach() allocates DL bandwidth only when migrating deadline tasks to a disjoint CPU mask, but cpuset_cancel_attach() rolls back based only on nr_migrate_dl_tasks. This makes the DL bandwidth alloc/free paths asymmetric: rollback can call dl_bw_free() even when no dl_bw_alloc() was done. Rollback also needs to undo the reservation against the same CPU/root domain that was charged. Record the CPU used by dl_bw_alloc() and use that state in cpuset_cancel_attach(). If no allocation happened, dl_bw_cpu stays at -1 and rollback skips dl_bw_free(). If allocation did happen, bandwidth is returned to the same CPU/root domain. Successful attach paths are unchanged. This only fixes failed attach rollback accounting. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Signed-off-by: Guopeng Zhang Reviewed-by: Chen Ridong Reviewed-by: Waiman Long --- kernel/cgroup/cpuset-internal.h | 5 +++++ kernel/cgroup/cpuset.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-interna= l.h index fd7d19842ded..bb4e692bea30 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -168,6 +168,11 @@ struct cpuset { int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; + /* + * CPU used for temporary DL bandwidth allocation during attach; + * -1 if no DL bandwidth was allocated in the current attach. 
+ */ + int dl_bw_cpu; =20 /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1335e437098e..e3a081a07c6d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -288,6 +288,7 @@ struct cpuset top_cpuset =3D { .flags =3D BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state =3D PRS_ROOT, + .dl_bw_cpu =3D -1, }; =20 /** @@ -579,6 +580,8 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset= *cs) if (!trial) return NULL; =20 + trial->dl_bw_cpu =3D -1; + /* Setup cpumask pointer array */ cpumask_var_t *pmask[4] =3D { &trial->cpus_allowed, @@ -2980,6 +2983,7 @@ static void reset_migrate_dl_data(struct cpuset *cs) { cs->nr_migrate_dl_tasks =3D 0; cs->sum_migrate_dl_bw =3D 0; + cs->dl_bw_cpu =3D -1; } =20 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held= */ @@ -3056,6 +3060,8 @@ static int cpuset_can_attach(struct cgroup_taskset *t= set) reset_migrate_dl_data(cs); goto out_unlock; } + + cs->dl_bw_cpu =3D cpu; } =20 out_success: @@ -3080,12 +3086,11 @@ static void cpuset_cancel_attach(struct cgroup_task= set *tset) mutex_lock(&cpuset_mutex); dec_attach_in_progress_locked(cs); =20 - if (cs->nr_migrate_dl_tasks) { - int cpu =3D cpumask_any(cs->effective_cpus); + if (cs->dl_bw_cpu >=3D 0) + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); =20 - dl_bw_free(cpu, cs->sum_migrate_dl_bw); + if (cs->nr_migrate_dl_tasks) reset_migrate_dl_data(cs); - } =20 mutex_unlock(&cpuset_mutex); } --=20 2.43.0 From nobody Tue Jun 16 06:30:00 2026 Received: from mailgw.kylinos.cn (mailgw.kylinos.cn [124.126.103.232]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A06F835AC29; Fri, 17 Apr 2026 03:38:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=124.126.103.232 
ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1776397126; cv=none; b=Vd9yfF6XyaY5i/lEK5UWabTcXI9yhJgigHQwn57ugmQz1aBrorllzAI6QWvTeq+CjzUwR3kH/fwbvb1f0hTadH2/zz5cxmA5KG4i01JmhFN1h9cRtskMZ9o8nPDV410XINQ3R00ImkjPm2OTyaDVXlaynJk1LoYvylag773DWHA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1776397126; c=relaxed/simple; bh=h2Fs++siKjyTsS0bIJDVMJiUlyqQfMdAKlfA+KH5aFM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=oJxR1Tm8xzxeJ7yOdzch0AXEv3L+gQfGKxPTW514HINv5YyQn5xi4aoHXWagQPJCK2NMfbUcswzZfcO5c7LnJaML1umUaQrZNfic3GkKKHLzabfF3+EyeurmTE9/b6Qba4DCvAN9rkeiOwATGF81JwZK2Z7llGyqkX957YTl88k= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=kylinos.cn; spf=pass smtp.mailfrom=kylinos.cn; arc=none smtp.client-ip=124.126.103.232 Authentication-Results: smtp.subspace.kernel.org; dmarc=none (p=none dis=none) header.from=kylinos.cn Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=kylinos.cn X-UUID: ebc029763a0e11f1aa26b74ffac11d73-20260417 X-CTIC-Tags: HR_CC_COUNT, HR_CC_DOMAIN_COUNT, HR_CC_NAME, HR_CC_NO_NAME, HR_CTE_8B HR_CTT_MISS, HR_DATE_H, HR_DATE_WKD, HR_DATE_ZONE, HR_FROM_NAME HR_SJ_DIGIT_LEN, HR_SJ_LANG, HR_SJ_LEN, HR_SJ_LETTER, HR_SJ_NOR_SYM HR_SJ_PHRASE, HR_SJ_PHRASE_LEN, HR_SJ_WS, HR_TO_COUNT, HR_TO_DOMAIN_COUNT HR_TO_NO_NAME, IP_TRUSTED, SRC_TRUSTED, DN_TRUSTED, SA_TRUSTED SA_EXISTED, SN_TRUSTED, SN_EXISTED, SPF_NOPASS, DKIM_NOPASS DMARC_NOPASS, CIE_BAD, CIE_GOOD, CIE_GOOD_SPF, GTI_FG_BS GTI_RG_INFO, GTI_C_BU, AMN_GOOD, ABX_MISS_RDNS X-CID-P-RULE: Release_Ham X-CID-O-INFO: VERSION:1.3.12,REQID:1ccfb8e6-0053-48c2-8917-b61beb9eb2d7,IP:10, URL:0,TC:0,Content:-25,EDM:0,RT:0,SF:-5,FILE:0,BULK:0,RULE:Release_Ham,ACT ION:release,TS:-20 X-CID-INFO: VERSION:1.3.12,REQID:1ccfb8e6-0053-48c2-8917-b61beb9eb2d7,IP:10,UR L:0,TC:0,Content:-25,EDM:0,RT:0,SF:-5,FILE:0,BULK:0,RULE:Release_Ham,ACTIO N:release,TS:-20 X-CID-META: 
VersionHash:e7bac3a,CLOUDID:7dfa1faac29263a7d3cd348da00468de,BulkI D:260417113839ABO8TAQR,BulkQuantity:0,Recheck:0,SF:17|19|38|66|78|81|82|10 2|127|898,TC:nil,Content:0|15|50,EDM:-3,IP:-2,URL:0,File:nil,RT:nil,Bulk:n il,QS:nil,BEC:nil,COL:0,OSI:0,OSA:0,AV:0,LES:1,SPR:NO,DKR:0,DKP:0,BRR:0,BR E:0,ARC:0 X-CID-BVR: 2,SSN|SDN X-CID-BAS: 2,SSN|SDN,0,_ X-CID-FACTOR: TF_CID_SPAM_FAS,TF_CID_SPAM_FSD,TF_CID_SPAM_SNR X-CID-RHF: D41D8CD98F00B204E9800998ECF8427E X-UUID: ebc029763a0e11f1aa26b74ffac11d73-20260417 X-User: zhangguopeng@kylinos.cn Received: from yan.. [(183.242.174.22)] by mailgw.kylinos.cn (envelope-from ) (Generic MTA with TLSv1.3 TLS_AES_256_GCM_SHA384 256/256) with ESMTP id 1006010797; Fri, 17 Apr 2026 11:38:38 +0800 From: Guopeng Zhang To: longman@redhat.com, tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com, void@manifault.com, arighi@nvidia.com, changwoo@igalia.com, shuah@kernel.org, chenridong@huaweicloud.com Cc: cgroups@vger.kernel.org, sched-ext@lists.linux.dev, linux-kselftest@vger.kernel.org, linux-kernel@vger.kernel.org, Guopeng Zhang Subject: [PATCH 2/2] selftests/sched_ext: add cpuset DL rollback test Date: Fri, 17 Apr 2026 11:37:42 +0800 Message-ID: <20260417033742.40793-3-zhangguopeng@kylinos.cn> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20260417033742.40793-1-zhangguopeng@kylinos.cn> References: <20260417033742.40793-1-zhangguopeng@kylinos.cn> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" The cpuset DL rollback bug only shows up when another controller rejects a migration after cpuset_can_attach() has already succeeded. Use a sched_ext scheduler whose cgroup_prep_move() rejects SCHED_DEADLINE tasks so that the cpu controller fails after cpuset and drives the real attach rollback path from userspace. 
Create overlapping source and destination cpusets under the current cgroup, using that cgroup's effective CPU and memory masks. Constrain the destination cpuset to a single CPU so the rollback accounting target is deterministic, then compare dl_bw->total_bw for that CPU before and after the failed move. Restore the parent subtree_control state during cleanup so the test does not leave the cgroup tree changed. This catches the old behavior where cpuset_cancel_attach() could free DL bandwidth even though cpuset_can_attach() never allocated it. The test reads sched/debug because that debugfs output exposes the per-CPU dl_bw->total_bw accounting that the rollback perturbs. Signed-off-by: Guopeng Zhang --- tools/testing/selftests/sched_ext/Makefile | 1 + .../sched_ext/cpuset_dl_rollback.bpf.c | 28 + .../selftests/sched_ext/cpuset_dl_rollback.c | 810 ++++++++++++++++++ 3 files changed, 839 insertions(+) create mode 100644 tools/testing/selftests/sched_ext/cpuset_dl_rollback.bp= f.c create mode 100644 tools/testing/selftests/sched_ext/cpuset_dl_rollback.c diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/sel= ftests/sched_ext/Makefile index 789037be44c7..2a54d15552bd 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -162,6 +162,7 @@ endef all_test_bpfprogs :=3D $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$= (patsubst %.c,%.skel.h,$(prog))) =20 auto-test-targets :=3D \ + cpuset_dl_rollback \ create_dsq \ dequeue \ enq_last_no_enq_fails \ diff --git a/tools/testing/selftests/sched_ext/cpuset_dl_rollback.bpf.c b/t= ools/testing/selftests/sched_ext/cpuset_dl_rollback.bpf.c new file mode 100644 index 000000000000..ca5758a7361f --- /dev/null +++ b/tools/testing/selftests/sched_ext/cpuset_dl_rollback.bpf.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A sched_ext scheduler used to trigger attach rollback after cpuset has + * already accepted the migration. 
+ * + * Reject moving SCHED_DEADLINE tasks between cgroups from cgroup_prep_mov= e(), + * which makes the cpu controller fail after cpuset has already succeeded. + */ + +#include + +#define SCHED_DEADLINE 6 + +char _license[] SEC("license") =3D "GPL"; + +s32 BPF_STRUCT_OPS(cpuset_dl_rollback_cgroup_prep_move, struct task_struct= *p, + struct cgroup *from, struct cgroup *to) +{ + if (p->policy =3D=3D SCHED_DEADLINE) + return -EAGAIN; + + return 0; +} +SEC(".struct_ops.link") +struct sched_ext_ops cpuset_dl_rollback_ops =3D { + .cgroup_prep_move =3D (void *)cpuset_dl_rollback_cgroup_prep_move, + .name =3D "cpuset_dl_rollback", +}; diff --git a/tools/testing/selftests/sched_ext/cpuset_dl_rollback.c b/tools= /testing/selftests/sched_ext/cpuset_dl_rollback.c new file mode 100644 index 000000000000..44b6cad77d3e --- /dev/null +++ b/tools/testing/selftests/sched_ext/cpuset_dl_rollback.c @@ -0,0 +1,810 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Verify that rollback from cpu_cgroup_can_attach() failure doesn't pertu= rb DL + * bandwidth accounting when cpuset_can_attach() didn't allocate DL bandwi= dth in + * the first place. + * + * The test uses a sched_ext scheduler whose cgroup_prep_move() rejects + * SCHED_DEADLINE task migration. That makes the cpu controller fail after= the + * cpuset controller has already accepted the move, which triggers the cgr= oup + * rollback path without any kernel fault injection. 
+ */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cpuset_dl_rollback.bpf.skel.h" +#include "scx_test.h" + +#ifndef SYS_sched_setattr +#if defined(__x86_64__) +#define SYS_sched_setattr 314 +#elif defined(__i386__) +#define SYS_sched_setattr 351 +#elif defined(__aarch64__) +#define SYS_sched_setattr 274 +#else +#error "Unknown architecture: please define SYS_sched_setattr" +#endif +#endif + +#ifndef SCHED_DEADLINE +#define SCHED_DEADLINE 6 +#endif + +#define CGROUP2_ROOT "/sys/fs/cgroup" +#define SCHED_DEBUG "/sys/kernel/debug/sched/debug" + +struct cpuset_dl_rollback_ctx { + struct cpuset_dl_rollback *skel; + struct bpf_link *link; + pid_t child; + /* The only CPU in dst, and the rollback accounting observation point. */ + int target_cpu; + bool restore_parent_subtree; + char parent[PATH_MAX]; + char root[PATH_MAX]; + char src[PATH_MAX]; + char dst[PATH_MAX]; + char src_rel[PATH_MAX]; + char parent_subtree[256]; + char cpu_list[1024]; + char mem_list[256]; + char dst_cpu[32]; +}; + +static void cleanup(void *arg); + +static int sched_setattr(pid_t pid, const struct sched_attr *attr, + unsigned int flags) +{ + return syscall(SYS_sched_setattr, pid, attr, flags); +} + +static void trim_trailing_ws(char *buf) +{ + size_t len =3D strlen(buf); + + while (len > 0) { + char c =3D buf[len - 1]; + + if (c !=3D '\n' && c !=3D ' ' && c !=3D '\t') + break; + buf[--len] =3D '\0'; + } +} + +static int read_text(const char *path, char *buf, size_t size) +{ + ssize_t len; + int fd; + + fd =3D open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return -errno; + + len =3D read(fd, buf, size - 1); + close(fd); + if (len < 0) + return -errno; + + buf[len] =3D '\0'; + trim_trailing_ws(buf); + return 0; +} + +static int write_text(const char *path, const char *buf) +{ + size_t len =3D strlen(buf); + ssize_t ret; + int fd; + + fd =3D open(path, 
O_WRONLY | O_CLOEXEC); + if (fd < 0) + return -errno; + + ret =3D write(fd, buf, len); + close(fd); + if (ret < 0) + return -errno; + if ((size_t)ret !=3D len) + return -EIO; + + return 0; +} + +static int build_path(char *buf, size_t size, const char *dir, const char = *file) +{ + int ret; + + ret =3D snprintf(buf, size, "%s/%s", dir, file); + if (ret < 0 || (size_t)ret >=3D size) + return -ENAMETOOLONG; + + return 0; +} + +static int build_cgroup_dir(const char *rel, char *buf, size_t size) +{ + int ret; + + if (!strcmp(rel, "/")) + ret =3D snprintf(buf, size, "%s", CGROUP2_ROOT); + else + ret =3D snprintf(buf, size, "%s%s", CGROUP2_ROOT, rel); + + if (ret < 0 || (size_t)ret >=3D size) + return -ENAMETOOLONG; + + return 0; +} + +static int read_cgroup_relpath(const char *path, char *buf, size_t size) +{ + char line[PATH_MAX]; + FILE *fp; + int ret; + + fp =3D fopen(path, "r"); + if (!fp) + return -errno; + + while (fgets(line, sizeof(line), fp)) { + char *first, *second, *rel; + + trim_trailing_ws(line); + + first =3D strchr(line, ':'); + if (!first) { + fclose(fp); + return -EINVAL; + } + + second =3D strchr(first + 1, ':'); + if (!second) { + fclose(fp); + return -EINVAL; + } + + *first =3D '\0'; + *second =3D '\0'; + + /* Match the cgroup v2 entry, which is formatted as 0::/path. 
*/ + if (strcmp(line, "0") || first[1] !=3D '\0') + continue; + + rel =3D second + 1; + if (rel[0] !=3D '/') { + fclose(fp); + return -EINVAL; + } + + ret =3D snprintf(buf, size, "%s", rel); + fclose(fp); + if (ret < 0 || (size_t)ret >=3D size) + return -ENAMETOOLONG; + + return 0; + } + + if (ferror(fp)) { + fclose(fp); + return -EIO; + } + + fclose(fp); + return -EOPNOTSUPP; +} + +static bool has_token(const char *list, const char *token) +{ + size_t len =3D strlen(token); + const char *pos =3D list; + + while ((pos =3D strstr(pos, token))) { + bool left_ok =3D pos =3D=3D list || pos[-1] =3D=3D ' '; + bool right_ok =3D pos[len] =3D=3D '\0' || pos[len] =3D=3D ' '; + + if (left_ok && right_ok) + return true; + pos +=3D len; + } + + return false; +} + +static int enable_controllers(const char *dir, char *orig, size_t orig_sz, + bool *changed) +{ + char ctrl_path[PATH_MAX]; + char subtree_path[PATH_MAX]; + char controllers[256]; + char subtree[256]; + char enable[64]; + size_t len =3D 0; + int ret; + + ret =3D build_path(ctrl_path, sizeof(ctrl_path), dir, "cgroup.controllers= "); + if (ret) + return ret; + ret =3D build_path(subtree_path, sizeof(subtree_path), dir, + "cgroup.subtree_control"); + if (ret) + return ret; + + ret =3D read_text(ctrl_path, controllers, sizeof(controllers)); + if (ret =3D=3D -ENOENT) + return -EOPNOTSUPP; + if (ret) + return ret; + if (!has_token(controllers, "cpu") || !has_token(controllers, "cpuset")) + return -EOPNOTSUPP; + + ret =3D read_text(subtree_path, subtree, sizeof(subtree)); + if (ret =3D=3D -ENOENT) + return -EOPNOTSUPP; + if (ret) + return ret; + + enable[0] =3D '\0'; + if (!has_token(subtree, "cpu")) + len +=3D snprintf(enable + len, sizeof(enable) - len, "+cpu "); + if (!has_token(subtree, "cpuset")) + len +=3D snprintf(enable + len, sizeof(enable) - len, "+cpuset "); + if (len >=3D sizeof(enable)) + return -EOVERFLOW; + + if (!enable[0]) { + if (orig && orig_sz) { + ret =3D snprintf(orig, orig_sz, "%s", subtree); + if (ret 
< 0 || (size_t)ret >=3D orig_sz) + return -ENAMETOOLONG; + } + if (changed) + *changed =3D false; + return 0; + } + + if (orig && orig_sz) { + ret =3D snprintf(orig, orig_sz, "%s", subtree); + if (ret < 0 || (size_t)ret >=3D orig_sz) + return -ENAMETOOLONG; + } + + trim_trailing_ws(enable); + ret =3D write_text(subtree_path, enable); + if (!ret && changed) + *changed =3D true; + return ret; +} + +static int restore_controllers(const char *dir, const char *orig) +{ + char subtree_path[PATH_MAX]; + char subtree[256]; + char disable[64]; + size_t len =3D 0; + int ret; + + ret =3D build_path(subtree_path, sizeof(subtree_path), dir, + "cgroup.subtree_control"); + if (ret) + return ret; + + ret =3D read_text(subtree_path, subtree, sizeof(subtree)); + if (ret) + return ret; + + /* + * Only undo controllers that this test turned on. If "cpu" or "cpuset" + * was already present in the original subtree_control state, leave it + * alone. + */ + disable[0] =3D '\0'; + if (has_token(subtree, "cpu") && !has_token(orig, "cpu")) + len +=3D snprintf(disable + len, sizeof(disable) - len, "-cpu "); + if (has_token(subtree, "cpuset") && !has_token(orig, "cpuset")) + len +=3D snprintf(disable + len, sizeof(disable) - len, + "-cpuset "); + if (len >=3D sizeof(disable)) + return -EOVERFLOW; + + if (!disable[0]) + return 0; + + trim_trailing_ws(disable); + return write_text(subtree_path, disable); +} + +static int mkdir_one(const char *path) +{ + if (mkdir(path, 0755) && errno !=3D EEXIST) + return -errno; + return 0; +} + +static int write_pid(const char *path, pid_t pid) +{ + char buf[32]; + int ret; + + ret =3D snprintf(buf, sizeof(buf), "%d", pid); + if (ret < 0 || (size_t)ret >=3D sizeof(buf)) + return -EOVERFLOW; + + return write_text(path, buf); +} + +/* Parse the first CPU from a cpulist-style string such as "0-3,8". 
*/ +static int first_list_item(const char *list, char *buf, size_t size, int *= valp) +{ + char *end; + long val; + int ret; + + errno =3D 0; + val =3D strtol(list, &end, 10); + if (errno || end =3D=3D list || val < 0) + return -EINVAL; + + if (valp) + *valp =3D val; + + ret =3D snprintf(buf, size, "%ld", val); + if (ret < 0 || (size_t)ret >=3D size) + return -EOVERFLOW; + + return 0; +} + +/* + * sched/debug reports dl_bw->total_bw inside each CPU section. + * + * This test constrains dst to a single CPU and stores that CPU number in + * ctx->target_cpu. cpuset_cancel_attach() rolls back accounting again= st a + * CPU selected from the destination effective mask, so with a single-CPU = dst + * that exact CPU becomes the rollback site and the matching observation p= oint. + * + * Reading only the target CPU's dl_bw->total_bw avoids assuming that ever= y CPU + * in the system shares one root domain. Unlike sched_ext/total_bw.c, this= test + * has to identify one specific CPU section, so it also relies on the curr= ent + * sched/debug "cpu#" section header format. 
+ */ +static int read_cpu_total_bw(int target_cpu, long long *bw) +{ + char line[256]; + FILE *fp; + bool in_target =3D false; + + fp =3D fopen(SCHED_DEBUG, "r"); + if (!fp) + return -errno; + + while (fgets(line, sizeof(line), fp)) { + int header_cpu; + char *val; + + if (sscanf(line, "cpu#%d", &header_cpu) =3D=3D 1) { + if (in_target) + break; + + in_target =3D header_cpu =3D=3D target_cpu; + continue; + } + if (!in_target) + continue; + + val =3D strstr(line, "dl_bw->total_bw"); + if (!val) + continue; + + val =3D strchr(val, ':'); + if (!val) { + fclose(fp); + return -EINVAL; + } + + *bw =3D strtoll(val + 1, NULL, 10); + fclose(fp); + return 0; + } + + fclose(fp); + return -ENOENT; +} + +static int set_deadline_policy(void) +{ + struct sched_attr attr =3D { + .size =3D sizeof(attr), + .sched_policy =3D SCHED_DEADLINE, + .sched_runtime =3D 10 * 1000 * 1000ULL, + .sched_deadline =3D 30 * 1000 * 1000ULL, + .sched_period =3D 30 * 1000 * 1000ULL, + }; + + return sched_setattr(0, &attr, 0); +} + +static int spawn_dl_child(struct cpuset_dl_rollback_ctx *ctx) +{ + char procs_path[PATH_MAX]; + int pipefd[2]; + pid_t pid; + int child_ret; + int ret; + + ret =3D build_path(procs_path, sizeof(procs_path), ctx->src, "cgroup.proc= s"); + if (ret) + return ret; + + if (pipe(pipefd)) + return -errno; + + pid =3D fork(); + if (pid < 0) { + ret =3D -errno; + close(pipefd[0]); + close(pipefd[1]); + return ret; + } + + if (!pid) { + int err =3D 0; + + close(pipefd[0]); + + err =3D write_pid(procs_path, getpid()); + if (!err && set_deadline_policy()) + err =3D -errno; + + if (write(pipefd[1], &err, sizeof(err)) !=3D sizeof(err)) + _exit(1); + + if (err) + _exit(1); + + for (;;) + pause(); + } + + close(pipefd[1]); + ret =3D read(pipefd[0], &child_ret, sizeof(child_ret)); + close(pipefd[0]); + if (ret !=3D sizeof(child_ret)) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + return -EIO; + } + + if (child_ret) { + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + return child_ret; + } 
+ + ctx->child =3D pid; + return 0; +} + +static int create_cgroups(struct cpuset_dl_rollback_ctx *ctx) +{ + char parent_rel[PATH_MAX]; + char path[PATH_MAX]; + char tmpl[PATH_MAX]; + int ret; + + ret =3D read_cgroup_relpath("/proc/self/cgroup", parent_rel, + sizeof(parent_rel)); + if (ret) + return ret; + + ret =3D build_cgroup_dir(parent_rel, ctx->parent, sizeof(ctx->parent)); + if (ret) + return ret; + + ret =3D enable_controllers(ctx->parent, ctx->parent_subtree, + sizeof(ctx->parent_subtree), + &ctx->restore_parent_subtree); + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->parent, "cpuset.cpus.effectiv= e"); + if (ret) + return ret; + ret =3D read_text(path, ctx->cpu_list, sizeof(ctx->cpu_list)); + if (ret =3D=3D -ENOENT) + return -EOPNOTSUPP; + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->parent, "cpuset.mems.effectiv= e"); + if (ret) + return ret; + ret =3D read_text(path, ctx->mem_list, sizeof(ctx->mem_list)); + if (ret =3D=3D -ENOENT) + return -EOPNOTSUPP; + if (ret) + return ret; + if (!ctx->cpu_list[0] || !ctx->mem_list[0]) + return -ENOSPC; + + /* + * Keep dst on a single CPU so the rollback accounting target is + * deterministic. That same CPU is later sampled from sched/debug. 
+ */ + ret =3D first_list_item(ctx->cpu_list, ctx->dst_cpu, sizeof(ctx->dst_cpu), + &ctx->target_cpu); + if (ret) + return ret; + + ret =3D snprintf(tmpl, sizeof(tmpl), "%s/scx-cpuset-dl-rollback-XXXXXX", + ctx->parent); + if (ret < 0 || (size_t)ret >=3D sizeof(tmpl)) + return -ENAMETOOLONG; + + if (!mkdtemp(tmpl)) + return -errno; + + ret =3D snprintf(ctx->root, sizeof(ctx->root), "%s", tmpl); + if (ret < 0 || (size_t)ret >=3D sizeof(ctx->root)) + return -EOVERFLOW; + + ret =3D snprintf(ctx->src, sizeof(ctx->src), "%s/src", ctx->root); + if (ret < 0 || (size_t)ret >=3D sizeof(ctx->src)) + return -EOVERFLOW; + ret =3D snprintf(ctx->dst, sizeof(ctx->dst), "%s/ovl", ctx->root); + if (ret < 0 || (size_t)ret >=3D sizeof(ctx->dst)) + return -EOVERFLOW; + ret =3D snprintf(ctx->src_rel, sizeof(ctx->src_rel), "%s/src", + ctx->root + strlen(CGROUP2_ROOT)); + if (ret < 0 || (size_t)ret >=3D sizeof(ctx->src_rel)) + return -EOVERFLOW; + + ret =3D build_path(path, sizeof(path), ctx->root, "cpuset.cpus"); + if (ret) + return ret; + ret =3D write_text(path, ctx->cpu_list); + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->root, "cpuset.mems"); + if (ret) + return ret; + ret =3D write_text(path, ctx->mem_list); + if (ret) + return ret; + + ret =3D enable_controllers(ctx->root, NULL, 0, NULL); + if (ret) + return ret; + + ret =3D mkdir_one(ctx->src); + if (ret) + return ret; + ret =3D mkdir_one(ctx->dst); + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->src, "cpuset.cpus"); + if (ret) + return ret; + ret =3D write_text(path, ctx->cpu_list); + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->src, "cpuset.mems"); + if (ret) + return ret; + ret =3D write_text(path, ctx->mem_list); + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->dst, "cpuset.cpus"); + if (ret) + return ret; + ret =3D write_text(path, ctx->dst_cpu); + if (ret) + return ret; + + ret =3D build_path(path, sizeof(path), ctx->dst, 
"cpuset.mems"); + if (ret) + return ret; + return write_text(path, ctx->mem_list); +} + +static bool child_in_src(const struct cpuset_dl_rollback_ctx *ctx) +{ + char path[PATH_MAX]; + char cgroup[PATH_MAX]; + int ret; + + ret =3D snprintf(path, sizeof(path), "/proc/%d/cgroup", ctx->child); + if (ret < 0 || (size_t)ret >=3D sizeof(path)) + return false; + + if (read_cgroup_relpath(path, cgroup, sizeof(cgroup))) + return false; + + return strcmp(cgroup, ctx->src_rel) =3D=3D 0; +} + +static enum scx_test_status setup(void **out_ctx) +{ + struct cpuset_dl_rollback_ctx *ctx; + int ret; + + if (geteuid()) { + fprintf(stderr, "Skipping test: root privileges required\n"); + return SCX_TEST_SKIP; + } + + if (access(SCHED_DEBUG, R_OK)) { + fprintf(stderr, "Skipping test: %s not accessible\n", SCHED_DEBUG); + return SCX_TEST_SKIP; + } + + ctx =3D calloc(1, sizeof(*ctx)); + if (!ctx) + return SCX_TEST_FAIL; + + ret =3D create_cgroups(ctx); + switch (ret) { + case -EOPNOTSUPP: + fprintf(stderr, + "Skipping test: cgroup v2 cpu/cpuset controllers unavailable in current= cgroup tree\n"); + cleanup(ctx); + return SCX_TEST_SKIP; + case -EPERM: + case -EACCES: + case -EROFS: + fprintf(stderr, + "Skipping test: current cgroup tree does not allow cpu/cpuset writes\n"= ); + cleanup(ctx); + return SCX_TEST_SKIP; + case -EBUSY: + fprintf(stderr, + "Skipping test: current cgroup tree does not allow enabling cpu/cpuset = controllers here\n"); + cleanup(ctx); + return SCX_TEST_SKIP; + case -ENOSPC: + fprintf(stderr, + "Skipping test: current cgroup does not expose enough effective cpuset = resources\n"); + cleanup(ctx); + return SCX_TEST_SKIP; + } + if (ret) { + SCX_ERR("Failed to create cgroups (%d)", ret); + cleanup(ctx); + return SCX_TEST_FAIL; + } + + ctx->skel =3D cpuset_dl_rollback__open(); + if (!ctx->skel) { + SCX_ERR("Failed to open skel"); + cleanup(ctx); + return SCX_TEST_FAIL; + } + SCX_ENUM_INIT(ctx->skel); + if (cpuset_dl_rollback__load(ctx->skel)) { + SCX_ERR("Failed to load 
skel"); + cleanup(ctx); + return SCX_TEST_FAIL; + } + + *out_ctx =3D ctx; + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *arg) +{ + struct cpuset_dl_rollback_ctx *ctx =3D arg; + char procs_path[PATH_MAX]; + long long before_bw, after_bw; + int ret; + + ret =3D read_cpu_total_bw(ctx->target_cpu, &before_bw); + SCX_FAIL_IF(ret, "Failed to read baseline total_bw (%d)", ret); + + ctx->link =3D bpf_map__attach_struct_ops(ctx->skel->maps.cpuset_dl_rollba= ck_ops); + SCX_FAIL_IF(!ctx->link, "Failed to attach scheduler"); + + ret =3D spawn_dl_child(ctx); + switch (ret) { + case -EACCES: + case -EPERM: + fprintf(stderr, + "Skipping test: unable to place child in the source cgroup or enable SC= HED_DEADLINE due to permissions (%d)\n", + ret); + return SCX_TEST_SKIP; + case -EBUSY: + fprintf(stderr, + "Skipping test: SCHED_DEADLINE admission control rejected the child (%d= )\n", + ret); + return SCX_TEST_SKIP; + case -EINVAL: + fprintf(stderr, + "Skipping test: unable to enable SCHED_DEADLINE for the child in this e= nvironment (%d)\n", + ret); + return SCX_TEST_SKIP; + } + SCX_FAIL_IF(ret, "Failed to start SCHED_DEADLINE child (%d)", ret); + + ret =3D read_cpu_total_bw(ctx->target_cpu, &before_bw); + SCX_FAIL_IF(ret, "Failed to read pre-move total_bw (%d)", ret); + + ret =3D build_path(procs_path, sizeof(procs_path), ctx->dst, "cgroup.proc= s"); + SCX_FAIL_IF(ret, "Failed to build cgroup.procs path (%d)", ret); + + ret =3D write_pid(procs_path, ctx->child); + SCX_FAIL_IF(ret !=3D -EAGAIN, + "Expected cgroup move failure with -EAGAIN, got %d", ret); + SCX_FAIL_IF(!child_in_src(ctx), "Child left source cgroup after rollback"= ); + + ret =3D read_cpu_total_bw(ctx->target_cpu, &after_bw); + SCX_FAIL_IF(ret, "Failed to read post-move total_bw (%d)", ret); + SCX_FAIL_IF(after_bw !=3D before_bw, + "Expected total_bw for CPU%d to remain unchanged (%lld !=3D %lld)", + ctx->target_cpu, after_bw, before_bw); + + return SCX_TEST_PASS; +} + +static void cleanup(void 
*arg) +{ + struct cpuset_dl_rollback_ctx *ctx =3D arg; + int ret; + + if (!ctx) + return; + + if (ctx->child > 0) { + kill(ctx->child, SIGKILL); + waitpid(ctx->child, NULL, 0); + } + + if (ctx->link) + bpf_link__destroy(ctx->link); + if (ctx->skel) + cpuset_dl_rollback__destroy(ctx->skel); + + if (ctx->dst[0]) + rmdir(ctx->dst); + if (ctx->src[0]) + rmdir(ctx->src); + if (ctx->root[0]) + rmdir(ctx->root); + + if (ctx->restore_parent_subtree) { + ret =3D restore_controllers(ctx->parent, ctx->parent_subtree); + if (ret) + fprintf(stderr, + "%s: failed to restore %s/cgroup.subtree_control (%d)\n", + __func__, ctx->parent, ret); + } + + free(ctx); +} + +struct scx_test cpuset_dl_rollback =3D { + .name =3D "cpuset_dl_rollback", + .description =3D "Verify attach rollback after cpuset preserves DL bandwi= dth accounting", + .setup =3D setup, + .run =3D run, + .cleanup =3D cleanup, +}; +REGISTER_SCX_TEST(&cpuset_dl_rollback) --=20 2.43.0