From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3306EC61DA7 for ; Wed, 22 Nov 2023 21:12:18 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344365AbjKVVMS (ORCPT ); Wed, 22 Nov 2023 16:12:18 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37076 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1343845AbjKVVMN (ORCPT ); Wed, 22 Nov 2023 16:12:13 -0500 Received: from mail-pf1-x442.google.com (mail-pf1-x442.google.com [IPv6:2607:f8b0:4864:20::442]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 986F9D47; Wed, 22 Nov 2023 13:12:08 -0800 (PST) Received: by mail-pf1-x442.google.com with SMTP id d2e1a72fcca58-6b709048f32so265316b3a.0; Wed, 22 Nov 2023 13:12:08 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687528; x=1701292328; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=UftY+9nJn82Vz5RJpbG87UiQLd2iCLBgMyI8O4iubXI=; b=EU3eyIJ/OBG42+qF2vpTOldu0KNB6DVzZQCeaQEQGVPoYN//InAIWxAtun2WfIbfzH TWL/XIqYPkNYEO0ZcxkFbx5D0BWnzTmRCjmaeR3eE7HpgzthwbT3vkWN4waN4E4fv4lj ejhd+zXWGt2stqG8f3YZabC7dhahyRFITCiMxPGVDI1sUv9dGbQLI/kYdqeetVG/pnpo bHBy6XrK0QUFX9YoK5H199sT2humxLXx/3L2chjv8RHA4KcGrqsSvZRjfv30JMHC7dcG wEYsucJjsdGiZORhNIk4n+tgfJC53U1lsbcr1pC7bZUgw9Bz3ZnVgt0o6khP8OP6TOGp TE4g== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687528; x=1701292328; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; 
bh=UftY+9nJn82Vz5RJpbG87UiQLd2iCLBgMyI8O4iubXI=; b=fFKtZMn/rKIod9M/FtNiwFu98MJDfLm9Hfzh40zNv3kgktPIN0voao6anea9YcbUqA sHVrNVPAHqDuu4ubAAlBnF+qeAgMbT3fOtskJeuxVdhiErpiW+4Mdh3PhiqqBYjM6rq2 lkXJZ6DYHkMBuu32pN8IKbgDCaS/t41bT1EMDfhFxeo8/L9nyveFOfP+KRMwzrSq016/ GLKCh+31GpLjooUWWgNoGGk04Viv+iof+SbWuRUdE17ukE2aSQHonKawRrQWUF2ZTfND 6VX2otg6qz0WRQ3iN+sUKg6ITAlBXEv5IBLPJnrQtjrA7Lg5ZB9TnchWDmZ/86QuUUiO 7aFw== X-Gm-Message-State: AOJu0Yz3APQDHBHmgxDc2CIDoDPbHRiMOs2xRbk4rEA5WtKVEd03ku5C B04r0jQ/9wT4014SUL3y7w== X-Google-Smtp-Source: AGHT+IFApmrVmTMi/koEr4L/CLh547UBRzR9U2yTau9qj52IoeLKDXr1/nWf98+BfkPjiCFsngGxIQ== X-Received: by 2002:a05:6a00:4ac7:b0:6c2:cf23:3e21 with SMTP id ds7-20020a056a004ac700b006c2cf233e21mr4101209pfb.17.1700687527998; Wed, 22 Nov 2023 13:12:07 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.06 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:07 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 01/11] mm/mempolicy: refactor do_set_mempolicy for code re-use Date: Wed, 22 Nov 2023 16:11:50 -0500 Message-Id: <20231122211200.31620-2-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: 
text/plain; charset="utf-8" Refactors do_set_mempolicy into replace_mempolicy and do_set_mempolicy so that replace_mempolicy can be re-used with new code, not just the system call. Signed-off-by: Gregory Price --- mm/mempolicy.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10a590ee1c89..410754d56e46 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -809,28 +809,20 @@ static int mbind_range(struct vma_iterator *vmi, stru= ct vm_area_struct *vma, return vma_replace_policy(vma, new_pol); } =20 -/* Set the process memory policy */ -static long do_set_mempolicy(unsigned short mode, unsigned short flags, - nodemask_t *nodes) +/* Attempt to replace mempolicy, release the old one if successful */ +static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes) { - struct mempolicy *new, *old; + struct mempolicy *old =3D NULL; NODEMASK_SCRATCH(scratch); int ret; =20 if (!scratch) return -ENOMEM; =20 - new =3D mpol_new(mode, flags, nodes); - if (IS_ERR(new)) { - ret =3D PTR_ERR(new); - goto out; - } - task_lock(current); ret =3D mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); - mpol_put(new); goto out; } =20 @@ -838,14 +830,32 @@ static long do_set_mempolicy(unsigned short mode, uns= igned short flags, current->mempolicy =3D new; if (new && new->mode =3D=3D MPOL_INTERLEAVE) current->il_prev =3D MAX_NUMNODES-1; +out: task_unlock(current); mpol_put(old); - ret =3D 0; -out: + NODEMASK_SCRATCH_FREE(scratch); return ret; } =20 +/* Set the process memory policy */ +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *new; + int ret; + + new =3D mpol_new(mode, flags, nodes); + if (IS_ERR(new)) + return PTR_ERR(new); + + ret =3D replace_mempolicy(new, nodes); + if (ret) + mpol_put(new); + + return ret; +} + /* * Return nodemask for policy for get_mempolicy() query * --=20 2.39.1 From nobody 
Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id C24AAC27C40 for ; Wed, 22 Nov 2023 21:12:25 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232331AbjKVVM1 (ORCPT ); Wed, 22 Nov 2023 16:12:27 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37076 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344185AbjKVVMP (ORCPT ); Wed, 22 Nov 2023 16:12:15 -0500 Received: from mail-ot1-x341.google.com (mail-ot1-x341.google.com [IPv6:2607:f8b0:4864:20::341]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A9AF41A5; Wed, 22 Nov 2023 13:12:11 -0800 (PST) Received: by mail-ot1-x341.google.com with SMTP id 46e09a7af769-6ce2c5b2154so138977a34.3; Wed, 22 Nov 2023 13:12:11 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687531; x=1701292331; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=03wqCqNdJaIR+mhkCLDXmbEtw/nvWV7JtWUH8CKfPbk=; b=XqCNkqjEfiPy1oagjqduRGn0KoZZ2/7EmcihPDgvLrZuPbYETIeHN2OYQhnKzHfVD5 /hE3vIr+g4LZUfBtqIh8MnS5lrVjGjSMP7zKGUmMxkeqUCB61gYZw5gNnWkd/+pkgs1v eT6R5sDBh8PRvQmkd1e3ihEzXJs6pnPklAjlm1BRK8onom7eusCg9r6aJupS0FORMeC6 Bf45VYTJ7+ozSrPCOgVAgiz4JgzvQLhXUFonrrlH/cf1FP488iy6myPMvYeGGcPPoYLV xy66gkJCPjyuOVakQVOuc0h7acbe2zKjufT5iTU5LfmSY+0i5CqrBWZfptytvlqM/vW6 6lZQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687531; x=1701292331; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=03wqCqNdJaIR+mhkCLDXmbEtw/nvWV7JtWUH8CKfPbk=; 
b=uzRgK8DLPrjQ22NeKM9p3RY/O6KnSxHlgzwPXUtvKfHKv5fIYK5OqtV9sjza7Hk+e8 0iEUJyLdWKPNkmY/bYUSMoa+ONCQqUW9wbp+uuWwFtizc3d+GCUUSBBFrluxDBm+GVCv uo7EUlxCXsOjXes91epV+Rbj2tFdveGjkSqRlrAk9tNTGFt5hp2GGWsRbsLY2MGFYL/f nEb7iR9Q1ru+8A0/BqprWu4qCgdivvqZodAWuzHYRg1RniWpWDqdZTUnV58mdx1QTKxE TrF/5ZPnx4VRkwobSozQSLzE1M3XeMxi6Ifz3lry59mwFS5SMdsLy4CeJCKgx21i41UY lB1A== X-Gm-Message-State: AOJu0YwhQuuH4h9gmfdOVWgMrFLMcBmnfrQ1/NxmXtu0tTToC8qhsBtI FsVyVAXv5sVI79Io985dbnSKXNVRusY8 X-Google-Smtp-Source: AGHT+IHcsK0aMfpRQrTeevPOO7K62/8e0CbVC+qqs3g2eVJvgvGpmbk4gG2Ep5IyIbhkFCI7iShlQA== X-Received: by 2002:a05:6830:1102:b0:6b9:6419:1cde with SMTP id w2-20020a056830110200b006b964191cdemr3959734otq.22.1700687530930; Wed, 22 Nov 2023 13:12:10 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.09 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:10 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 02/11] mm/mempolicy: swap cond reference counting logic in do_get_mempolicy Date: Wed, 22 Nov 2023 16:11:51 -0500 Message-Id: <20231122211200.31620-3-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" In 
preparation for making get/set mempolicy possible from outside the context of the task being changed, we will need to take a reference count on the task mempolicy in do_get_mempolicy. do_get_mempolicy operates on one of three policies: 1) when MPOL_F_ADDR is set, it operates on a vma mempolicy; 2) if the task does not have a mempolicy, default_policy is used; 3) otherwise the task mempolicy is operated on. When the policy is from a vma, and that vma is a shared memory region, the __get_vma_policy stack will take an additional reference. Change the behavior of do_get_mempolicy to unconditionally reference whichever policy is operated on so that the cleanup logic can mpol_put unconditionally, and mpol_cond_put is only called when a vma policy is used. Signed-off-by: Gregory Price --- mm/mempolicy.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 410754d56e46..37da712259d7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -900,9 +900,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *n= mask, unsigned long addr, unsigned long flags) { int err; - struct mm_struct *mm =3D current->mm; + struct mm_struct *mm; struct vm_area_struct *vma =3D NULL; - struct mempolicy *pol =3D current->mempolicy, *pol_refcount =3D NULL; + struct mempolicy *pol =3D NULL, *pol_refcount =3D NULL; =20 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) @@ -925,29 +925,38 @@ static long do_get_mempolicy(int *policy, nodemask_t = *nmask, * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ + mm =3D current->mm; mmap_read_lock(mm); vma =3D vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } - pol =3D __get_vma_policy(vma, addr, &ilx); + /* + * __get_vma_policy can refcount if a shared policy is + * referenced.
We'll need to do a cond_put on the way + * out, but we need to reference this policy either way + * because we may drop the mmap read lock. + */ + pol =3D pol_refcount =3D __get_vma_policy(vma, addr, &ilx); + mpol_get(pol); } else if (addr) return -EINVAL; + else { + /* take a reference of the task policy now */ + pol =3D current->mempolicy; + mpol_get(pol); + } =20 - if (!pol) + if (!pol) { pol =3D &default_policy; /* indicates default behavior */ + mpol_get(pol); + } + /* we now have at least one reference on the policy */ =20 if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - /* - * Take a refcount on the mpol, because we are about to - * drop the mmap_lock, after which only "pol" remains - * valid, "vma" is stale. - */ - pol_refcount =3D pol; vma =3D NULL; - mpol_get(pol); mmap_read_unlock(mm); err =3D lookup_node(mm, addr); if (err < 0) @@ -982,11 +991,11 @@ static long do_get_mempolicy(int *policy, nodemask_t = *nmask, } =20 out: - mpol_cond_put(pol); + mpol_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) - mpol_put(pol_refcount); + mpol_cond_put(pol_refcount); return err; } =20 --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id C357EC61D97 for ; Wed, 22 Nov 2023 21:12:30 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344421AbjKVVMb (ORCPT ); Wed, 22 Nov 2023 16:12:31 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37550 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235170AbjKVVMY (ORCPT ); Wed, 22 Nov 2023 16:12:24 -0500 Received: from mail-pf1-x441.google.com (mail-pf1-x441.google.com [IPv6:2607:f8b0:4864:20::441]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id C3AF3D50; Wed, 22 Nov 2023 13:12:14 -0800 (PST) 
Received: by mail-pf1-x441.google.com with SMTP id d2e1a72fcca58-6cb9dd2ab56so252387b3a.3; Wed, 22 Nov 2023 13:12:14 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687534; x=1701292334; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=1L0St/1imG1/+PsAaTcIH+nuU4Zq2K1uLUk+VNFsJhE=; b=VxBXfKUr0p9owk/MJbxDb7e1iMfbu+RMK+ZJbudD5zpHPt5XIEoft/4oqCtj9EULsh z8fZ9S+GJ11daAXjlB2/OJEJYvC+R43vcCYIwk4OG9hl6aTGS0aocUN/uCpBBrZ8eU0R xiWF8u1KiFPBsoUdxgUUV6Hf5y7zTuFb2NQ4Uo15/3koJzLqyhLzIAyflZ5y6Tec9QnK dr9Ttoq8S/Bh2FtBvZIuGGgjT7w44QBnqOgkLMkyi+h73DD45/347Z+Jm+PF4ba5n4RV AwKi8YXstl5zLHT1h3G/3+QRfniWGE0W0IlMOHFQ7/YApZsRpnH2Seb5BGi8fubOaBIO tlcQ== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687534; x=1701292334; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=1L0St/1imG1/+PsAaTcIH+nuU4Zq2K1uLUk+VNFsJhE=; b=qiLduj52lutKV9SO00tW66uZ5DQ2S9L4mW9SGBBcwYdFBj1nyWw/KcEN9ZsKLsk9e8 dd6Kpgza/yYKNJAM3XoKM9Ms5uQbZkNrix84yLNiWJWLWbTdY5QofW8hcTehL693NEcu 9Pj3OboOXEN0D9ZOQT8ZBSy/3KAp56AMXeBsPv5XwO8nFX5Umv5ld3UMjBLaPBRN7ak7 UfBf15We4hsHZsEis5kyzt3MhtIltPLd9Hwz5d8Mp/gyexC900nBRwPGFochy4H43rva BA03492QIfrA6VZXnnh7LabHxa8WCAUq5TtQyQTZzGZe74vA619ILB7wUIbuHOp1X0sZ 83Ig== X-Gm-Message-State: AOJu0YwNMuleQ3xPWKx8TlE5Wb7CmUabDyTFBgXrxRCwr6PFwpIYhyeb XR/OJm/XbTyx/1FOqtUnEQ== X-Google-Smtp-Source: AGHT+IG6Q6k/J46fkuHI+vHQ3UkB56d4pIUNrjj7twNbdRajXKBsjZF+C9yC+5OfzogHrLEo5QCK1w== X-Received: by 2002:a05:6a20:7f94:b0:18b:2dc6:7e18 with SMTP id d20-20020a056a207f9400b0018b2dc67e18mr3989169pzj.61.1700687534128; Wed, 22 Nov 2023 13:12:14 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id 
j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.12 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:13 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 03/11] mm/mempolicy: refactor set_mempolicy stack to take a task argument Date: Wed, 22 Nov 2023 16:11:52 -0500 Message-Id: <20231122211200.31620-4-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" To make mempolicy modifiable by external tasks, we must refactor the callstack to take a task as an argument. Modify the following functions to require a task argument: mpol_set_nodemask replace_mempolicy do_set_mempolicy Since replace_mempolicy already acquired the task lock, there is no need to change any locking behaviors. All other callers (as of this patch) to mpol_set_nodemask call either in the context of current with the task or mmap lock held, so no other changes are required. 
Signed-off-by: Gregory Price --- mm/mempolicy.c | 51 +++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37da712259d7..9ea3e1bfc002 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -226,8 +226,10 @@ static int mpol_new_preferred(struct mempolicy *pol, c= onst nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, - const nodemask_t *nodes, struct nodemask_scratch *nsc) +static int mpol_set_nodemask(struct task_struct *tsk, + struct mempolicy *pol, + const nodemask_t *nodes, + struct nodemask_scratch *nsc) { int ret; =20 @@ -240,8 +242,7 @@ static int mpol_set_nodemask(struct mempolicy *pol, return 0; =20 /* Check N_MEMORY */ - nodes_and(nsc->mask1, - cpuset_current_mems_allowed, node_states[N_MEMORY]); + nodes_and(nsc->mask1, tsk->mems_allowed, node_states[N_MEMORY]); =20 VM_BUG_ON(!nodes); =20 @@ -253,7 +254,7 @@ static int mpol_set_nodemask(struct mempolicy *pol, if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask =3D *nodes; else - pol->w.cpuset_mems_allowed =3D cpuset_current_mems_allowed; + pol->w.cpuset_mems_allowed =3D tsk->mems_allowed; =20 ret =3D mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; @@ -810,7 +811,9 @@ static int mbind_range(struct vma_iterator *vmi, struct= vm_area_struct *vma, } =20 /* Attempt to replace mempolicy, release the old one if successful */ -static long replace_mempolicy(struct mempolicy *new, nodemask_t *nodes) +static long replace_mempolicy(struct task_struct *task, + struct mempolicy *new, + nodemask_t *nodes) { struct mempolicy *old =3D NULL; NODEMASK_SCRATCH(scratch); @@ -819,19 +822,19 @@ static long replace_mempolicy(struct mempolicy *new, = nodemask_t *nodes) if (!scratch) return -ENOMEM; =20 - task_lock(current); - ret =3D mpol_set_nodemask(new, 
nodes, scratch); + task_lock(task); + ret =3D mpol_set_nodemask(task, new, nodes, scratch); if (ret) { - task_unlock(current); + task_unlock(task); goto out; } =20 - old =3D current->mempolicy; - current->mempolicy =3D new; + old =3D task->mempolicy; + task->mempolicy =3D new; if (new && new->mode =3D=3D MPOL_INTERLEAVE) - current->il_prev =3D MAX_NUMNODES-1; + task->il_prev =3D MAX_NUMNODES-1; out: - task_unlock(current); + task_unlock(task); mpol_put(old); =20 NODEMASK_SCRATCH_FREE(scratch); @@ -839,8 +842,8 @@ static long replace_mempolicy(struct mempolicy *new, no= demask_t *nodes) } =20 /* Set the process memory policy */ -static long do_set_mempolicy(unsigned short mode, unsigned short flags, - nodemask_t *nodes) +static long do_set_mempolicy(struct task_struct *task, unsigned short mode, + unsigned short flags, nodemask_t *nodes) { struct mempolicy *new; int ret; @@ -849,7 +852,7 @@ static long do_set_mempolicy(unsigned short mode, unsig= ned short flags, if (IS_ERR(new)) return PTR_ERR(new); =20 - ret =3D replace_mempolicy(new, nodes); + ret =3D replace_mempolicy(task, new, nodes); if (ret) mpol_put(new); =20 @@ -1284,7 +1287,7 @@ static long do_mbind(unsigned long start, unsigned lo= ng len, NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); - err =3D mpol_set_nodemask(new, nmask, scratch); + err =3D mpol_set_nodemask(current, new, nmask, scratch); if (err) mmap_write_unlock(mm); } else @@ -1580,7 +1583,8 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned= long, len, } =20 /* Set the process memory policy */ -static long kernel_set_mempolicy(int mode, const unsigned long __user *nma= sk, +static long kernel_set_mempolicy(struct task_struct *task, int mode, + const unsigned long __user *nmask, unsigned long maxnode) { unsigned short mode_flags; @@ -1596,13 +1600,13 @@ static long kernel_set_mempolicy(int mode, const un= signed long __user *nmask, if (err) return err; =20 - return do_set_mempolicy(lmode, mode_flags, &nodes); + return 
do_set_mempolicy(task, lmode, mode_flags, &nodes); } =20 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nm= ask, unsigned long, maxnode) { - return kernel_set_mempolicy(mode, nmask, maxnode); + return kernel_set_mempolicy(current, mode, nmask, maxnode); } =20 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, @@ -2722,7 +2726,8 @@ void mpol_shared_policy_init(struct shared_policy *sp= , struct mempolicy *mpol) goto free_scratch; /* no valid nodemask intersection */ =20 task_lock(current); - ret =3D mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); + ret =3D mpol_set_nodemask(current, npol, &mpol->w.user_nodemask, + scratch); task_unlock(current); if (ret) goto put_npol; @@ -2870,7 +2875,7 @@ void __init numa_policy_init(void) if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); =20 - if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) + if (do_set_mempolicy(current, MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); =20 check_numabalancing_enable(); @@ -2879,7 +2884,7 @@ void __init numa_policy_init(void) /* Reset policy of current process to default */ void numa_default_policy(void) { - do_set_mempolicy(MPOL_DEFAULT, 0, NULL); + do_set_mempolicy(current, MPOL_DEFAULT, 0, NULL); } =20 /* --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7C994C61DA7 for ; Wed, 22 Nov 2023 21:12:34 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344512AbjKVVMf (ORCPT ); Wed, 22 Nov 2023 16:12:35 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37578 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344390AbjKVVMZ (ORCPT ); Wed, 22 Nov 2023 
16:12:25 -0500 Received: from mail-pf1-x443.google.com (mail-pf1-x443.google.com [IPv6:2607:f8b0:4864:20::443]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 8D4A9D66; Wed, 22 Nov 2023 13:12:17 -0800 (PST) Received: by mail-pf1-x443.google.com with SMTP id d2e1a72fcca58-6c3363a2b93so257892b3a.3; Wed, 22 Nov 2023 13:12:17 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687537; x=1701292337; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=GABulzaAiHjU3X3l/c8xxKNbyTu6XjRf70gPWHFnEeU=; b=ZbnpJgNjs4dBcrBJ0t4DDz+Qat2I9JACrZNqlr2ESg/gcT51vNGjOgZwn0oG/UVAZ7 pJiA6uVge8W7h/gzkcmsR6DUHTetxIvGJqOSV47Rko3Ffwaft7y1svuz9csW8JOfKLst UeZ/WS/tSq8V78tl02Gk6rL9qCu/kFMK2EUH0x4Zk+vATlDcAh0bfjKAh4GTtlf/5Kl5 /KnwIAdgGletqP3STCEdDb6Hm03stzzF778hqlA4qxhllUXSwW7Q3J30iAusEn8Iiukg gnvsKd1ElELo2OPc+yT9YeDbJhkbGxr7vJaf5B5HT3zxgBMGfytdZTdih7/tEsgqD1rU EFmg== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687537; x=1701292337; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=GABulzaAiHjU3X3l/c8xxKNbyTu6XjRf70gPWHFnEeU=; b=Pqh01OrqJ8E5Z1Eo5OlaF5TmLPsujF1+jIEJyEqNE36+s6JI05fjmdM+RKQxw1pdpI uQkS/yL4lD4vvUqTHrhTGGQRn2IDNvV0NUs2Lxq5djPmrfqIB4Zjc+iHPgE0XTBjHiju IbTJEPsBNbxa1qUcaybcTb7/4Rn0ywjyb+kOBt66Q2v3C7tgitl7LAdiIEbXR9BM50qM IBblLLQ7PZIrZYTG4NL3K2H6is59JpTPnS+TQcdEKABfNt9uYm5g4wRMAfHlz3HeX+BW AOxTYATiVOcVQznA/gSiAH9l0nrFTXgOvvmNovLzGxelyFGxM/ZIMeR0/Fm0yzZQy8wR 8y6g== X-Gm-Message-State: AOJu0YwI0KPTngJ108aALE16euTT5pcyVT29EDFrk+nhkmoEGeqeD7Rs 6XZ7mBivv6Fa0BuENetchs8v+EYAP9YS X-Google-Smtp-Source: AGHT+IFXvdmudROHD0YlWi+YK0a7lcqiiD79ewyUHoexoGYHQI8f+8OjJBq0l9lLV64qzqt5Crv6dA== X-Received: by 2002:a05:6a00:1916:b0:6cb:bc1a:dcff with SMTP id 
y22-20020a056a00191600b006cbbc1adcffmr3982392pfi.13.1700687536917; Wed, 22 Nov 2023 13:12:16 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.15 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:16 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 04/11] mm/mempolicy: modify get_mempolicy call stack to take a task argument Date: Wed, 22 Nov 2023 16:11:53 -0500 Message-Id: <20231122211200.31620-5-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" To make mempolicy fetchable by external tasks, we must first change the callstack to take a task as an argument. Modify the following functions to require a task argument: do_get_mempolicy kernel_get_mempolicy The way the task->mm is acquired must change slightly to enable this change. Originally, do_get_mempolicy would acquire the task->mm directly via (current->mm). This is unsafe to do in a non-current context. 
However, utilizing get_task_mm would break the original functionality of do_get_mempolicy due to the following check in get_task_mm: if (mm) { if (task->flags & PF_KTHREAD) mm =3D NULL; else mmget(mm); } To retain the original behavior, if (task =3D=3D current) we access the task->mm directly, but if (task !=3D current) we will utilize get_task_mm to safely access the mm. We simplify the get/put mechanics by always taking a reference to the mm, even if we are in the context of (task =3D=3D current). Additionally, since the mempolicy will become externally modifiable, we need to take the task lock to acquire task->mempolicy safely, regardless of whether we are operating on current or not. Signed-off-by: Gregory Price --- mm/mempolicy.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9ea3e1bfc002..4519f39b1a07 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -899,8 +899,9 @@ static int lookup_node(struct mm_struct *mm, unsigned l= ong addr) } =20 /* Retrieve NUMA policy */ -static long do_get_mempolicy(int *policy, nodemask_t *nmask, - unsigned long addr, unsigned long flags) +static long do_get_mempolicy(struct task_struct *task, int *policy, + nodemask_t *nmask, unsigned long addr, + unsigned long flags) { int err; struct mm_struct *mm; @@ -915,9 +916,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *n= mask, if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy =3D 0; /* just so it's initialized */ - task_lock(current); - *nmask =3D cpuset_current_mems_allowed; - task_unlock(current); + task_lock(task); + *nmask =3D task->mems_allowed; + task_unlock(task); return 0; } =20 @@ -928,7 +929,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *= nmask, * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. 
*/ - mm =3D current->mm; + if (task =3D=3D current) { + /* + * original behavior allows a kernel task changing its + * own policy to avoid the condition in get_task_mm, + * so we'll directly access + */ + mm =3D task->mm; + mmget(mm); + } else + mm =3D get_task_mm(task); mmap_read_lock(mm); vma =3D vma_lookup(mm, addr); if (!vma) { @@ -947,8 +957,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *= nmask, return -EINVAL; else { /* take a reference of the task policy now */ - pol =3D current->mempolicy; + task_lock(task); + pol =3D task->mempolicy; mpol_get(pol); + task_unlock(task); } =20 if (!pol) { @@ -962,12 +974,13 @@ static long do_get_mempolicy(int *policy, nodemask_t = *nmask, vma =3D NULL; mmap_read_unlock(mm); err =3D lookup_node(mm, addr); + mmput(mm); if (err < 0) goto out; *policy =3D err; - } else if (pol =3D=3D current->mempolicy && + } else if (pol =3D=3D task->mempolicy && pol->mode =3D=3D MPOL_INTERLEAVE) { - *policy =3D next_node_in(current->il_prev, pol->nodes); + *policy =3D next_node_in(task->il_prev, pol->nodes); } else { err =3D -EINVAL; goto out; @@ -987,9 +1000,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *= nmask, if (mpol_store_user_nodemask(pol)) { *nmask =3D pol->w.user_nodemask; } else { - task_lock(current); + task_lock(task); get_policy_nodemask(pol, nmask); - task_unlock(current); + task_unlock(task); } } =20 @@ -1704,7 +1717,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned l= ong, maxnode, } =20 /* Retrieve NUMA policy */ -static int kernel_get_mempolicy(int __user *policy, +static int kernel_get_mempolicy(struct task_struct *task, + int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, @@ -1719,7 +1733,7 @@ static int kernel_get_mempolicy(int __user *policy, =20 addr =3D untagged_addr(addr); =20 - err =3D do_get_mempolicy(&pval, &nodes, addr, flags); + err =3D do_get_mempolicy(task, &pval, &nodes, addr, flags); =20 if (err) return err; @@ -1737,7 +1751,8 @@ 
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { - return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); + return kernel_get_mempolicy(current, policy, nmask, maxnode, addr, + flags); } =20 bool vma_migratable(struct vm_area_struct *vma) --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 42A5CC27C40 for ; Wed, 22 Nov 2023 21:12:39 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344579AbjKVVMk (ORCPT ); Wed, 22 Nov 2023 16:12:40 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37544 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344482AbjKVVM0 (ORCPT ); Wed, 22 Nov 2023 16:12:26 -0500 Received: from mail-pf1-x443.google.com (mail-pf1-x443.google.com [IPv6:2607:f8b0:4864:20::443]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 91BD2D40; Wed, 22 Nov 2023 13:12:20 -0800 (PST) Received: by mail-pf1-x443.google.com with SMTP id d2e1a72fcca58-6c115026985so264223b3a.1; Wed, 22 Nov 2023 13:12:20 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687540; x=1701292340; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=oVrRu5wbv05JVXCiU4W4p4q4IBiu16aLNt9pxHyDOm8=; b=Jb0vDsYDRkJlIf0Tw8qaMnVNgSJz3dbYeAhXrON2LFbDOUYDthhzBWLawt8M84BrdS ohm23n5Req7+V0xKcUShqijp723+AZiSaPL9+EQ6E9QD76SrALyVO5lKiJwYIriaDphf Ik/SDLSSGOBRBTCv0Ro7TmtAgufAI8LX5n5me6lSgv55AxpieK8d7wNIRifPKZc4azTS Pnw+tmSLIZSVX0Nt70m65O1un4tTpzEwDLf/RREzYQwrioYId8mppl7tcSbszD1Mea9D 
k2Es6xKdz6HfTmJx2vqNYKheSrtXZPfgyF0ELKgg/Q8gyAd2GzBUiemz3hIxuNi2PGpX 1/4Q== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687540; x=1701292340; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=oVrRu5wbv05JVXCiU4W4p4q4IBiu16aLNt9pxHyDOm8=; b=aQorblcMEeQ9cTVg8UpDCvY2VQqMrPflSPcxncnaDFKg5G6tsbkKALor0ijOH/aSjI zsu55cdvOAbzc+fJ6ZUKrJmF7YNJrSJVEW37ds5WbL2VyKT6RvptCeJUJ82mlgNLoU0n w7bTLUdqtkU+N9XCMHn7VyVnSPCJsYxIWfTiyjjJJFhEMDcj5k9b7zSgTCLHvgyJNhKt zWhM92hCdhj2U7qUQ0nheheXaXM/NibxrCBxCqUCOVkedQyapuYdUunPFYaJs5hyHUWH yQ2WawqVdDgvgznAsnNmfcmPz7orqdFROfW3K1qfsQ1m6a4+dP8FL1EGmzm+ow+YOa2F oN2g== X-Gm-Message-State: AOJu0Yz3lV4pgz/DhHx4+LgBFUHPZsNc55r+oWq20/5SXRvHZ7RThHHh UHDILxyJ75yTnBu0VGtmig== X-Google-Smtp-Source: AGHT+IEhhVXk/VdCpqlemyCiB0SWUWiIBbhHHB1UaPHoDn3xJW2QhnBhVaPfSnRII43jNAanPH6BAg== X-Received: by 2002:a05:6a21:9706:b0:18a:f1f5:c4ae with SMTP id ub6-20020a056a21970600b0018af1f5c4aemr3137822pzb.42.1700687540034; Wed, 22 Nov 2023 13:12:20 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.18 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:19 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 05/11] mm/mempolicy: modify set_mempolicy_home_node to take a task argument Date: Wed, 22 Nov 2023 16:11:54 -0500 
Message-Id: <20231122211200.31620-6-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" To make a mempolicy's home node settable by external tasks, we must first change the callstack to take a task as an argument. Modify the following functions to require a task argument: set_mempolicy_home_node First we refactor set_mempolicy_home_node to __set_mempolicy_home_node which accepts a task argument, and change the syscall definition to pass in (current). The only functional change in this patch is related to the way task->mm is acquired. Originally, set_mempolicy_home_node would acquire task->mm directly via (current->mm). This is unsafe to do in a non-current context. However, utilizing get_task_mm would break the original functionality of set_mempolicy_home_node due to the following check in get_task_mm: if (mm) { if (task->flags & PF_KTHREAD) mm =3D NULL; else mmget(mm); } To retain the original behavior, if (task =3D=3D current) we access the task->mm directly, but if (task !=3D current) we will utilize get_task_mm to safely access the mm. We always take a reference to the mm to keep the cleanup semantics simple. 
Signed-off-by: Gregory Price --- mm/mempolicy.c | 62 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4519f39b1a07..540163f5d349 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1521,39 +1521,67 @@ static long kernel_mbind(unsigned long start, unsig= ned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } =20 -SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned lo= ng, len, - unsigned long, home_node, unsigned long, flags) +static long __set_mempolicy_home_node(struct task_struct *task, + unsigned long start, + unsigned long len, + unsigned long home_node, + unsigned long flags) { - struct mm_struct *mm =3D current->mm; + struct mm_struct *mm; struct vm_area_struct *vma, *prev; struct mempolicy *new, *old; unsigned long end; int err =3D -ENOENT; + + /* + * Behavior when task =3D=3D current allows a task modifying itself + * to bypass the check in get_task_mm and acquire the mm directly + */ + if (task =3D=3D current) { + mm =3D task->mm; + mmget(mm); + } else + mm =3D get_task_mm(task); + + if (!mm) + return -ENODEV; + VMA_ITERATOR(vmi, mm, start); =20 start =3D untagged_addr(start); - if (start & ~PAGE_MASK) - return -EINVAL; + if (start & ~PAGE_MASK) { + err =3D -EINVAL; + goto mm_out; + } /* * flags is used for future extension if any. */ - if (flags !=3D 0) - return -EINVAL; + if (flags !=3D 0) { + err =3D -EINVAL; + goto mm_out; + } =20 /* * Check home_node is online to avoid accessing uninitialized * NODE_DATA. 
*/ - if (home_node >=3D MAX_NUMNODES || !node_online(home_node)) - return -EINVAL; + if (home_node >=3D MAX_NUMNODES || !node_online(home_node)) { + err =3D -EINVAL; + goto mm_out; + } =20 len =3D PAGE_ALIGN(len); end =3D start + len; =20 - if (end < start) - return -EINVAL; - if (end =3D=3D start) - return 0; + if (end < start) { + err =3D -EINVAL; + goto mm_out; + } + if (end =3D=3D start) { + err =3D 0; + goto mm_out; + } + mmap_write_lock(mm); prev =3D vma_prev(&vmi); for_each_vma_range(vmi, vma, end) { @@ -1585,9 +1613,17 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned lo= ng, start, unsigned long, le break; } mmap_write_unlock(mm); +mm_out: + mmput(mm); return err; } =20 +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned lo= ng, len, + unsigned long, home_node, unsigned long, flags) +{ + return __set_mempolicy_home_node(current, start, len, home_node, flags); +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7A260C27C40 for ; Wed, 22 Nov 2023 21:12:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S232683AbjKVVMq (ORCPT ); Wed, 22 Nov 2023 16:12:46 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:40596 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344230AbjKVVM3 (ORCPT ); Wed, 22 Nov 2023 16:12:29 -0500 Received: from mail-pf1-x441.google.com (mail-pf1-x441.google.com [IPv6:2607:f8b0:4864:20::441]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 84FA5D4A; Wed, 22 Nov 2023 13:12:23 -0800 (PST) Received: by mail-pf1-x441.google.com 
with SMTP id d2e1a72fcca58-6cbd24d9557so186329b3a.1; Wed, 22 Nov 2023 13:12:23 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687543; x=1701292343; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=zGEV0GKEGIpaRLeFOk46VDdRd1VzAop7kM786wY6S5I=; b=OsO4emHSquVHxGgvcvtn+ucjFAHifEu8WKEQe0IchiGCgJO8k/veaisZcHp0YsiWVj eiazu7+mxGL2M/0Tm18N+fUfkNO+o2SHpL07D3OTp2d5QmKJgX+1mhrANZf+DP2KNrLq PXDlrDhg/x83hj0mGkd5u/hHRPz6R3HTW6uHg5PAn7SfdrRz+EDXhqV144m0YxJexLy2 2ycTEpBKQEbWVQB+ALUR785vWnY5m4WV0aJylp0vbyN8AvxTA8IldyYFXyR5MxxmP1FT FcBAT50DZp9Fb3DA6gI1mTdFozKhud0/KSWxSsm72z+NrXDFV1lNgJVbGaIesAXky2WF DE+Q== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687543; x=1701292343; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=zGEV0GKEGIpaRLeFOk46VDdRd1VzAop7kM786wY6S5I=; b=eSqSglXslNRInLS69y9fFikAXv74fLAbsh7ysOVCfimOrIap/fq0lZ+yTZ+sI/IPKa SyKjWwzB4ulEtHtnV/KXoli3fBIudEWXG7Ju+4WBRm97xNSmtEM6acH9EH7OPiEgu7ti zykImn8Z0cjeUlAYCftRAcdLQ8oJcyZx856gGzYs2ie7fH4dSG5yzk/Sz9gRo98mbTC+ NVeQ9J0g/LFc00iO/uQUUWJeE4XQl8NgmRQgSg1FPGOmaX28RX+2J0Ea3/I1Jd/hzt4k jT/unCdOvbL3Spz4Q0yKEmLal/9Vf+tcp/LK8cIAB51xa8Mv/lASthRXdn0p/QsIyAGd 3aXw== X-Gm-Message-State: AOJu0YxpmlVRAOhzBuspG/NFaeWOomVLBrp3RwqcYhFpE5IzJNJGN6vA aKV9VFvtBnINb1HshqAX0w== X-Google-Smtp-Source: AGHT+IEngkamQgS9JDDH0OMG5qcgYPn/R+C8BVPyrbVkNf+Ro78kVa/NtW1kXBp/Ws13QuBfIRwScg== X-Received: by 2002:a05:6a20:8f06:b0:18a:df69:eabe with SMTP id b6-20020a056a208f0600b0018adf69eabemr969850pzk.11.1700687542801; Wed, 22 Nov 2023 13:12:22 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.20 (version=TLS1_3 
cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:22 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 06/11] mm/mempolicy: modify do_mbind to operate on task argument instead of current Date: Wed, 22 Nov 2023 16:11:55 -0500 Message-Id: <20231122211200.31620-7-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" To make mbind-applied mempolicies modifiable by external tasks, we must first change the do_mbind callstack to take a task as an argument. This patch includes changes to the following functions: do_mbind kernel_mbind get_vma_policy And adds the following: get_task_vma_policy get_vma_policy is changed into a wrapper of get_task_vma_policy which passes current as an argument to retain the existing behavior for callers of get_vma_policy. do_mbind is modified as follows: 1) the way task->mm is acquired is changed to be safe for non-current tasks, but the original behavior of (task =3D=3D current) is retained. 2) we take a reference to the mm so that the task lock can be dropped. 3) the task lock must now be acquired on call to get_task_policy to ensure we acquire and reference the policy safely. 4) get_task_vma_policy is called instead of get_vma_policy. 
This requires taking the task_lock because of the new semantics. Change to acquiring task->mm: When (task =3D=3D current), if we use get_task_mm, it would prevent a kernel task from making modifications or accessing information about its own vma's. So in this scenario, we simply access and reference the mm directly, since the mempolicy information is being accessed in the context of the task itself. if (mm) { if (task->flags & PF_KTHREAD) mm =3D NULL; else mmget(mm); } This retains the existing behavior. Change to get_task_vma_policy locking behavior: Since task->policy is no longer guaranteed to be stable, any time we seek to acquire a policy via get_task_vma_policy, we must use the task_lock and reference it accordingly, regardless of where it ultimately came from. A similar behavior can be seen in do_get_mempolicy, where a reference is taken and a conditional release is made to handle the situation where a shared policy is acquired. In the case of do_mbind, we don't actually need to take a reference to the policy, as we only call get_task_vma_policy to find the ilx. In this case, we only need to call mpol_cond_put immediately to ensure that if get_task_vma_policy returns a shared policy we decrement the reference count since a shared mpol will return already referenced. 
Signed-off-by: Gregory Price --- mm/mempolicy.c | 92 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 540163f5d349..3d2171ac4098 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -422,6 +422,10 @@ static bool migrate_folio_add(struct folio *folio, str= uct list_head *foliolist, unsigned long flags); static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, pgoff_t ilx, int *nid); +static struct mempolicy *get_task_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, + unsigned long addr, int order, + pgoff_t *ilx); =20 static bool strictly_unmovable(unsigned long flags) { @@ -1250,11 +1254,12 @@ static struct folio *alloc_migration_target_by_mpol= (struct folio *src, } #endif =20 -static long do_mbind(unsigned long start, unsigned long len, - unsigned short mode, unsigned short mode_flags, - nodemask_t *nmask, unsigned long flags) +static long do_mbind(struct task_struct *task, unsigned long start, + unsigned long len, unsigned short mode, + unsigned short mode_flags, nodemask_t *nmask, + unsigned long flags) { - struct mm_struct *mm =3D current->mm; + struct mm_struct *mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; struct migration_mpol mmpol; @@ -1287,6 +1292,21 @@ static long do_mbind(unsigned long start, unsigned l= ong len, if (IS_ERR(new)) return PTR_ERR(new); =20 + /* + * original behavior allows a kernel task modifying itself to bypass + * check in get_task_mm, so directly acquire mm in this case + */ + if (task =3D=3D current) { + mm =3D task->mm; + mmget(mm); + } else + mm =3D get_task_mm(task); + + if (!mm) { + err =3D -ENODEV; + goto mpol_out; + } + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1300,7 +1320,9 @@ static long do_mbind(unsigned long start, unsigned lo= ng len, NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); - err =3D 
mpol_set_nodemask(current, new, nmask, scratch); + task_lock(task); + err =3D mpol_set_nodemask(task, new, nmask, scratch); + task_unlock(task); if (err) mmap_write_unlock(mm); } else @@ -1308,7 +1330,7 @@ static long do_mbind(unsigned long start, unsigned lo= ng len, NODEMASK_SCRATCH_FREE(scratch); } if (err) - goto mpol_out; + goto mm_out; =20 /* * Lock the VMAs before scanning for pages to migrate, @@ -1333,8 +1355,10 @@ static long do_mbind(unsigned long start, unsigned l= ong len, if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { - new =3D get_task_policy(current); + task_lock(task); + new =3D get_task_policy(task); mpol_get(new); + task_unlock(task); } mmpol.pol =3D new; mmpol.ilx =3D 0; @@ -1365,8 +1389,11 @@ static long do_mbind(unsigned long start, unsigned l= ong len, if (addr !=3D -EFAULT) { order =3D compound_order(page); /* We already know the pol, but not the ilx */ - mpol_cond_put(get_vma_policy(vma, addr, order, - &mmpol.ilx)); + task_lock(task); + mpol_cond_put(get_task_vma_policy(task, vma, + addr, order, + &mmpol.ilx)); + task_unlock(task); /* Set base from which to increment by index */ mmpol.ilx -=3D page->index >> order; } @@ -1386,6 +1413,8 @@ static long do_mbind(unsigned long start, unsigned lo= ng len, err =3D -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); +mm_out: + mmput(mm); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) @@ -1500,8 +1529,9 @@ static inline int sanitize_mpol_flags(int *mode, unsi= gned short *flags) return 0; } =20 -static long kernel_mbind(unsigned long start, unsigned long len, - unsigned long mode, const unsigned long __user *nmask, +static long kernel_mbind(struct task_struct *task, unsigned long start, + unsigned long len, unsigned long mode, + const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { unsigned short mode_flags; @@ -1518,7 +1548,7 @@ static long kernel_mbind(unsigned long 
start, unsigne= d long len, if (err) return err; =20 - return do_mbind(start, len, lmode, mode_flags, &nodes, flags); + return do_mbind(task, start, len, lmode, mode_flags, &nodes, flags); } =20 static long __set_mempolicy_home_node(struct task_struct *task, @@ -1628,7 +1658,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned= long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { - return kernel_mbind(start, len, mode, nmask, maxnode, flags); + return kernel_mbind(current, start, len, mode, nmask, maxnode, flags); } =20 /* Set the process memory policy */ @@ -1827,6 +1857,31 @@ struct mempolicy *__get_vma_policy(struct vm_area_st= ruct *vma, vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } =20 +/* + * Task variant of get_vma_policy for use internally. Returns the task + * policy if the vma does not have a policy of its own. get_vma_policy + * will return current->mempolicy as a result. + * + * Like get_vma_policy and get_task_policy, must hold alloc/task_lock + * while calling this. 
+ */ +static struct mempolicy *get_task_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, + unsigned long addr, int order, + pgoff_t *ilx) +{ + struct mempolicy *pol; + + pol =3D __get_vma_policy(vma, addr, ilx); + if (!pol) + pol =3D get_task_policy(task); + if (pol->mode =3D=3D MPOL_INTERLEAVE) { + *ilx +=3D vma->vm_pgoff >> order; + *ilx +=3D (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } + return pol; +} + /* * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought @@ -1844,16 +1899,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_st= ruct *vma, struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol; - - pol =3D __get_vma_policy(vma, addr, ilx); - if (!pol) - pol =3D get_task_policy(current); - if (pol->mode =3D=3D MPOL_INTERLEAVE) { - *ilx +=3D vma->vm_pgoff >> order; - *ilx +=3D (addr - vma->vm_start) >> (PAGE_SHIFT + order); - } - return pol; + return get_task_vma_policy(current, vma, addr, order, ilx); } =20 bool vma_policy_mof(struct vm_area_struct *vma) --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 97C12C61D97 for ; Wed, 22 Nov 2023 21:12:58 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344607AbjKVVM7 (ORCPT ); Wed, 22 Nov 2023 16:12:59 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:53448 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1343845AbjKVVMi (ORCPT ); Wed, 22 Nov 2023 16:12:38 -0500 Received: from mail-pf1-x442.google.com (mail-pf1-x442.google.com [IPv6:2607:f8b0:4864:20::442]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CB7FED71; Wed, 22 Nov 2023 13:12:27 -0800 
(PST) Received: by mail-pf1-x442.google.com with SMTP id d2e1a72fcca58-6cbc8199a2aso239252b3a.1; Wed, 22 Nov 2023 13:12:27 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687546; x=1701292346; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=efc4LuxM+YClZeP0rF1BlV0TPdiTLe9TrsONK1dWGk4=; b=m6hm5JNStM0VWz+yvsA2T8+kuwVKJTjnNsyxYW0UBVYQR9VRzwQMOlRMW4gO4FVny+ IuMUCSrYVu+kBczaLEpQFBrxnLlkuJIQcCWiDHYMtir4Atr2UIdQ8rvnIKOAH4VzGNgy TSxDRnw9LpeHDDZ1hmtBCUGhi0z7nLMNF2UCFWsWJXLE0SSmiSNV827X6ZLO3k2GCnxE wLJNS8sdXVo5Yf+bQZGCpnobEqmtClbsP6ESb0KG3qmoE/ulv1w5je6rTvbR1R7QNiGh v/aD0P+jx808rifVYlcwutiDAJXCq7RVwlGrwXTDYD7sadvjfmbFmMIhn1kljsgRqJWn aUOw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687546; x=1701292346; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=efc4LuxM+YClZeP0rF1BlV0TPdiTLe9TrsONK1dWGk4=; b=biRxU4sNL1MAAiG2lS+5X5+gkAHROBEV7IRX7INFQxwAnsiTi1rmqI5dXOvgO0gNtg sJkZYE+K0H7v/QnNxs+V3H0+hmOmi71VVTBkNpT+AwGYLyW3IwpLHAr9KBaTKZ8GueWd Ya98MieTnsH8QsFP2G1WPivjKkN4uKkpFcx+oUSpn1BxDlub1dhU/D7ZsaLhYSnk+QpX w8YzNdUqkwD0BkCuCw/k78aW3Jz06Vo4fztZoy0PwtxZlrvzQCZWkYJOQMSng04FkEzo hayCofMR9xYa2IEwjAjVh6OqmP+e3G0dYpseTQ/wQyfGu0SG24RgnB7cxekOyQ7P6gKg bM8w== X-Gm-Message-State: AOJu0Yz5erGOqSOXIveiR/tcUMxgMbG3WVW/ien6vnTRL0t51J66O2YQ iqAtZworklb1a4DkwatPdQ== X-Google-Smtp-Source: AGHT+IFyLKi/UX8cG8JtNK31D46LnIz+FKATE0AdMLHzg0L0sgRXSVvqGSyPkTSkfLO98TY33r1+qg== X-Received: by 2002:a05:6a00:800d:b0:6c4:dc5b:5b2b with SMTP id eg13-20020a056a00800d00b006c4dc5b5b2bmr3826752pfb.20.1700687546091; Wed, 22 Nov 2023 13:12:26 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id 
j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.23 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:25 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 07/11] mm/mempolicy: add task mempolicy syscall variants Date: Wed, 22 Nov 2023 16:11:56 -0500 Message-Id: <20231122211200.31620-8-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Add system calls to allow one task to view or change another task's mempolicy settings. The task mempolicy has traditionally been a feature that could only be changed by the task itself. This creates issues with task migrations between cgroups where cpusets may differ. Attempts were made to allow policy nodemasks to be shifted via a flag (MPOL_F_RELATIVE_NODES), but this is not foolproof. Additionally, as new policies emerge (like weighted interleave), it may be necessary to allow not just the policy to be changed, but individual attributes of the policy (such as a node weight) in response to other system events - such as memory hotplug. If pid is 0, this behaves the same as the original mempolicy syscalls, otherwise this interface requires CAP_SYS_NICE. 
Syscalls in this patch: sys_set_task_mempolicy sys_get_task_mempolicy sys_set_task_mempolicy_home_node sys_task_mbind Signed-off-by: Gregory Price --- arch/x86/entry/syscalls/syscall_32.tbl | 4 + arch/x86/entry/syscalls/syscall_64.tbl | 4 + include/linux/syscalls.h | 14 +++ include/uapi/asm-generic/unistd.h | 10 ++- include/uapi/linux/mempolicy.h | 10 +++ mm/mempolicy.c | 119 +++++++++++++++++++++++++ 6 files changed, 160 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscal= ls/syscall_32.tbl index c8fac5205803..358bd91d7461 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -461,3 +461,7 @@ 454 i386 futex_wake sys_futex_wake 455 i386 futex_wait sys_futex_wait 456 i386 futex_requeue sys_futex_requeue +457 i386 set_task_mempolicy sys_set_task_mempolicy +458 i386 get_task_mempolicy sys_get_task_mempolicy +459 i386 set_task_mempolicy_home_node sys_set_task_mempolicy_home_node +460 i386 task_mbind sys_task_mbind diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscal= ls/syscall_64.tbl index 8cb8bf68721c..c83b0c5c1ff9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -378,6 +378,10 @@ 454 common futex_wake sys_futex_wake 455 common futex_wait sys_futex_wait 456 common futex_requeue sys_futex_requeue +457 common set_task_mempolicy sys_set_task_mempolicy +458 common get_task_mempolicy sys_get_task_mempolicy +459 common set_task_mempolicy_home_node sys_set_task_mempolicy_home_node +460 common task_mbind sys_task_mbind =20 # # Due to a historical design error, certain syscalls are numbered differen= tly diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e92..fd1a8863b5c1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -816,12 +816,21 @@ asmlinkage long sys_mbind(unsigned long start, unsign= ed long len, const unsigned long __user *nmask, unsigned long 
maxnode, unsigned flags); +asmlinkage long sys_task_mbind(const struct mbind_args __user *uargs, + size_t usize); asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags); asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nm= ask, unsigned long maxnode); +asmlinkage long sys_get_task_mempolicy(pid_t pid, int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags); +asmlinkage long sys_set_task_mempolicy(pid_t pid, int mode, + const unsigned long __user *nmask, + unsigned long maxnode); asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); @@ -945,6 +954,11 @@ asmlinkage long sys_memfd_secret(unsigned int flags); asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned = long len, unsigned long home_node, unsigned long flags); +asmlinkage long sys_set_task_mempolicy_home_node(pid_t pid, + unsigned long start, + unsigned long len, + unsigned long home_node, + unsigned long flags); asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/u= nistd.h index 756b013fb832..f179715f1d59 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -828,9 +828,17 @@ __SYSCALL(__NR_futex_wake, sys_futex_wake) __SYSCALL(__NR_futex_wait, sys_futex_wait) #define __NR_futex_requeue 456 __SYSCALL(__NR_futex_requeue, sys_futex_requeue) +#define __NR_set_task_mempolicy 457 +__SYSCALL(__NR_set_task_mempolicy, sys_set_task_mempolicy) +#define __NR_get_task_mempolicy 458 +__SYSCALL(__NR_get_task_mempolicy, sys_get_task_mempolicy) +#define __NR_set_task_mempolicy_home_node 459 +__SYSCALL(__NR_set_task_mempolicy_home_node, 
sys_set_task_mempolicy_home_n= ode) +#define __NR_task_mbind 460 +__SYSCALL(__NR_task_mbind, sys_task_mbind) =20 #undef __NR_syscalls -#define __NR_syscalls 457 +#define __NR_syscalls 461 =20 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index a8963f7ef4c2..c29cfb25db29 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -26,6 +26,16 @@ enum { MPOL_MAX, /* always last member of enum */ }; =20 +struct mbind_args { + pid_t pid; + unsigned long start; + unsigned long len; + unsigned long mode; + unsigned long *nmask; + unsigned long maxnode; + unsigned int flags; +}; + /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d2171ac4098..fb295ade8ad7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1654,6 +1654,32 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned lo= ng, start, unsigned long, le return __set_mempolicy_home_node(current, start, len, home_node, flags); } =20 +SYSCALL_DEFINE5(set_task_mempolicy_home_node, pid_t, pid, unsigned long, s= tart, + unsigned long, len, unsigned long, home_node, + unsigned long, flags) +{ + struct task_struct *task; + int err; + + if (pid && !capable(CAP_SYS_NICE)) + return -EPERM; + + rcu_read_lock(); + task =3D pid ? 
find_task_by_vpid(pid) : current; + if (!task) { + rcu_read_unlock(); + err =3D -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + err =3D __set_mempolicy_home_node(task, start, len, home_node, flags); + put_task_struct(task); +out: + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1661,6 +1687,48 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigne= d long, len, return kernel_mbind(current, start, len, mode, nmask, maxnode, flags); } =20 +static long kernel_task_mbind(const struct mbind_args __user *uargs, + size_t usize) +{ + struct mbind_args kargs; + struct task_struct *task; + int err; + + if (usize < sizeof(kargs)) + return -EINVAL; + + err =3D copy_struct_from_user(&kargs, sizeof(kargs), uargs, usize); + if (err) + return err; + + + if (kargs.pid && !capable(CAP_SYS_NICE)) + return -EPERM; + + rcu_read_lock(); + task =3D kargs.pid ? 
find_task_by_vpid(kargs.pid) : current; + if (!task) { + rcu_read_unlock(); + err =3D -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + err =3D kernel_mbind(task, kargs.start, kargs.len, kargs.mode, + kargs.nmask, kargs.maxnode, kargs.flags); + + put_task_struct(task); +out: + return err; +} + +SYSCALL_DEFINE2(task_mbind, const struct mbind_args __user *, args, + size_t, size) +{ + return kernel_task_mbind(args, size); +} + /* Set the process memory policy */ static long kernel_set_mempolicy(struct task_struct *task, int mode, const unsigned long __user *nmask, @@ -1688,6 +1756,31 @@ SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsi= gned long __user *, nmask, return kernel_set_mempolicy(current, mode, nmask, maxnode); } =20 +SYSCALL_DEFINE4(set_task_mempolicy, pid_t, pid, int, mode, + const unsigned long __user *, nmask, unsigned long, maxnode) +{ + struct task_struct *task; + int err; + + if (pid && !capable(CAP_SYS_NICE)) + return -EPERM; + + rcu_read_lock(); + task =3D pid ? find_task_by_vpid(pid) : current; + if (!task) { + rcu_read_unlock(); + err =3D -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + err =3D kernel_set_mempolicy(task, mode, nmask, maxnode); + put_task_struct(task); +out: + return err; +} + static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) @@ -1821,6 +1914,32 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, flags); } =20 +SYSCALL_DEFINE6(get_task_mempolicy, pid_t, pid, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) +{ + struct task_struct *task; + int err; + + if (pid && !capable(CAP_SYS_NICE)) + return -EPERM; + + rcu_read_lock(); + task =3D pid ? 
find_task_by_vpid(pid) : current; + if (!task) { + rcu_read_unlock(); + err =3D -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + err =3D kernel_get_mempolicy(task, policy, nmask, maxnode, addr, flags); + put_task_struct(task); +out: + return err; +} + bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id C3A64C61D97 for ; Wed, 22 Nov 2023 21:13:01 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344497AbjKVVNC (ORCPT ); Wed, 22 Nov 2023 16:13:02 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:53520 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344589AbjKVVMl (ORCPT ); Wed, 22 Nov 2023 16:12:41 -0500 Received: from mail-pf1-x444.google.com (mail-pf1-x444.google.com [IPv6:2607:f8b0:4864:20::444]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 27D4BD41; Wed, 22 Nov 2023 13:12:30 -0800 (PST) Received: by mail-pf1-x444.google.com with SMTP id d2e1a72fcca58-6cbc8199a2aso239284b3a.1; Wed, 22 Nov 2023 13:12:30 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687549; x=1701292349; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=Ytb+zLJCPBPTeP+HrayDkCJaJ1nmZr7MUC/QCQp0Rn8=; b=VpCN+Rfz0k6df6KdgOAJqVdlwGCL1xYt1DbAqiyLKtIAYGBNWAK15OaSgKlYHaA9Td td8PdKQiHXmAL06x0fNiErR4d6Szu/1UZ2+OlPyAyEhpDiZBWFDCLWX3QM3/01UHGY4U R6QBcTJSHgv6nzzVNcofnNXOqwnYGDEpdD8sYhCqUC8Wb0UzHFkVcwQR/TFZ7hPDMpZG 3Va2OjxEAJcij/mAOO4qfIbtZIHjm5i7qnVgPtQ0bs/x+X2AJJKSPdi5m32l0rwAoW/f 
prtc8LNNI9ycudrLBrTUFS7ekI1M7FVgzxpHqmLQ7Z6AllzKaIVYBKBvcS0A1cOI7wQP qR4A== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687549; x=1701292349; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=Ytb+zLJCPBPTeP+HrayDkCJaJ1nmZr7MUC/QCQp0Rn8=; b=JMoUL41RiZlMAAb3sA77J/hR6zHnPnZ0BlcFDqnSxLUg6vaPTVN7YTX4i+XICBYRK/ 2SyDGbJZP+R62sgQzurlBUbg96Y3xB4jH3lJSbM1H83w0xz1d7e+eH+YAI37tYJVsD4I o9BzYUdxdmgtG5ZhT2wBHNlPtdcpxAQGPvMxEO/4J97WNna/izw/JwQuSirnwpROwisF bnbREmviswuwxHfFS4kQBt6eFUDS7WPOwB5NdIdSYgvzyYU8RYYyuiSolM6uFmJ6GdoQ 236nPG6MOmMfZmuP3na4UKh/CsPa8Ge3CWi790ruhMZNHmIpdCy4baWS1ZBQ1KMwrSvv mFmg== X-Gm-Message-State: AOJu0YyxjEzyfYiRI1j0aJOtfgQUQHnVfzofq2bhEURK4rVJwBjMx84p 05p6jwPxjcukmOuQBtqkQw== X-Google-Smtp-Source: AGHT+IFKLpnDulOr/ZPYNoDPRpcqX2G937AQ6Vr6ejSPK5ZU1d6oIX9/jtBqfVf6LI42wNWwo5Yp6w== X-Received: by 2002:a05:6a20:3d8f:b0:18b:37b4:e4e7 with SMTP id s15-20020a056a203d8f00b0018b37b4e4e7mr3025618pzi.39.1700687549206; Wed, 22 Nov 2023 13:12:29 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.27 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:28 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 08/11] mm/mempolicy: export replace_mempolicy for use by procfs Date: Wed, 22 Nov 2023 16:11:57 -0500 Message-Id: 
<20231122211200.31620-9-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" We will be adding a /proc/pid/mempolicy entry for use in swapping the mempolicy of a process at runtime. Export replace_mempolicy so that this can be used by that interface. Signed-off-by: Gregory Price --- include/linux/mempolicy.h | 9 +++++++++ mm/mempolicy.c | 5 ++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 931b118336f4..b951e96a53ce 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -177,6 +177,8 @@ static inline bool mpol_is_preferred_many(struct mempol= icy *pol) =20 extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zon= e); =20 +extern long replace_mempolicy(struct task_struct *task, struct mempolicy *= new, + nodemask_t *nodes); #else =20 struct mempolicy {}; @@ -297,5 +299,12 @@ static inline bool mpol_is_preferred_many(struct mempo= licy *pol) return false; } =20 +static inline long replace_mempolicy(struct task_struct *task, + struct mempolicy *new, + nodemask_t *nodes) +{ + return -ENODEV; +} + #endif /* CONFIG_NUMA */ #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fb295ade8ad7..e0c9127571dd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -815,9 +815,8 @@ static int mbind_range(struct vma_iterator *vmi, struct= vm_area_struct *vma, } =20 /* Attempt to replace mempolicy, release the old one if successful */ -static long replace_mempolicy(struct task_struct *task, - struct mempolicy *new, - nodemask_t *nodes) +long replace_mempolicy(struct task_struct *task, struct mempolicy *new, + nodemask_t *nodes) { struct mempolicy *old =3D NULL; 
NODEMASK_SCRATCH(scratch); --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id C5385C27C40 for ; Wed, 22 Nov 2023 21:13:03 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344698AbjKVVNF (ORCPT ); Wed, 22 Nov 2023 16:13:05 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37704 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S235201AbjKVVMs (ORCPT ); Wed, 22 Nov 2023 16:12:48 -0500 Received: from mail-pf1-x441.google.com (mail-pf1-x441.google.com [IPv6:2607:f8b0:4864:20::441]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 0ED571703; Wed, 22 Nov 2023 13:12:32 -0800 (PST) Received: by mail-pf1-x441.google.com with SMTP id d2e1a72fcca58-6cb66fbc63dso204290b3a.0; Wed, 22 Nov 2023 13:12:32 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687552; x=1701292352; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=vndFmtwoNBfvOSHr5hNiC5ndKtGe9ea3Be2dCt86/rY=; b=TCX1z6jIkLlsoLVCH5SmYQhGEaYyrHjOOP2C7HxFhIwnNfD9Bvxw074Ujx4VEZp2S+ tzw2iO3/4lI9kEWyRPQbO7V39EIBawKUCTBrgXLfem3i5Zqi1f2jWft/p+mKKkuYBVP9 3xg0QYbN3+XUmo2ENp2hH1/iXuaChTz7lMLSg4U56TN3p/5YgaMObtY1TTQXQUhfGOA8 ei5FGMF0L/jr3g6Fx93jWe7yX8aL3AC1Id0RAe4k8LzS+EX/ndyJ6NzzMRz2bv8CgRjv qmFt1vbTLKSW4r9COidZNttv1bM8TOD9tVa5/CweETWP5rIJmHkhTjigxVbp5uREu7l1 nRYw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687552; x=1701292352; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; 
bh=vndFmtwoNBfvOSHr5hNiC5ndKtGe9ea3Be2dCt86/rY=; b=vYV+xS8OP2IeX/mBpG4GKhHL5PY5sFrbDerHxIhIjsi3y97KG3iDqy4wTa7iOnQVPY C9p0jK8cgS7S2HWou23bl0Tty0qG0GP+EXQjJQQffpEM8/nGuN/vQHigSNVZLzX2pu9w Tnw/j1bQJYaS8LfK85UhJhFDel4UdAxhAtBau3pncK20YjYeQ0h8YBg8CcatYndb1xoB ah+jdJ5Xo5cZx2JFSbA+fOLaaGfEej6v+aTq922jpFMqe5ZD1KXh8PwsdyYOIE8xl/oZ 2CAEtMSduycYKjNL5YzIqpSGp6Yxe2Jnbh4pUxjjiDGkGpfJV5lQTCzrCkbb9Zm2CQqX ZNDQ== X-Gm-Message-State: AOJu0Yxv1Ryqfg2T129FBekcYf8WHnIoFx7Ndkx429gYvQ813ZBvuGAz LqOjZFg77iBkh4oM57T54w== X-Google-Smtp-Source: AGHT+IFbCsnwOzk3NV6X56jO78VOMP+TCUq5eJCWYi9iESvIVcb1EX5hhADLjZtOWBoP1ANbQ+1PSw== X-Received: by 2002:a05:6a00:2d94:b0:6c4:d615:2169 with SMTP id fb20-20020a056a002d9400b006c4d6152169mr1127761pfb.10.1700687552189; Wed, 22 Nov 2023 13:12:32 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.30 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:31 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 09/11] mm/mempolicy: build mpol_parse_str unconditionally Date: Wed, 22 Nov 2023 16:11:58 -0500 Message-Id: <20231122211200.31620-10-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: 
text/plain; charset="utf-8" mpol_parse_str is conditioned on CONFIG_TMPFS. We intend to reuse this interface for procfs/mempolicy, so build unconditionally. Signed-off-by: Gregory Price --- include/linux/mempolicy.h | 4 ---- mm/mempolicy.c | 2 -- 2 files changed, 6 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index b951e96a53ce..1adbcc10f291 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -158,9 +158,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodema= sk_t *from, const nodemask_t *to, int flags); =20 =20 -#ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); -#endif =20 extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); =20 @@ -276,12 +274,10 @@ static inline void check_highest_zone(int k) { } =20 -#ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } -#endif =20 static inline int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e0c9127571dd..a418af0a1359 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3115,7 +3115,6 @@ static const char * const policy_modes[] =3D [MPOL_PREFERRED_MANY] =3D "prefer (many)", }; =20 -#ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 
* @str: string containing mempolicy to parse @@ -3248,7 +3247,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) *mpol =3D new; return err; } -#endif /* CONFIG_TMPFS */ =20 /** * mpol_to_str - format a mempolicy structure for printing --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 8F77BC61D97 for ; Wed, 22 Nov 2023 21:13:12 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344628AbjKVVNO (ORCPT ); Wed, 22 Nov 2023 16:13:14 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:60342 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344372AbjKVVMz (ORCPT ); Wed, 22 Nov 2023 16:12:55 -0500 Received: from mail-pg1-x543.google.com (mail-pg1-x543.google.com [IPv6:2607:f8b0:4864:20::543]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id E45FF1719; Wed, 22 Nov 2023 13:12:35 -0800 (PST) Received: by mail-pg1-x543.google.com with SMTP id 41be03b00d2f7-5bd306f86a8so167434a12.0; Wed, 22 Nov 2023 13:12:35 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687555; x=1701292355; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=HOrPhfU2mXxyBlZrT3heEZSZHpuQWnnIT7HedEB0hRs=; b=JlX5uNarejalLnI2Pu9HRXNhGvHcdL33My/bwG9ec8rFnYIwKhWNLJ0Xq6ol+iUSR6 Daqhk/EcldqH7f2dIz328o5byD9vx3Uybrwb7v9Qaz9MYOtXGViOpheoTDS67EnuZqNR lc9P2iVzKMtElT+73SsU6L2wEPfkYvsrOvccYSZw9EdPAMRhi9LachWT0FMA9TDeOuP1 lL1jcHeIi0DV3QgZO1YCyAJhwvFX2wEFJBALpog+DvgCb2qtkg648KTK0Ob23P8j/Rqb kf/eOggQdNf8S3eHc8d9YFvl95uktY65xUJPfEoraCwpjmMNeczSPz3ke8iAm8ZjvfpY D13w== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; 
d=1e100.net; s=20230601; t=1700687555; x=1701292355; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=HOrPhfU2mXxyBlZrT3heEZSZHpuQWnnIT7HedEB0hRs=; b=MTwSuOl3ZAWmV/EQawgctdZwe0NbRgG8+AfqKVztIbCqG052bzG+4nhQqrVzGKklt8 2FqWDWEVk2aai994xPJ/9fMKBSgYJGp3Z9T8SbGNp0tE3KzHYF8izyQRIaW2W+y0vZIu s3N/JzxoT4UqgAEDgj0etk1WCXbz7sZmsLsd5gG9Qq3SNGr7SCFx/HQhoEsD9i7A6LVq pMgzeH+ZO83KzTSL5Ib+r4N6FcyzEpjKKivy6dcBJ13n1Ak3B/DGxU6BtQyVvTgE65WE IDCFIMp3adGSZrDsCMGjadBtzh2VZ49tQLemagBmse5qOuovvQumt0g9VijPD8Rz2viF yKDw== X-Gm-Message-State: AOJu0YwLOEOxni1bAR4+hJxFVu3LW9/51buaUaF4Va0WlKrAdB0v7soW jv1pJb62R2i6ThTmv9dV9Q== X-Google-Smtp-Source: AGHT+IEMVmGwOk2fW31bSZrZqUdP6V2qhUklPoSlj+1Lun+mapADWEqdgcI4ZGVd2ENv41y7cSvJXA== X-Received: by 2002:a05:6a21:9982:b0:189:11e8:6237 with SMTP id ve2-20020a056a21998200b0018911e86237mr4066108pzb.51.1700687554935; Wed, 22 Nov 2023 13:12:34 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.33 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:34 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 10/11] mm/mempolicy: mpol_parse_str should ignore trailing characters in nodelist Date: Wed, 22 Nov 2023 16:11:59 -0500 Message-Id: <20231122211200.31620-11-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: 
<20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" When validating MPOL_PREFERRED, the nodelist has already been parsed and error checked by nodelist_parse. So rather than looping through the string again, we should just check that the weight of the nodemask is 1, which is the actual condition we care to check. This also handles the case where newline characters are present. Signed-off-by: Gregory Price --- mm/mempolicy.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a418af0a1359..eac71f2adfdc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3159,12 +3159,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpo= l) * nodelist (or nodes) cannot be empty. */ if (nodelist) { - char *rest =3D nodelist; - while (isdigit(*rest)) - rest++; - if (*rest) - goto out; - if (nodes_empty(nodes)) + if (nodes_weight(nodes) !=3D 1) goto out; } break; --=20 2.39.1 From nobody Wed Dec 17 19:06:01 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 15D69C61D97 for ; Wed, 22 Nov 2023 21:13:16 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1344651AbjKVVNR (ORCPT ); Wed, 22 Nov 2023 16:13:17 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37068 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1344654AbjKVVM5 (ORCPT ); Wed, 22 Nov 2023 16:12:57 -0500 Received: from mail-il1-x143.google.com (mail-il1-x143.google.com [IPv6:2607:f8b0:4864:20::143]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id DE3D81732; 
Wed, 22 Nov 2023 13:12:38 -0800 (PST) Received: by mail-il1-x143.google.com with SMTP id e9e14a558f8ab-359d27f6d46so675435ab.3; Wed, 22 Nov 2023 13:12:38 -0800 (PST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1700687558; x=1701292358; darn=vger.kernel.org; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=nj0/yLijkUDdchJx9f8WaqM9fNxYMjAKFnMHDPEdLR0=; b=CvLkIMDkDzteeQ6Ts+K4t3f+w6WGRk2D658wVs0Lj9UFMwobPY0HwrMsnMkXfyh3S9 VfilVD4qVHokHaRvnYM+cfsfPHXM0UKBl+S6ieVvv2BDzEWzmjo/8hqU9iffVFFi5WAt aX7QSfHXZWk5DBtsznxyMK47zYlulIPdgiY118oPBGWruuB1GMRLogc3boldj1f8MMbd QQJP9BwFKPKeUAsmkEF4J++KwT7cFA0Ol0bdjmpLM6cDl7DbKlEPls1CYBkG2qSkswVe g0Oz1Q35EkfceH9oFKpgdUZ/EUtwUaNlqyp9Hr/geiEwMpjnYQHn6PntfhiGXVq4+Wg+ bD1w== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1700687558; x=1701292358; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=nj0/yLijkUDdchJx9f8WaqM9fNxYMjAKFnMHDPEdLR0=; b=MAZ2goqW4hMq6r9Qcpm+XCKoIiqUIiLtUj8mVwEPWJDgQmU8WS8sgZ9DLX2t7oC0jW wUd+eAwHNNoNKsLGIn9QD8+CxYE9eoXymZ+T6i1rzYURaygBJZnNfBjR9O8JaRJlrTYE DlzFkcurlb+hSTdryPfFBMUaYzh4++ZOLhpsH3hJi4q/Ot0ClUFsFxsWlkDBNabQEh4+ 3ILdX8qeGVvZAoqAUPDDpdQyQlq9+nCJqkgFtVirAzBKAKyI8V9QD1xIYJXTab8AdNGS QcAhuQXSm88JS4AbLx0PU8LKBkv60B2HQ00qOLSwXJnRD24omp2YRoup6w1gP5Vve0Wa g7Ug== X-Gm-Message-State: AOJu0Yy7Y18iohcUp0gv2YRtwWCUFBZ9cMx7GeOD+fm/q8Z+lgEVyz4G WPAA61Ogf3zWDS2wqrlS5Hz1N0Wfo8BX X-Google-Smtp-Source: AGHT+IGWypkgJaNcLXjK/eS64ds6ElauQ9WWQUmh33zgk8yVZiSbJ9XTKd3ElRdp4RxU4lnDOSLTsw== X-Received: by 2002:a05:6e02:1c84:b0:350:f51b:c32e with SMTP id w4-20020a056e021c8400b00350f51bc32emr4817949ill.16.1700687558122; Wed, 22 Nov 2023 13:12:38 -0800 (PST) Received: from fedora.mshome.net ([75.167.214.230]) by smtp.gmail.com with ESMTPSA id 
j18-20020a635512000000b005bdbce6818esm132136pgb.30.2023.11.22.13.12.35 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Wed, 22 Nov 2023 13:12:37 -0800 (PST) From: Gregory Price X-Google-Original-From: Gregory Price To: linux-mm@kvack.org Cc: linux-doc@vger.kernel.org, linux-fsdevel@vger.kernel.org, linux-api@vger.kernel.org, linux-arch@vger.kernel.org, linux-kernel@vger.kernel.org, akpm@linux-foundation.org, arnd@arndb.de, tglx@linutronix.de, luto@kernel.org, mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com, mhocko@kernel.org, tj@kernel.org, ying.huang@intel.com, Gregory Price Subject: [RFC PATCH 11/11] fs/proc: Add mempolicy attribute to allow read/write of task mempolicy Date: Wed, 22 Nov 2023 16:12:00 -0500 Message-Id: <20231122211200.31620-12-gregory.price@memverge.com> X-Mailer: git-send-email 2.39.1 In-Reply-To: <20231122211200.31620-1-gregory.price@memverge.com> References: <20231122211200.31620-1-gregory.price@memverge.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Expose mempolicy via procfs, and utilize the existing mpol_parse_str format to allow external tasks to change the policies of another task. mpol_parse_str format: <mode>[=3D<flags>][:<nodelist>] valid settings: "prefer" (without a nodemask, aliases to 'local') "prefer:node" "interleave:nodelist" "local" "default" "prefer (many):nodelist" "bind:nodelist" flags are either "=3Dstatic" or "=3Drelative", and cannot be used with "prefer" or "local" ("prefer=3Dflag:nodelist" is valid). 
Signed-off-by: Gregory Price --- fs/proc/Makefile | 1 + fs/proc/base.c | 1 + fs/proc/internal.h | 1 + fs/proc/mempolicy.c | 117 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 fs/proc/mempolicy.c diff --git a/fs/proc/Makefile b/fs/proc/Makefile index bd08616ed8ba..272d22d9022f 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -27,6 +27,7 @@ proc-y +=3D softirqs.o proc-y +=3D namespaces.o proc-y +=3D self.o proc-y +=3D thread_self.o +proc-y +=3D mempolicy.o proc-$(CONFIG_PROC_SYSCTL) +=3D proc_sysctl.o proc-$(CONFIG_NET) +=3D proc_net.o proc-$(CONFIG_PROC_KCORE) +=3D kcore.o diff --git a/fs/proc/base.c b/fs/proc/base.c index dd31e3b6bf77..3eb3d6d81a8e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3279,6 +3279,7 @@ static const struct pid_entry tgid_base_stuff[] =3D { REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), + REG("mempolicy", S_IRUSR|S_IWUSR, proc_mempolicy_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff5..e8e81629a8d8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_oper= ations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_mempolicy_operations; =20 extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, diff --git a/fs/proc/mempolicy.c b/fs/proc/mempolicy.c new file mode 100644 index 000000000000..417c2c8046d9 --- /dev/null +++ b/fs/proc/mempolicy.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_NUMA +#include +#include +#include +#include +#include +#include + 
+#include "internal.h" + +#define MPOL_STR_SIZE 4096 +static ssize_t mempolicy_read_proc(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + struct mempolicy *policy; + char *buffer; + ssize_t rv =3D 0; + size_t outlen; + + buffer =3D kzalloc(MPOL_STR_SIZE, GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + task =3D get_proc_task(file_inode(file)); + if (!task) { + rv =3D -ESRCH; + goto freebuf; + } + + task_lock(task); + policy =3D get_task_policy(task); + mpol_get(policy); + task_unlock(task); + + if (!policy) + goto out; + + mpol_to_str(buffer, MPOL_STR_SIZE, policy); + + buffer[MPOL_STR_SIZE-1] =3D '\0'; + outlen =3D strlen(buffer); + if (outlen < MPOL_STR_SIZE - 1) { + buffer[outlen] =3D '\n'; + buffer[outlen + 1] =3D '\0'; + outlen++; + } + rv =3D simple_read_from_buffer(buf, count, ppos, buffer, outlen); + mpol_put(policy); +out: + put_task_struct(task); +freebuf: + kfree(buffer); + return rv; +} + +static ssize_t mempolicy_write_proc(struct file *file, const char __user *= buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + struct mempolicy *new_policy =3D NULL; + char *mempolicy_str, *nl; + nodemask_t nodes; + int err; + + mempolicy_str =3D kmalloc(count + 1, GFP_KERNEL); + if (!mempolicy_str) + return -ENOMEM; + + if (copy_from_user(mempolicy_str, buf, count)) { + kfree(mempolicy_str); + return -EFAULT; + } + mempolicy_str[count] =3D '\0'; + + /* strip new line characters for simplicity of handling by parser */ + nl =3D strchr(mempolicy_str, '\n'); + if (nl) + *nl =3D '\0'; + nl =3D strchr(mempolicy_str, '\r'); + if (nl) + *nl =3D '\0'; + + err =3D mpol_parse_str(mempolicy_str, &new_policy); + if (err) { + kfree(mempolicy_str); + return err; + } + + /* If no error and no policy, it was 'default', clear node list */ + if (new_policy) + nodes =3D new_policy->nodes; + else + nodes_clear(nodes); + + task =3D get_proc_task(file_inode(file)); + if (!task) { + mpol_put(new_policy); + 
kfree(mempolicy_str); + return -ESRCH; + } + + err =3D replace_mempolicy(task, new_policy, &nodes); + + put_task_struct(task); + kfree(mempolicy_str); + + return err ? err : count; +} + +const struct file_operations proc_mempolicy_operations =3D { + .read =3D mempolicy_read_proc, + .write =3D mempolicy_write_proc, + .llseek =3D noop_llseek, +}; +#endif /* CONFIG_NUMA */ --=20 2.39.1