From nobody Wed Feb 11 22:54:49 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 5F92E23759 for ; Tue, 18 Jun 2024 23:27:00 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753222; cv=none; b=Ig+VCP6G2HNaXV/OaIH0LxoePX9IU63CAFGyJ6ItrwEfMDHLUKwP5jDSR8ul+31zq6mvWiltxR1CrqlyHy8lXKGaICreYIyfKP42vHhuDNQuUdXlfKXkncM743Yto1qklozwXgXygPhKWe7ODa20YwxaQ/m4qra/C1X7maUp+xw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753222; c=relaxed/simple; bh=0vFa6zFjl0vkzalReskAx7/vllP2W2aEst65PwL+gXY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=SMkMDqI6Vfi95c6G7aqz8xvnYwkx9p7yOOsO8qmu+mqlvU+RlWB07zqpaOOBvZ99/fFs30kgmOtzWPmjAZmB2pEc7ed3CcSh/m3Z0uCe8O6tXorrQaIt1wh4bnoiAA1gyoiJf5+orMSn6R1jLVZw+7iksqSvfPQx8iE6TVAP8CM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 5CD461042; Tue, 18 Jun 2024 16:27:24 -0700 (PDT) Received: from e125769.cambridge.arm.com (e125769.cambridge.arm.com [10.1.196.27]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id EECF83F64C; Tue, 18 Jun 2024 16:26:57 -0700 (PDT) From: Ryan Roberts To: Andrew Morton , Chris Li , Kairui Song , "Huang, Ying" , Kalesh Singh , Barry Song , Hugh Dickins , David Hildenbrand Cc: Ryan Roberts , linux-kernel@vger.kernel.org, linux-mm@kvack.org Subject: [RFC PATCH v1 1/5] mm: swap: Simplify end-of-cluster calculation Date: 
Wed, 19 Jun 2024 00:26:41 +0100 Message-ID: <20240618232648.4090299-2-ryan.roberts@arm.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240618232648.4090299-1-ryan.roberts@arm.com> References: <20240618232648.4090299-1-ryan.roberts@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" It's possible that a swap file will have a partial cluster at the end, if the swap size is not a multiple of the cluster size. But this partial cluster will never be marked free and so scan_swap_map_try_ssd_cluster() will never see it. Therefore it can always consider that a cluster ends at the next cluster boundary. This leads to a simplification of the endpoint calculation and removal of an unnecessary conditional. This change has the useful side effect of making lock_cluster() unconditional, which will be used in a later commit. Signed-off-by: Ryan Roberts --- mm/swapfile.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b3e5e384e330..30e79739dfdc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -677,16 +677,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap= _info_struct *si, * check if there is still free entry in the cluster, maintaining * natural alignment. 
*/ - max =3D min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER)); - if (tmp < max) { - ci =3D lock_cluster(si, tmp); - while (tmp < max) { - if (swap_range_empty(si->swap_map, tmp, nr_pages)) - break; - tmp +=3D nr_pages; - } - unlock_cluster(ci); + max =3D ALIGN(tmp + 1, SWAPFILE_CLUSTER); + ci =3D lock_cluster(si, tmp); + while (tmp < max) { + if (swap_range_empty(si->swap_map, tmp, nr_pages)) + break; + tmp +=3D nr_pages; } + unlock_cluster(ci); if (tmp >=3D max) { cluster->next[order] =3D SWAP_NEXT_INVALID; goto new_cluster; -- 2.43.0 From nobody Wed Feb 11 22:54:49 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 555451849E7 for ; Tue, 18 Jun 2024 23:27:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753224; cv=none; b=A6KrC3bpoUamTQTiOHNyw11ypMoJxy0NxMEu5sCXwylokFpFkguFVCaWYKWM86HY5A6wY/F7+H2CCYiqNGuk8AndwFV0ASoayxTtFSsK7sn1bLVrkv3fTd16wg9hsTI96Db45pioaQ0BKSJj5qtqx4DRIqMsHswIQ6+lDqs/3a4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753224; c=relaxed/simple; bh=KPWhWMwn8+fxwlOnQKIDT3FoyE/8dI9V/ENsNSbrzHY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=hXaA4bNZwwX6pGLTdgpexItabwGxlf3FodbXT4+4Af/4Mud/jJ6nUbIRjl7kkzK1frHZj0RyTORCY5y5u6rhY5uhQw9+vfpMxdzaz2oh2wg/2yC03oGdZ0D1FCccW3gd95l7knsh9uj/dZDrxnxIfymgWKGuz7WxSc+7fm1etEQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com 
(Postfix) with ESMTP id 5C3A61474; Tue, 18 Jun 2024 16:27:26 -0700 (PDT) Received: from e125769.cambridge.arm.com (e125769.cambridge.arm.com [10.1.196.27]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id EE2AE3F64C; Tue, 18 Jun 2024 16:26:59 -0700 (PDT) From: Ryan Roberts To: Andrew Morton , Chris Li , Kairui Song , "Huang, Ying" , Kalesh Singh , Barry Song , Hugh Dickins , David Hildenbrand Cc: Ryan Roberts , linux-kernel@vger.kernel.org, linux-mm@kvack.org Subject: [RFC PATCH v1 2/5] mm: swap: Change SWAP_NEXT_INVALID to highest value Date: Wed, 19 Jun 2024 00:26:42 +0100 Message-ID: <20240618232648.4090299-3-ryan.roberts@arm.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240618232648.4090299-1-ryan.roberts@arm.com> References: <20240618232648.4090299-1-ryan.roberts@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" We are about to introduce a scanning mechanism that can present 0 as a valid cluster offset to scan_swap_map_try_ssd_cluster(), so let's change SWAP_NEXT_INVALID to UINT_MAX, which is always invalid as an offset in practice. Signed-off-by: Ryan Roberts --- include/linux/swap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index bd450023b9a4..66566251ba31 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -261,12 +261,12 @@ struct swap_cluster_info { #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ /* - * The first page in the swap file is the swap header, which is always mar= ked - * bad to prevent it from being allocated as an entry. This also prevents = the - * cluster to which it belongs being marked free. Therefore 0 is safe to u= se as - * a sentinel to indicate next is not valid in percpu_cluster. 
+ * swap_info_struct::max is an unsigned int, so the maximum number of page= s in + * the swap file is UINT_MAX. Therefore the highest legitimate index is + * UINT_MAX-1. Therefore UINT_MAX is safe to use as a sentinel to indicate= next + * is not valid in percpu_cluster. */ -#define SWAP_NEXT_INVALID 0 +#define SWAP_NEXT_INVALID UINT_MAX #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) -- 2.43.0 From nobody Wed Feb 11 22:54:49 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 5ECBA23759 for ; Tue, 18 Jun 2024 23:27:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753226; cv=none; b=mV6CrHuowSjYXbjojOz1Gfo1r2yUF6sBO9BNHLHfs36PHOm75Z/RnQhPBXfsRUkPo7AZbA5zxoGn1K8PalU+BP/GNmIq4TZQPez0RLpgr8NqdWl1pryaRmRg9UeA3h2CKrkxH6tcXzGWw3Imdn4+nmVnFOUdIwoIplG5yd6RNF0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753226; c=relaxed/simple; bh=OvWkGAiTCnldUmxHd3cu95ToTjRntwaLtWx2AD+68yM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=TEvNqa5IBPK1N6FQlFPZCSgV2cGQ1AlFqOsLJyS7Qwh3BYC+iRLgaOM2PARmhPa2WPp40WGz0OXJ63lTHvfWndkr2p6yWodvOdj0dj3SpaujlLvbglJwDgE2TkDQ8E76aXjzedpjdF/gNaVs9P9ZSjrf0ffs8ZWCHJixMXu2dcc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 5D21A1477; Tue, 18 Jun 2024 16:27:28 -0700 (PDT) Received: from e125769.cambridge.arm.com (e125769.cambridge.arm.com 
[10.1.196.27]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id ED9C93F64C; Tue, 18 Jun 2024 16:27:01 -0700 (PDT) From: Ryan Roberts To: Andrew Morton , Chris Li , Kairui Song , "Huang, Ying" , Kalesh Singh , Barry Song , Hugh Dickins , David Hildenbrand Cc: Ryan Roberts , linux-kernel@vger.kernel.org, linux-mm@kvack.org Subject: [RFC PATCH v1 3/5] mm: swap: Track allocation order for clusters Date: Wed, 19 Jun 2024 00:26:43 +0100 Message-ID: <20240618232648.4090299-4-ryan.roberts@arm.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240618232648.4090299-1-ryan.roberts@arm.com> References: <20240618232648.4090299-1-ryan.roberts@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add an `order` field to `struct swap_cluster_info`, which applies to allocated clusters (i.e. those not on the free list) and tracks the swap entry order that the cluster should be used to allocate. A future commit will use this information to scan partially filled clusters to find appropriate free swap entries for allocation. Note that it is still possible that order-0 swap entries will be allocated in clusters that indicate a higher order due to the order-0 scanning mechanism. The maximum order we ever expect to see is 13 - PMD-size on arm64 with 64K base pages. 13 fits into 4 bits, so let's steal 4 unused flags bits for this purpose to avoid making `struct swap_cluster_info` any bigger. 
Signed-off-by: Ryan Roberts --- include/linux/swap.h | 3 ++- mm/swapfile.c | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 66566251ba31..2a40fe02d281 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -255,7 +255,8 @@ struct swap_cluster_info { * cluster */ unsigned int data:24; - unsigned int flags:8; + unsigned int flags:4; + unsigned int order:4; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 30e79739dfdc..7b13f02a7ac2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -307,11 +307,13 @@ static inline void cluster_set_count(struct swap_clus= ter_info *info, info->data =3D c; } -static inline void cluster_set_count_flag(struct swap_cluster_info *info, - unsigned int c, unsigned int f) +static inline void cluster_set_count_flag_order(struct swap_cluster_info *= info, + unsigned int c, unsigned int f, + unsigned int o) { info->flags =3D f; info->data =3D c; + info->order =3D o; } static inline unsigned int cluster_next(struct swap_cluster_info *info) @@ -330,6 +332,7 @@ static inline void cluster_set_next_flag(struct swap_cl= uster_info *info, { info->flags =3D f; info->data =3D n; + info->order =3D 0; } static inline bool cluster_is_free(struct swap_cluster_info *info) @@ -346,6 +349,7 @@ static inline void cluster_set_null(struct swap_cluster= _info *info) { info->flags =3D CLUSTER_FLAG_NEXT_NULL; info->data =3D 0; + info->order =3D 0; } static inline struct swap_cluster_info *lock_cluster(struct swap_info_stru= ct *si, @@ -521,13 +525,14 @@ static void swap_users_ref_free(struct percpu_ref *re= f) complete(&si->comp); } -static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx, + int order) { struct swap_cluster_info *ci =3D 
si->cluster_info; VM_BUG_ON(cluster_list_first(&si->free_clusters) !=3D idx); cluster_list_del_first(&si->free_clusters, ci); - cluster_set_count_flag(ci + idx, 0, 0); + cluster_set_count_flag_order(ci + idx, 0, 0, order); } static void free_cluster(struct swap_info_struct *si, unsigned long idx) @@ -556,14 +561,15 @@ static void free_cluster(struct swap_info_struct *si,= unsigned long idx) */ static void add_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr, - unsigned long count) + int order) { unsigned long idx =3D page_nr / SWAPFILE_CLUSTER; + unsigned long count =3D 1 << order; if (!cluster_info) return; if (cluster_is_free(&cluster_info[idx])) - alloc_cluster(p, idx); + alloc_cluster(p, idx, order); VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], @@ -577,7 +583,7 @@ static void add_cluster_info_page(struct swap_info_stru= ct *p, static void inc_cluster_info_page(struct swap_info_struct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr) { - add_cluster_info_page(p, cluster_info, page_nr, 1); + add_cluster_info_page(p, cluster_info, page_nr, 0); } /* @@ -964,7 +970,7 @@ static int scan_swap_map_slots(struct swap_info_struct = *si, goto done; } memset(si->swap_map + offset, usage, nr_pages); - add_cluster_info_page(si, si->cluster_info, offset, nr_pages); + add_cluster_info_page(si, si->cluster_info, offset, order); unlock_cluster(ci); swap_range_alloc(si, offset, nr_pages); @@ -1060,7 +1066,7 @@ static void swap_free_cluster(struct swap_info_struct= *si, unsigned long idx) ci =3D lock_cluster(si, offset); memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER); - cluster_set_count_flag(ci, 0, 0); + cluster_set_count_flag_order(ci, 0, 0, 0); free_cluster(si, idx); unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); -- 2.43.0 From nobody Wed Feb 11 22:54:49 2026 Received: from foss.arm.com (foss.arm.com 
[217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id D45A11891A6 for ; Tue, 18 Jun 2024 23:27:05 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753227; cv=none; b=Wrpvlbs36nkGiMj8SRxYACd/y5tIV/Rx1w0Db6bMLCL/Q5PR0zFCH6whr0uR0Icsu5KQBlSHoQW+iGMqIAMvkI0S9CB8HMWn0bXo3CYTYOJddgo1XEoLAJOoGrTe7CNBYqT6FZ/kxNaPs2it+n2cbjFKlJcApmaZmBDCz/KetTw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753227; c=relaxed/simple; bh=oW/XrdstAv09FNMedV19nsUUJDgFoszN7+cijJf697A=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=giLqEIz7E0iwj0BDN84HbQu6DM6iGPCTrgbkdNsYjyjWuL3NUYuBy0LaWFV12hBOYZhuBQNd4y9cvjDalyaXqUbPzhcgDvmG3+gzOy9rZqHTN7cJSAye7ibLb+I0mSzawHH8czkNkcUe9k0T5j71kJDCMTBG9aUP4hdkuyLAUBk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 5AD4014BF; Tue, 18 Jun 2024 16:27:30 -0700 (PDT) Received: from e125769.cambridge.arm.com (e125769.cambridge.arm.com [10.1.196.27]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id ECDFD3F64C; Tue, 18 Jun 2024 16:27:03 -0700 (PDT) From: Ryan Roberts To: Andrew Morton , Chris Li , Kairui Song , "Huang, Ying" , Kalesh Singh , Barry Song , Hugh Dickins , David Hildenbrand Cc: Ryan Roberts , linux-kernel@vger.kernel.org, linux-mm@kvack.org Subject: [RFC PATCH v1 4/5] mm: swap: Scan for free swap entries in allocated clusters Date: Wed, 19 Jun 2024 00:26:44 +0100 Message-ID: 
<20240618232648.4090299-5-ryan.roberts@arm.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240618232648.4090299-1-ryan.roberts@arm.com> References: <20240618232648.4090299-1-ryan.roberts@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Previously mTHP would only be swapped out if a CPU could allocate itself a free cluster from which to allocate mTHP-sized contiguous swap entry blocks. But for a system making heavy use of swap, after a while fragmentation ensures there are no available free clusters and therefore the swap entry allocation fails and forces the mTHP to be split to base pages which then get swap entries allocated by scanning the swap file for free individual pages. But when swap entries are freed, this makes holes in the clusters, and often it would be possible to allocate new mTHP swap entries in those holes. So if we fail to allocate a free cluster, scan through the clusters until we find one that is in use and contains swap entries of the order we require. Then scan it until we find a suitably sized and aligned hole. We keep a per-order "next cluster to scan" pointer so that future scanning can be picked up from where we last left off. And if we scan through all clusters without finding a suitable hole, we give up to prevent live lock. Running the test case provided by Barry Song at the below link, I can see swpout fallback rate, which was previously 100% after a few iterations, falls to 0% and stays there for all 100 iterations. This is also the case when sprinkling in some non-mTHP allocations ("-s") too. 
Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/linux-mm/20240615084714.37499-1-21cnbao@gmail= .com/ --- include/linux/swap.h | 2 + mm/swapfile.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2a40fe02d281..34ec4668a5c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,6 +310,8 @@ struct swap_info_struct { unsigned int cluster_nr; /* countdown to next cluster search */ unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocati= on */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap locatio= n */ + struct swap_cluster_info *next_order_scan[SWAP_NR_ORDERS]; + /* Start cluster for next order-based scan */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 7b13f02a7ac2..24db03db8830 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -644,6 +644,84 @@ static inline bool swap_range_empty(char *swap_map, un= signed int start, return true; } +static inline +struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, + unsigned int offset) +{ + VM_WARN_ON(!si->cluster_info); + return si->cluster_info + (offset / SWAPFILE_CLUSTER); +} + +static inline +unsigned int cluster_to_offset(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + VM_WARN_ON(!si->cluster_info); + return (ci - si->cluster_info) * SWAPFILE_CLUSTER; +} + +static inline +struct swap_cluster_info *next_cluster_circular(struct swap_info_struct *s= i, + struct swap_cluster_info *ci) +{ + struct swap_cluster_info *last; + + /* + * Wrap after the last whole cluster; never return the final partial + * cluster because users assume an entire cluster is accessible. + */ + last =3D offset_to_cluster(si, si->max) - 1; + return ci =3D=3D last ? 
si->cluster_info : ++ci; +} + +static inline +struct swap_cluster_info *prev_cluster_circular(struct swap_info_struct *s= i, + struct swap_cluster_info *ci) +{ + struct swap_cluster_info *last; + + /* + * Wrap to the last whole cluster; never return the final partial + * cluster because users assume an entire cluster is accessible. + */ + last =3D offset_to_cluster(si, si->max) - 1; + return ci =3D=3D si->cluster_info ? last : --ci; +} + +/* + * Returns the offset of the next cluster, allocated to contain swap entri= es of + * `order`, that is eligible to scan for free space. On first call, *stop = should + * be set to SWAP_NEXT_INVALID to indicate the clusters should be scanned = all + * the way back around to the returned cluster. The function updates *stop= upon + * first call and consumes it in subsequent calls. Returns SWAP_NEXT_INVAL= ID if + * no such clusters are available. Must be called with si lock held. + */ +static unsigned int next_cluster_for_scan(struct swap_info_struct *si, + int order, unsigned int *stop) +{ + struct swap_cluster_info *ci; + struct swap_cluster_info *end; + + ci =3D si->next_order_scan[order]; + if (*stop =3D=3D SWAP_NEXT_INVALID) + *stop =3D cluster_to_offset(si, prev_cluster_circular(si, ci)); + end =3D offset_to_cluster(si, *stop); + + while (ci !=3D end) { + if ((ci->flags & CLUSTER_FLAG_FREE) =3D=3D 0 && ci->order =3D=3D order) + break; + ci =3D next_cluster_circular(si, ci); + } + + if (ci =3D=3D end) { + si->next_order_scan[order] =3D ci; + return SWAP_NEXT_INVALID; + } + + si->next_order_scan[order] =3D next_cluster_circular(si, ci); + return cluster_to_offset(si, ci); +} + /* * Try to get swap entries with specified order from current cpu's swap en= try * pool (a cluster). 
This might involve allocating a new cluster for curre= nt CPU @@ -656,6 +734,7 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_i= nfo_struct *si, struct percpu_cluster *cluster; struct swap_cluster_info *ci; unsigned int tmp, max; + unsigned int stop =3D SWAP_NEXT_INVALID; new_cluster: cluster =3D this_cpu_ptr(si->percpu_cluster); @@ -674,6 +753,15 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_= info_struct *si, *scan_base =3D this_cpu_read(*si->cluster_next_cpu); *offset =3D *scan_base; goto new_cluster; + } else if (nr_pages < SWAPFILE_CLUSTER) { + /* + * There is no point in scanning for free areas the same + * size as the cluster, since the cluster would have + * already been freed in that case. + */ + tmp =3D next_cluster_for_scan(si, order, &stop); + if (tmp =3D=3D SWAP_NEXT_INVALID) + return false; } else return false; } @@ -2392,6 +2480,8 @@ static void setup_swap_info(struct swap_info_struct *= p, int prio, } p->swap_map =3D swap_map; p->cluster_info =3D cluster_info; + for (i =3D 0; i < SWAP_NR_ORDERS; i++) + p->next_order_scan[i] =3D cluster_info; } static void _enable_swap_info(struct swap_info_struct *p) -- 2.43.0 From nobody Wed Feb 11 22:54:49 2026 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 8B2D61891DF for ; Tue, 18 Jun 2024 23:27:08 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753230; cv=none; b=FxOK4EcD2cAkJ9+1jYcHeHau/WL0759nMhNoYpcy7P10IZhYzru9eR+tvw4Ja4I6uSsHdKAAY+yD+OIpu1W7xGj1fY07PXyp2juequG/Yt0o+UooDbjuv4ExOjrYOZDFNhLZL5laJjMWBntYm/LKIkrh9e+khN1EXbDqyl1CmxM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1718753230; c=relaxed/simple; bh=Znl4Phg+yGQ/MP0Ne9Cpu0oIP0Wn8eDukS41b/ym+lM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=pPc23c8YPA2u8sGMq6ugUKNaGuhX+/NfqjnKUXIrnXqA4t+rDKFmbppwYX/HDKD9VZbPT5EdyG2GSlqUi4CBtfZqToSXZnutYxdDU6eu8m5sEt7Mbw17rL4o892DylpnVfFE95mFHNwJV7jr7tNhBvfiSkgXq3xAoQ30eY15Wkg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 5A856150C; Tue, 18 Jun 2024 16:27:32 -0700 (PDT) Received: from e125769.cambridge.arm.com (e125769.cambridge.arm.com [10.1.196.27]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id EC8073F64C; Tue, 18 Jun 2024 16:27:05 -0700 (PDT) From: Ryan Roberts To: Andrew Morton , Chris Li , Kairui Song , "Huang, Ying" , Kalesh Singh , Barry Song , Hugh Dickins , David Hildenbrand Cc: Ryan Roberts , linux-kernel@vger.kernel.org, linux-mm@kvack.org Subject: [RFC PATCH v1 5/5] mm: swap: Optimize per-order cluster scanning Date: Wed, 19 Jun 2024 00:26:45 +0100 Message-ID: <20240618232648.4090299-6-ryan.roberts@arm.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20240618232648.4090299-1-ryan.roberts@arm.com> References: <20240618232648.4090299-1-ryan.roberts@arm.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add CLUSTER_FLAG_SKIP_SCAN cluster flag, which is applied to a cluster under 1 of 2 conditions. When present, the cluster will be skipped during a scan. - When the number of free entries is less than the number of entries that would be required for a new allocation of the order that the cluster serves. 
- When scanning completes for the cluster, and no further scanners are active for the cluster and no swap entries were freed for the cluster since the last scan began. In this case, it has been proven that there are no contiguous free entries of sufficient size to allocate the order that the cluster serves. In this case the cluster is made eligible for scanning again when the next entry is freed. The latter is implemented to permit multiple CPUs to scan the same cluster, which in turn guarantees that if there is a free block available in a cluster allocated for the desired order then it will be allocated on a first come, first served basis. As a result, the number of active scanners for a cluster must be tracked, costing 4 bytes per cluster. Signed-off-by: Ryan Roberts --- include/linux/swap.h | 3 +++ mm/swapfile.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 34ec4668a5c9..40c308749e79 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -257,9 +257,12 @@ struct swap_cluster_info { unsigned int data:24; unsigned int flags:4; unsigned int order:4; + unsigned int nr_scanners; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_SKIP_SCAN 4 /* Skip cluster for per-order scan */ +#define CLUSTER_FLAG_DECREMENT 8 /* A swap entry was freed from cluster */ /* * swap_info_struct::max is an unsigned int, so the maximum number of page= s in diff --git a/mm/swapfile.c b/mm/swapfile.c index 24db03db8830..caf382b4ecd3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -574,6 +574,9 @@ static void add_cluster_info_page(struct swap_info_stru= ct *p, VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) + count); + + if (SWAPFILE_CLUSTER - cluster_count(&cluster_info[idx]) < 
count) + cluster_info[idx].flags |=3D CLUSTER_FLAG_SKIP_SCAN; } /* @@ -595,6 +598,7 @@ static void dec_cluster_info_page(struct swap_info_stru= ct *p, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx =3D page_nr / SWAPFILE_CLUSTER; + unsigned long count =3D 1 << cluster_info[idx].order; if (!cluster_info) return; @@ -603,6 +607,10 @@ static void dec_cluster_info_page(struct swap_info_str= uct *p, cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); + cluster_info[idx].flags |=3D CLUSTER_FLAG_DECREMENT; + if (SWAPFILE_CLUSTER - cluster_count(&cluster_info[idx]) >=3D count) + cluster_info[idx].flags &=3D ~CLUSTER_FLAG_SKIP_SCAN; + if (cluster_count(&cluster_info[idx]) =3D=3D 0) free_cluster(p, idx); } @@ -708,7 +716,8 @@ static unsigned int next_cluster_for_scan(struct swap_i= nfo_struct *si, end =3D offset_to_cluster(si, *stop); while (ci !=3D end) { - if ((ci->flags & CLUSTER_FLAG_FREE) =3D=3D 0 && ci->order =3D=3D order) + if ((ci->flags & (CLUSTER_FLAG_SKIP_SCAN | CLUSTER_FLAG_FREE)) =3D=3D 0 + && ci->order =3D=3D order) break; ci =3D next_cluster_circular(si, ci); } @@ -722,6 +731,21 @@ static unsigned int next_cluster_for_scan(struct swap_= info_struct *si, return cluster_to_offset(si, ci); } +static inline void cluster_inc_scanners(struct swap_cluster_info *ci) +{ + /* Protected by si lock. */ + ci->nr_scanners++; + ci->flags &=3D ~CLUSTER_FLAG_DECREMENT; +} + +static inline void cluster_dec_scanners(struct swap_cluster_info *ci) +{ + /* Protected by si lock. */ + ci->nr_scanners--; + if (ci->nr_scanners =3D=3D 0 && (ci->flags & CLUSTER_FLAG_DECREMENT) =3D= =3D 0) + ci->flags |=3D CLUSTER_FLAG_SKIP_SCAN; +} + /* * Try to get swap entries with specified order from current cpu's swap en= try * pool (a cluster). 
This might involve allocating a new cluster for curre= nt CPU @@ -764,6 +788,8 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_i= nfo_struct *si, return false; } else return false; + + cluster_inc_scanners(offset_to_cluster(si, tmp)); } /* @@ -780,13 +806,19 @@ static bool scan_swap_map_try_ssd_cluster(struct swap= _info_struct *si, } unlock_cluster(ci); if (tmp >=3D max) { + cluster_dec_scanners(ci); cluster->next[order] =3D SWAP_NEXT_INVALID; goto new_cluster; } *offset =3D tmp; *scan_base =3D tmp; tmp +=3D nr_pages; - cluster->next[order] =3D tmp < max ? tmp : SWAP_NEXT_INVALID; + if (tmp >=3D max) { + cluster_dec_scanners(ci); + cluster->next[order] =3D SWAP_NEXT_INVALID; + } else { + cluster->next[order] =3D tmp; + } return true; } -- 2.43.0