From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BDB4013EFF4 for ; Wed, 14 Feb 2024 20:44:52 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.129.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943494; cv=none; b=R+gzFkWeYJ1W9XnpGm06DwhQ12Y/YrvA/RYw06iSn0yJ0VlAgU8Hk6Ue+iJarSlGKV1X9sqEUxvrG2m14P9ujppqYzQkDGPlGfU25YpVdCBoiG7Gy0jEeJvwn5U8jjzvH/D6EBSMRSYMjE17SYE5tzS6xdfTltXbOBb8/GrSpd8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943494; c=relaxed/simple; bh=7HbJDHbNEg6CVfsuVyc/qkeafaxTSnyk1xYc8prFL0c=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=irXLmORR36EviLBzR9xW267HXSviNf7rWfaxmwDR9mGkRs4MgmC6tm67SfNzrYhSZvKjpVYwaQFLBSERy4CjNl/SJwHPowsIN8SpJEDBkuC9R+xsaIcmiQ5AjIb1cW71o8HLP1jN/jYA+4jMYo9YSiJECn9k07QV6QM1z1Pc/ZE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=a2AwhUFz; arc=none smtp.client-ip=170.10.129.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="a2AwhUFz" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943491; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=dEQuQbE8PusGeIN6wyahI5tWOdzdv7mIipcs9sgG+zs=; b=a2AwhUFzhRV/eG1TZMNiQUiTgShreLYj0RtSOikqu4mdgcwEhPC8eS2/FiZB2ybaPFungN D00cCP4T+U7MdPHd/HIJzslfQg6SRHiynHlCKfjmPYGyKTUMYBAhFJlZU2RX3f2GzTYrv7 eqvXCR9r0IbapTgj0Rrb58j8nb3c/eM= Received: from mimecast-mx02.redhat.com (mx-ext.redhat.com [66.187.233.73]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-373--0kZEuV3Pfig40WvEuXQ9Q-1; Wed, 14 Feb 2024 15:44:46 -0500 X-MC-Unique: -0kZEuV3Pfig40WvEuXQ9Q-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 05DB528B6A1E; Wed, 14 Feb 2024 20:44:45 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id 5C51A1C066A9; Wed, 14 Feb 2024 20:44:41 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. 
Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 01/10] mm/memory: factor out zapping of present pte into zap_present_pte() Date: Wed, 14 Feb 2024 21:44:26 +0100 Message-ID: <20240214204435.167852-2-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" Let's prepare for further changes by factoring out processing of present PTEs. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- mm/memory.c | 94 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7c3ca41a7610..5b0dc33133a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1532,13 +1532,61 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct= *vma, pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); } =20 +static inline void zap_present_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned long addr, struct zap_details *details, + int *rss, bool *force_flush, bool *force_break) +{ + struct mm_struct *mm =3D tlb->mm; + struct folio *folio =3D NULL; + bool delay_rmap =3D false; + struct page *page; + + page =3D vm_normal_page(vma, addr, ptent); + if (page) + folio =3D page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return; + ptent =3D ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + if (unlikely(!page)) { + ksm_might_unmap_zero_page(mm, ptent); + return; + } + + if (!folio_test_anon(folio)) { + if (pte_dirty(ptent)) { + folio_mark_dirty(folio); + if (tlb_delay_rmap(tlb)) { + delay_rmap =3D true; + *force_flush =3D true; + } + } + if (pte_young(ptent) && likely(vma_has_recency(vma))) + folio_mark_accessed(folio); + } + rss[mm_counter(folio)]--; + if (!delay_rmap) { + folio_remove_rmap_pte(folio, page, vma); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + *force_flush =3D true; + *force_break =3D true; + } +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { + bool force_flush =3D false, force_break =3D false; struct mm_struct *mm =3D tlb->mm; - int force_flush =3D 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; @@ -1555,7 +1603,7 @@ static unsigned long zap_pte_range(struct mmu_gather = *tlb, arch_enter_lazy_mmu_mode(); do { pte_t ptent =3D ptep_get(pte); - struct folio *folio =3D NULL; + struct folio *folio; struct page *page; =20 if (pte_none(ptent)) @@ -1565,45 +1613,9 @@ static unsigned long zap_pte_range(struct mmu_gather= *tlb, break; =20 if (pte_present(ptent)) { - unsigned int delay_rmap; - - page =3D vm_normal_page(vma, addr, ptent); - if (page) - folio =3D page_folio(page); - - if (unlikely(!should_zap_folio(details, folio))) - continue; - ptent =3D ptep_get_and_clear_full(mm, addr, pte, - 
tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, - ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - continue; - } - - delay_rmap =3D 0; - if (!folio_test_anon(folio)) { - if (pte_dirty(ptent)) { - folio_mark_dirty(folio); - if (tlb_delay_rmap(tlb)) { - delay_rmap =3D 1; - force_flush =3D 1; - } - } - if (pte_young(ptent) && likely(vma_has_recency(vma))) - folio_mark_accessed(folio); - } - rss[mm_counter(folio)]--; - if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { - force_flush =3D 1; + zap_present_pte(tlb, vma, pte, ptent, addr, details, + rss, &force_flush, &force_break); + if (unlikely(force_break)) { addr +=3D PAGE_SIZE; break; } --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 18A6513DBBC for ; Wed, 14 Feb 2024 20:44:54 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943496; cv=none; b=Y3V+yIn06K/QstJVfZqzPnt/iOFoqtnJluMeJolj39Bp39NOq81Lzan6Qo0hSfTEqZIBQ+hy/PzcsfxTWOxuLJRJKtRJ8+ZVp2VRqFC5df2W3XCvgkbWwmaGXyh/Zo64rU6eVpwtsABytcD96FdEc9D4QbmgGgVY8Gl9K9TLDB4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943496; c=relaxed/simple; bh=iKdLPd0l+BlZy4NkWR4PBv4pdjA5DJYrKJe9pxJlmq0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=dCoi4CZ7czrQ0+oxLVAMmUE3EXlMZkyWYMgt8/YiZd42WXBTZdFZx4z+DnSiQMVrsz+GW8BBNN6dca1DwUm4IZvzH+TKqqcU6y8YunovGeJ5TF8gjfeIumeLLdNopxlhuGRTf3LfohDXJ6wMy50fLe7zF0iToeNYPR2/p0jnKm8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=W1RBDJtB; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="W1RBDJtB" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943493; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=fqQsnr9EFuS72Lg9egurC5IFPM5MYMM8dNEJpcWm/gk=; b=W1RBDJtBwg9W+BfbT6HUgiiUvjjcIhM0oPUPegvDgLyLaH1uQ/3AzflJdVNzp2tC4YUDsy uRhtK9g5gZ9McblshTAH4b4FlMMINq8sVeqO9aDns+7hQrQuI7QQL/fzhA8jaluOp7a7zs x97Xozf/J+DBpyapPn2gKITmhtTbtJY= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-226-pgh-LenmOMa4VgIEuXmFnA-1; Wed, 14 Feb 2024 15:44:50 -0500 X-MC-Unique: pgh-LenmOMa4VgIEuXmFnA-1 Received: from smtp.corp.redhat.com 
(int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 0A351185A782; Wed, 14 Feb 2024 20:44:49 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id 438201C066AA; Wed, 14 Feb 2024 20:44:45 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 02/10] mm/memory: handle !page case in zap_present_pte() separately Date: Wed, 14 Feb 2024 21:44:27 +0100 Message-ID: <20240214204435.167852-3-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" We don't need uptodate accessed/dirty bits, so in theory we could replace ptep_get_and_clear_full() by an optimized ptep_clear_full() function. Let's rely on the provided pte. Further, there is no scenario where we would have to insert uffd-wp markers when zapping something that is not a normal page (i.e., zeropage). Add a sanity check to make sure this remains true. should_zap_folio() no longer has to handle NULL pointers. This change replaces 2/3 "!page/!folio" checks by a single "!page" one. Note that arch_check_zapped_pte() on x86-64 checks the HW-dirty bit to detect shadow stack entries. But for shadow stack entries, the HW dirty bit (in combination with non-writable PTEs) is set by software. So for the arch_check_zapped_pte() check, we don't have to sync against HW setting the HW dirty bit concurrently, it is always set. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5b0dc33133a6..4da6923709b2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1497,10 +1497,6 @@ static inline bool should_zap_folio(struct zap_detai= ls *details, if (should_zap_cows(details)) return true; =20 - /* E.g. the caller passes NULL for the case of a zero folio */ - if (!folio) - return true; - /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } @@ -1538,24 +1534,28 @@ static inline void zap_present_pte(struct mmu_gathe= r *tlb, int *rss, bool *force_flush, bool *force_break) { struct mm_struct *mm =3D tlb->mm; - struct folio *folio =3D NULL; bool delay_rmap =3D false; + struct folio *folio; struct page *page; =20 page =3D vm_normal_page(vma, addr, ptent); - if (page) - folio =3D page_folio(page); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. 
*/ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + ksm_might_unmap_zero_page(mm, ptent); + return; + } =20 + folio =3D page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return; ptent =3D ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - if (unlikely(!page)) { - ksm_might_unmap_zero_page(mm, ptent); - return; - } =20 if (!folio_test_anon(folio)) { if (pte_dirty(ptent)) { --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 98A0013F013 for ; Wed, 14 Feb 2024 20:44:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943500; cv=none; b=YheeeLTSi+JE3MKH2L+67+awVi1HsOq8I8Uur2F7r+UYZglHLH6yGElGp7hQ6xBJx8KynTDw2BkOiclDJkuQrrHG9WwrlRSM/AWJH8blA/+IwKscdfrBYzyloHo4spbkbnI31Jev25jDuSESdZ6B9y9fWZdsKSbb+aBYEgMvWU8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943500; c=relaxed/simple; bh=gJjdcDN9SIZ57EMSsnxvz4s6wGh9ngGev4G0JhDn5Lw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=rHWEGwB/MRJvgYRGPAYnJl5QHMjIk4xfJqbQ2jsWVy7+zwWV5Ac/tbu9En/P5tFn7uf4yIYhp5RZZclz/huTieFSFaHCo5faDGWl+Q2IMF88GPT4xeFKceScL/z099fBpbikN5NxPe1npFyjrhRVPTeg8N6YacW6N9id5gkrhh0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=jPkqsZIp; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="jPkqsZIp" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943497; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=M/Ux9HRgPwmdR3zuVPtEfBlFP5wVpYCc/8Whwmc2o7c=; b=jPkqsZIpAhXubwcPIQoGJd4sJN/P/hl1VJhVEMIqk+U2e7zkIQu8pwMZJD+jV+eMSqHyKS 3xjOGiTSolq9lVvkwxGzLYsKIVo882ujB49Wdj6CzlZf0PhIGQxpCP+fKkUXEtBKZz1Gbv oWCEEZyKBNE7EnIExK4hDC2JALzpXvw= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-460-Ud6bEsrlM92gS6OHI6pxZA-1; Wed, 14 Feb 2024 15:44:54 -0500 X-MC-Unique: Ud6bEsrlM92gS6OHI6pxZA-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id DDD578630C4; Wed, 14 
Feb 2024 20:44:52 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id 4741B1C060B1; Wed, 14 Feb 2024 20:44:49 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 03/10] mm/memory: further separate anon and pagecache folio handling in zap_present_pte() Date: Wed, 14 Feb 2024 21:44:28 +0100 Message-ID: <20240214204435.167852-4-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" We don't need up-to-date accessed-dirty information for anon folios and can simply work with the ptent we already have. Also, we know the RSS counter we want to update. We can safely move arch_check_zapped_pte() + tlb_remove_tlb_entry() + zap_install_uffd_wp_if_needed() after updating the folio and RSS. While at it, only call zap_install_uffd_wp_if_needed() if there is even any chance that pte_install_uffd_wp_if_needed() would do *something*. That is, just don't bother if uffd-wp does not apply. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- mm/memory.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 4da6923709b2..7a3ebb6e5909 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1552,12 +1552,9 @@ static inline void zap_present_pte(struct mmu_gather= *tlb, folio =3D page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return; - ptent =3D ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); =20 if (!folio_test_anon(folio)) { + ptent =3D ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1567,8 +1564,17 @@ static inline void zap_present_pte(struct mmu_gather= *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); + rss[mm_counter(folio)]--; + } else { + /* We don't need up-to-date accessed/dirty bits. 
*/ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + rss[MM_ANONPAGES]--; } - rss[mm_counter(folio)]--; + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(userfaultfd_pte_wp(vma, ptent))) + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + if (!delay_rmap) { folio_remove_rmap_pte(folio, page, vma); if (unlikely(page_mapcount(page) < 0)) --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7D75C141991 for ; Wed, 14 Feb 2024 20:45:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943504; cv=none; b=eQdEs5bUNnCo2F8724RM3n9+HPUg6KNHQTLYXI6zjOmKxsBOZ8uQS+/nAYp7mplI+42KIW3ba1b3JX9sGeU2EjB79rnmfyYFyup063PD42DYI+KfdzYW/bdhRPVUzEsv6ijMhEawV/G7D6L33c7UYr3S746jQKVavcz4GRL5qqU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943504; c=relaxed/simple; bh=X688LQ+8vv1x+HYbDq06JeZc4jgkCPC+TN2xr2DyFa8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Id3ldX7IVE68BubyxAk1z6XFAvFSuQP01TlVWDQyACAakJGCK/G4YEsHufECcjZAz6kL6xZEjLw5rZgkFLFPdcmV7SPT8qkQfbR6aLYsCBJGGPWsHSvPb6IR/MCDdu4ST8Y9jXHdKVA4ObSurYHKdv2JXQE671zZdZtVCtKHOJk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=QCbJHDSn; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="QCbJHDSn" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943501; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=SyWghWuBP6GkqnhxcyLXYq2ssX4b7HML4Pf9katCZ5k=; b=QCbJHDSnTubM8kNwtxKITUDly1CJsT2kTpkPEi5qJNh2YwoLkQ4axBRYx907HOnN3Rnyyi 83isEcpNaoznWLdxCFSr7oy6ywErQRIk+v6z/T+I2i75qpZ6RTEoM4yMatMdM0G638zvEC LdEr7jUOZgAYtyZnvGf3qseaGrW+8Yw= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-608-cfALmccYMXOB9OP0YG_4hQ-1; Wed, 14 Feb 2024 15:44:58 -0500 X-MC-Unique: cfALmccYMXOB9OP0YG_4hQ-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id B0CEB10AFD6E; Wed, 14 Feb 2024 20:44:56 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id 244661C066A9; Wed, 14 Feb 2024 20:44:53 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: 
linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 04/10] mm/memory: factor out zapping folio pte into zap_present_folio_pte() Date: Wed, 14 Feb 2024 21:44:29 +0100 Message-ID: <20240214204435.167852-5-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" Let's prepare for further changes by factoring it out into a separate function. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- mm/memory.c | 53 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7a3ebb6e5909..a3efc4da258a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1528,30 +1528,14 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct= *vma, pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); } =20 -static inline void zap_present_pte(struct mmu_gather *tlb, - struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) +static inline void zap_present_folio_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, struct folio *folio, + struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { struct mm_struct *mm =3D tlb->mm; bool delay_rmap =3D false; - struct folio *folio; - struct page *page; - - page =3D vm_normal_page(vma, addr, ptent); - if (!page) { - /* We don't need up-to-date accessed/dirty bits. */ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); - VM_WARN_ON_ONCE(userfaultfd_wp(vma)); - ksm_might_unmap_zero_page(mm, ptent); - return; - } - - folio =3D page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - return; =20 if (!folio_test_anon(folio)) { ptent =3D ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); @@ -1586,6 +1570,33 @@ static inline void zap_present_pte(struct mmu_gather= *tlb, } } =20 +static inline void zap_present_pte(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned long addr, struct zap_details *details, + int *rss, bool *force_flush, bool *force_break) +{ + struct mm_struct *mm =3D tlb->mm; + struct folio *folio; + struct page *page; + + page =3D vm_normal_page(vma, addr, ptent); + if (!page) { + /* We don't need up-to-date accessed/dirty bits. 
*/ + ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + arch_check_zapped_pte(vma, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + ksm_might_unmap_zero_page(mm, ptent); + return; + } + + folio =3D page_folio(page); + if (unlikely(!should_zap_folio(details, folio))) + return; + zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, + rss, force_flush, force_break); +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DCAD81419A4 for ; Wed, 14 Feb 2024 20:45:05 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943507; cv=none; b=oz4cKB94B5qEHLaRVa+ismCBobiHUlvM1JCXbAyrXREv5bZg4wwZT3/x3ZKfzaIcYbFOys9A6TtYo8rUVHqGJapMLHcE6RqFoPEIKljZSNQIQmz90AujJYRcN5VLTCmDEk77p1ca4TLEtYoo82XbCuI8zYFf96A817ClnX5ubXU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943507; c=relaxed/simple; bh=klZ3/8gE/hLexWRn9Rfgs4Jd0fYkSsj1v6sMZOE0W70=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=okkVJqIpNBZpJV5jGh/k9pwO3YxQm4kpA6iNxjLzJQDSNz4VhdkousShmePLMN0zQ5sAfJWUuSjuuSrGaybpsYUpZ29aGHeLpBE1BLi4WJmTgTmXEJIsyQOkz1XVGJzk31nw0Tf2JlysC6WbZ0BhtDP+uCq9/zgAotrqN7ynPWw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=Idqvrsgu; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="Idqvrsgu" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943505; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=4Q1ph86PH1Crd3Eqr5OMYEFX/2u3faALVbtCzM66/MM=; b=IdqvrsgunK4/c/ezi9TsDMpswAVhVcXptWySqJV5aEqIQXCePWwu6527P763Dx76ELhwwC iR8RkMTO0IuZrqnfK/q8h3F+1JsOY6Di1QZs5BUq/+Kfu3oErH6uZUpF2zI7TMOPtzvoSp W7A73Q12+FhiW81CFWbWaz1hcMcIjqo= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-670--0HI9YGROZaGy4w0dg4ihQ-1; Wed, 14 Feb 2024 15:45:01 -0500 X-MC-Unique: -0HI9YGROZaGy4w0dg4ihQ-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 864D985A589; Wed, 14 Feb 2024 20:45:00 +0000 (UTC) Received: from t14s.fritz.box (unknown 
[10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id ECCE01C06532; Wed, 14 Feb 2024 20:44:56 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 05/10] mm/mmu_gather: pass "delay_rmap" instead of encoded page to __tlb_remove_page_size() Date: Wed, 14 Feb 2024 21:44:30 +0100 Message-ID: <20240214204435.167852-6-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" We have two bits available in the encoded page pointer to store additional information. Currently, we use one bit to request delay of the rmap removal until after a TLB flush. We want to make use of the remaining bit internally for batching of multiple pages of the same folio, specifying that the next encoded page pointer in an array is actually "nr_pages". So pass page + delay_rmap flag instead of an encoded page, to handle the encoding internally. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- arch/s390/include/asm/tlb.h | 13 ++++++------- include/asm-generic/tlb.h | 12 ++++++------ mm/mmu_gather.c | 7 ++++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index d1455a601adc..48df896d5b79 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,8 +25,7 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size); + struct page *page, bool delay_rmap, int page_size); =20 #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -42,14 +41,14 @@ static inline bool __tlb_remove_page_size(struct mmu_ga= ther *tlb, * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. * - * s390 doesn't delay rmap removal, so there is nothing encoded in - * the page pointer. + * s390 doesn't delay rmap removal. 
*/ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size) + struct page *page, bool delay_rmap, int page_size) { - free_page_and_swap_cache(encoded_page_ptr(page)); + VM_WARN_ON_ONCE(delay_rmap); + + free_page_and_swap_cache(page); return false; } =20 diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 129a3a759976..2eb7b0d4f5d2 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -260,9 +260,8 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) =20 -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct encoded_page *page, - int page_size); +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *pa= ge, + bool delay_rmap, int page_size); =20 #ifdef CONFIG_SMP /* @@ -462,13 +461,14 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_g= ather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size)) + if (__tlb_remove_page_size(tlb, page, false, page_size)) tlb_flush_mmu(tlb); } =20 -static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, stru= ct page *page, unsigned int flags) +static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, + struct page *page, bool delay_rmap) { - return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE); + return __tlb_remove_page_size(tlb, page, delay_rmap, PAGE_SIZE); } =20 /* tlb_remove_page diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 604ddf08affe..ac733d81b112 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -116,7 +116,8 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next =3D NULL; } =20 -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *p= age, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) { struct mmu_gather_batch *batch; =20 @@ -131,13 +132,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, s= truct encoded_page *page, i * Add the page and check if we are full. If so * force a flush. 
*/ - batch->encoded_pages[batch->nr++] =3D page; + batch->encoded_pages[batch->nr++] =3D encode_page(page, delay_rmap); if (batch->nr =3D=3D batch->max) { if (!tlb_next_batch(tlb)) return true; batch =3D tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); + VM_BUG_ON_PAGE(batch->nr > batch->max, page); =20 return false; } --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8DCF21420A1 for ; Wed, 14 Feb 2024 20:45:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943512; cv=none; b=sd6W2XXyTpeOH1t9GXJAwcLymrLLHADPRmyUJ+xFhL3Gc/DRtCp5JcbAJCP9ONXLlBvIHfGr689+NqCauBQCI/LdPJv0E0RK2lpXijGXgMmIHRKCo0TuGghIv9ewh7z7YOmsSY9AwKVHICpsiNJooyXffrGRDzIS4uvS4ePquPY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943512; c=relaxed/simple; bh=Ivx6k9RHC7xtfNym2WBe4tYwW7cQHPIIcIfbRJkpcmw=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=hnXWgi463pIarTZMxferUCsWERM9xLgxogsRLlv8BWQBilUPMBJ2w4xd4I0BuN04ZWcG0aBlzXVNIO86OKDbuzDLt9m+tkOEUJfqAE3JqyUuqkmEz34LIbCBAps6FSuzO5mePz5bM3lB0arIV4vwO+cXeRijquEwz7X/q70xGmM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=dWS7Tlj5; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="dWS7Tlj5" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943509; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=BfteBvWtn9YL0u+ZWWzLkAZQ9TqgPhMIROF4I2AA+84=; b=dWS7Tlj5hvo++IKBwEcAZX8N1kyu5MFiYG5LceV724qhzHlhlWnCbuVYOcjmSFNc3hjRAH NvInRkO6BO9Jkw7pznPvqg7Wc8TXzrt/R8/GGIlh/4WSOSJJ0LwueRiKvzE0v9HgAg0f5v X682/Nf1TSZAb4roDM0/bce9MOD6bOs= Received: from mimecast-mx02.redhat.com (mx-ext.redhat.com [66.187.233.73]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-502-FFKMi1htPsyeY2_wWYIuqQ-1; Wed, 14 Feb 2024 15:45:06 -0500 X-MC-Unique: FFKMi1htPsyeY2_wWYIuqQ-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 768353C29A67; Wed, 14 Feb 2024 20:45:04 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id C376E1C060B1; Wed, 14 Feb 2024 20:45:00 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew 
Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 06/10] mm/mmu_gather: define ENCODED_PAGE_FLAG_DELAY_RMAP Date: Wed, 14 Feb 2024 21:44:31 +0100 Message-ID: <20240214204435.167852-7-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" Nowadays, encoded pages are only used in mmu_gather handling. Let's update the documentation, and define ENCODED_PAGE_BIT_DELAY_RMAP. While at it, rename ENCODE_PAGE_BITS to ENCODED_PAGE_BITS. If encoded page pointers would ever be used in other context again, we'd likely want to change the defines to reflect their context (e.g., ENCODED_PAGE_FLAG_MMU_GATHER_DELAY_RMAP). For now, let's keep it simple. This is a preparation for using the remaining spare bit to indicate that the next item in an array of encoded pages is a "nr_pages" argument and not an encoded page. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- include/linux/mm_types.h | 17 +++++++++++------ mm/mmu_gather.c | 5 +++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8b611e13153e..1b89eec0d6df 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -210,8 +210,8 @@ struct page { * * An 'encoded_page' pointer is a pointer to a regular 'struct page', but * with the low bits of the pointer indicating extra context-dependent - * information. Not super-common, but happens in mmu_gather and mlock - * handling, and this acts as a type system check on that use. + * information. Only used in mmu_gather handling, and this acts as a type + * system check on that use. * * We only really have two guaranteed bits in general, although you could * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -220,21 +220,26 @@ struct page { * Use the supplied helper functions to endcode/decode the pointer and bit= s. */ struct encoded_page; -#define ENCODE_PAGE_BITS 3ul + +#define ENCODED_PAGE_BITS 3ul + +/* Perform rmap removal after we have flushed the TLB. 
*/ +#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul + static __always_inline struct encoded_page *encode_page(struct page *page,= unsigned long flags) { - BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); return (struct encoded_page *)(flags | (unsigned long)page); } =20 static inline unsigned long encoded_page_flags(struct encoded_page *page) { - return ENCODE_PAGE_BITS & (unsigned long)page; + return ENCODED_PAGE_BITS & (unsigned long)page; } =20 static inline struct page *encoded_page_ptr(struct encoded_page *page) { - return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); + return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } =20 /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index ac733d81b112..6540c99c6758 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -53,7 +53,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch = *batch, struct vm_area_ for (int i =3D 0; i < batch->nr; i++) { struct encoded_page *enc =3D batch->encoded_pages[i]; =20 - if (encoded_page_flags(enc)) { + if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page =3D encoded_page_ptr(enc); folio_remove_rmap_pte(page_folio(page), page, vma); } @@ -119,6 +119,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size) { + int flags =3D delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; =20 VM_BUG_ON(!tlb->end); @@ -132,7 +133,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, str= uct page *page, * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] =3D encode_page(page, delay_rmap); + batch->encoded_pages[batch->nr++] =3D encode_page(page, flags); if (batch->nr =3D=3D batch->max) { if (!tlb_next_batch(tlb)) return true; --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4388E1420BD for ; Wed, 14 Feb 2024 20:45:16 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943517; cv=none; b=Pqw2pr8pb+eZNLOMXyiLSfteGtB1TQvBSkwUHh5Zyrv+xo/l9vLB2m+ANVHA0XnZUrU7O14aE3i+9FnlteZh3FUpyHp43K4+hZLkMrHXFbOiEZyn0z0kxsjfIXm5QhQHjSzhtmGv/IpIeRjIxMHXGYYF7E1a4lGkvNK++o9y7dE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943517; c=relaxed/simple; bh=3PQfbTkS/vd68uh/N1+tvV3UXJ8B0ItANJG44cGwxXU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Qx0nlfygZ840Ax1qqpPidkPxGyppbw7w6XCiWfcJ2Of+cwzJ0lZV4oYCLqYVP3VARZwJXOWHOLCjoUJvx501lA1TlLDVqqyjmZHApo/i5JDgF6ON5mwFWdBG19NrC/1UXTi1fDftYfX7lPNBH59w5VyRUqbROkiuSdt42Ds2UOk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=Zd5fCEfy; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit 
key) header.d=redhat.com header.i=@redhat.com header.b="Zd5fCEfy" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943515; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=X4BN9bDZrrapc7Xd1FDfmQmuEZxXG5tNILRPdL9R2lg=; b=Zd5fCEfyBfpu+FQ/1A82MOcZyIcWwqwMQVk9sVixLzuQ7mXDOJTMt5plnz+favBpSl/HD9 qB1X78F0GEPUL7oOIjCDoHE5rShqYP0TSfq4ptFvPjkgbgZAhQpq66QqlFDHG5cV+GnU85 +yp9au8rEpKzytB2V1u3JxIOlfVVQxs= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-520-D3IZQFsSNhyFuV3-aH76jg-1; Wed, 14 Feb 2024 15:45:09 -0500 X-MC-Unique: D3IZQFsSNhyFuV3-aH76jg-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id 6ACBD811E79; Wed, 14 Feb 2024 20:45:08 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id B3DCE1C066A9; Wed, 14 Feb 2024 20:45:04 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 07/10] mm/mmu_gather: add tlb_remove_tlb_entries() Date: Wed, 14 Feb 2024 21:44:32 +0100 Message-ID: <20240214204435.167852-8-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" Let's add a helper that lets us batch-process multiple consecutive PTEs. Note that the loop will get optimized out on all architectures except on powerpc. We have to add an early define of __tlb_remove_tlb_entry() on ppc to make the compiler happy (and avoid making tlb_remove_tlb_entries() a macro). 
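
[Editor's note, not part of the patch: the caller below is a hypothetical sketch of how a batched zap path might use the new helper; the function name and surrounding logic are illustrative only, while ptep_get_and_clear_full() and tlb_remove_tlb_entries() are the real interfaces.]

#include <linux/pgtable.h>
#include <asm/tlb.h>

/*
 * Hypothetical caller: clear "nr" consecutive PTEs that map one folio,
 * then remember all of them for the later TLB flush with a single call
 * instead of nr individual tlb_remove_tlb_entry() calls.
 */
static inline void zap_folio_ptes_sketch(struct mmu_gather *tlb, pte_t *pte,
					 unsigned long addr, unsigned int nr)
{
	unsigned int i;

	for (i = 0; i < nr; i++)
		ptep_get_and_clear_full(tlb->mm, addr + i * PAGE_SIZE,
					pte + i, tlb->fullmm);
	tlb_remove_tlb_entries(tlb, pte, nr, addr);
}

On architectures other than powerpc, __tlb_remove_tlb_entry() is a no-op, so the per-PTE loop inside tlb_remove_tlb_entries() is optimized away and only the tlb_flush_pte_range() bookkeeping remains.
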
Reviewed-by: Ryan Roberts 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/include/asm/tlb.h |  2 ++
 include/asm-generic/tlb.h      | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index b3de6102a907..1ca7d4c4b90d 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -19,6 +19,8 @@
 
 #include
 
+static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
+					  unsigned long address);
 #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry
 
 #define tlb_flush tlb_flush
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 2eb7b0d4f5d2..95d60a4f468a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -608,6 +608,26 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
+/**
+ * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for
+ *			    later tlb invalidation.
+ *
+ * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple
+ * consecutive ptes instead of only a single one.
+ */
+static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb,
+		pte_t *ptep, unsigned int nr, unsigned long address)
+{
+	tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr);
+	for (;;) {
+		__tlb_remove_tlb_entry(tlb, ptep, address);
+		if (--nr == 0)
+			break;
+		ptep++;
+		address += PAGE_SIZE;
+	}
+}
+
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	do {							\
 		unsigned long _sz = huge_page_size(h);		\
-- 
2.43.0

From nobody Sun Dec 14 06:19:13 2025
From: David Hildenbrand 
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org
Subject: [PATCH v3 08/10] mm/mmu_gather: add __tlb_remove_folio_pages()
Date: Wed, 14 Feb 2024 21:44:33 +0100
Message-ID: <20240214204435.167852-9-david@redhat.com>
In-Reply-To: <20240214204435.167852-1-david@redhat.com>
References: <20240214204435.167852-1-david@redhat.com>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; charset="utf-8"

Add __tlb_remove_folio_pages(), which will remove multiple consecutive pages that belong to the same large folio, instead of only a single page. We'll be using this function when optimizing unmapping/zapping of large folios that are mapped by PTEs.

We're using the remaining spare bit in an encoded_page to indicate that the next encoded page in an array actually contains a shifted "nr_pages". Teach the swap/freeing code about putting multiple folio references, and the delayed rmap handling about removing page ranges of a folio.

This extension still allows for gathering almost as many small folios as we used to (-1, because we have to prepare for a possibly bigger next entry), while also allowing for gathering consecutive pages that belong to the same large folio.

Note that we don't pass the folio pointer, because it is not required for now. Further, we don't support page_size != PAGE_SIZE; it won't be required for simple PTE batching.

We have to provide a separate s390 implementation, but it's fairly straightforward.

Another, more invasive and likely more expensive, approach would be to use folio+range or a PFN range instead of page+nr_pages. But we should do that consistently for the whole mmu_gather. For now, let's keep it simple and add "nr_pages" only.
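
[Editor's illustration, not part of the patch: the walker below shows how the encoding is meant to be consumed; the function name is made up, but ENCODED_PAGE_BIT_NR_PAGES_NEXT, encoded_nr_pages(), encoded_page_flags() and encoded_page_ptr() are the helpers provided by mm_types.h once this patch is applied.]

#include <linux/mm_types.h>

/*
 * Count how many pages an encoded_pages[] array with "nr" entries covers,
 * honouring the optional "nr_pages" entry that follows a flagged page.
 */
static unsigned long count_encoded_pages(struct encoded_page **pages,
					 unsigned int nr)
{
	unsigned long total = 0;
	unsigned int i;

	for (i = 0; i < nr; i++) {
		struct encoded_page *enc = pages[i];
		unsigned int nr_pages = 1;

		/* If flagged, the next slot holds a shifted "nr_pages", not a page. */
		if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)
			nr_pages = encoded_nr_pages(pages[++i]);

		/* encoded_page_ptr(enc) .. + nr_pages - 1 belong to one folio. */
		total += nr_pages;
	}
	return total;
}
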
Note that it is now possible to gather significantly more pages: In the past, we were able to gather ~10000 pages, now we can also gather ~5000 folio fragments that span multiple pages. A folio fragment on x86-64 can span up to 512 pages (2 MiB THP) and on arm64 with 64k in theory 8192 pages (512 MiB THP). Gathering more memory is not considered something we should worry about, especially because these are already corner cases. While we can gather more total memory, we won't free more folio fragments. As long as page freeing time primarily only depends on the number of involved folios, there is no effective change for !preempt configurations. However, we'll adjust tlb_batch_pages_flush() separately to handle corner cases where page freeing time grows proportionally with the actual memory size. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- arch/s390/include/asm/tlb.h | 17 +++++++++++ include/asm-generic/tlb.h | 8 +++++ include/linux/mm_types.h | 20 ++++++++++++ mm/mmu_gather.c | 61 +++++++++++++++++++++++++++++++------ mm/swap.c | 12 ++++++-- mm/swap_state.c | 15 +++++++-- 6 files changed, 119 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 48df896d5b79..e95b2c8081eb 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -26,6 +26,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap); =20 #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -52,6 +54,21 @@ static inline bool __tlb_remove_page_size(struct mmu_gat= her *tlb, return false; } =20 +static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap) +{ + struct encoded_page *encoded_pages[] =3D { + encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT), + encode_nr_pages(nr_pages), + }; + + VM_WARN_ON_ONCE(delay_rmap); + VM_WARN_ON_ONCE(page_folio(page) !=3D page_folio(page + nr_pages - 1)); + + free_pages_and_swap_cache(encoded_pages, ARRAY_SIZE(encoded_pages)); + return false; +} + static inline void tlb_flush(struct mmu_gather *tlb) { __tlb_flush_mm_lazy(tlb->mm); diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 95d60a4f468a..bd00dd238b79 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -69,6 +69,7 @@ * * - tlb_remove_page() / __tlb_remove_page() * - tlb_remove_page_size() / __tlb_remove_page_size() + * - __tlb_remove_folio_pages() * * __tlb_remove_page_size() is the basic primitive that queues a page f= or * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a @@ -78,6 +79,11 @@ * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * + * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), howeve= r, + * instead of removing a single page, remove the given number of consec= utive + * pages that are all part of the same (large) folio: just like calling + * __tlb_remove_page() on each page individually. 
+ * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; impli= es a @@ -262,6 +268,8 @@ struct mmu_gather_batch { =20 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *pa= ge, bool delay_rmap, int page_size); +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap); =20 #ifdef CONFIG_SMP /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1b89eec0d6df..a7223ba3ea1e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,6 +226,15 @@ struct encoded_page; /* Perform rmap removal after we have flushed the TLB. */ #define ENCODED_PAGE_BIT_DELAY_RMAP 1ul =20 +/* + * The next item in an encoded_page array is the "nr_pages" argument, spec= ifying + * the number of consecutive pages starting from this page, that all belon= g to + * the same folio. For example, "nr_pages" corresponds to the number of fo= lio + * references that must be dropped. If this bit is not set, "nr_pages" is + * implicitly 1. + */ +#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul + static __always_inline struct encoded_page *encode_page(struct page *page,= unsigned long flags) { BUILD_BUG_ON(flags > ENCODED_PAGE_BITS); @@ -242,6 +251,17 @@ static inline struct page *encoded_page_ptr(struct enc= oded_page *page) return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page); } =20 +static __always_inline struct encoded_page *encode_nr_pages(unsigned long = nr) +{ + VM_WARN_ON_ONCE((nr << 2) >> 2 !=3D nr); + return (struct encoded_page *)(nr << 2); +} + +static __always_inline unsigned long encoded_nr_pages(struct encoded_page = *page) +{ + return ((unsigned long)page) >> 2; +} + /* * A swap entry has to fit into a "unsigned long", as the entry is hidden * in the "index" field of the swapper address space. diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 6540c99c6758..d175c0f1e2c8 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -50,12 +50,21 @@ static bool tlb_next_batch(struct mmu_gather *tlb) #ifdef CONFIG_SMP static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm= _area_struct *vma) { + struct encoded_page **pages =3D batch->encoded_pages; + for (int i =3D 0; i < batch->nr; i++) { - struct encoded_page *enc =3D batch->encoded_pages[i]; + struct encoded_page *enc =3D pages[i]; =20 if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) { struct page *page =3D encoded_page_ptr(enc); - folio_remove_rmap_pte(page_folio(page), page, vma); + unsigned int nr_pages =3D 1; + + if (unlikely(encoded_page_flags(enc) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages =3D encoded_nr_pages(pages[++i]); + + folio_remove_rmap_ptes(page_folio(page), page, nr_pages, + vma); } } } @@ -89,18 +98,26 @@ static void tlb_batch_pages_flush(struct mmu_gather *tl= b) for (batch =3D &tlb->local; batch && batch->nr; batch =3D batch->next) { struct encoded_page **pages =3D batch->encoded_pages; =20 - do { + while (batch->nr) { /* * limit free batch count when PAGE_SIZE > 4K */ unsigned int nr =3D min(512U, batch->nr); =20 + /* + * Make sure we cover page + nr_pages, and don't leave + * nr_pages behind when capping the number of entries. 
+ */ + if (unlikely(encoded_page_flags(pages[nr - 1]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr++; + free_pages_and_swap_cache(pages, nr); pages +=3D nr; batch->nr -=3D nr; =20 cond_resched(); - } while (batch->nr); + } } tlb->active =3D &tlb->local; } @@ -116,8 +133,9 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next =3D NULL; } =20 -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, - bool delay_rmap, int page_size) +static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb, + struct page *page, unsigned int nr_pages, bool delay_rmap, + int page_size) { int flags =3D delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0; struct mmu_gather_batch *batch; @@ -126,6 +144,8 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, str= uct page *page, =20 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size !=3D page_size); + VM_WARN_ON_ONCE(nr_pages !=3D 1 && page_size !=3D PAGE_SIZE); + VM_WARN_ON_ONCE(page_folio(page) !=3D page_folio(page + nr_pages - 1)); #endif =20 batch =3D tlb->active; @@ -133,17 +153,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, s= truct page *page, * Add the page and check if we are full. If so * force a flush. */ - batch->encoded_pages[batch->nr++] =3D encode_page(page, flags); - if (batch->nr =3D=3D batch->max) { + if (likely(nr_pages =3D=3D 1)) { + batch->encoded_pages[batch->nr++] =3D encode_page(page, flags); + } else { + flags |=3D ENCODED_PAGE_BIT_NR_PAGES_NEXT; + batch->encoded_pages[batch->nr++] =3D encode_page(page, flags); + batch->encoded_pages[batch->nr++] =3D encode_nr_pages(nr_pages); + } + /* + * Make sure that we can always add another "page" + "nr_pages", + * requiring two entries instead of only a single one. + */ + if (batch->nr >=3D batch->max - 1) { if (!tlb_next_batch(tlb)) return true; batch =3D tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page); =20 return false; } =20 +bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, + unsigned int nr_pages, bool delay_rmap) +{ + return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap, + PAGE_SIZE); +} + +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, + bool delay_rmap, int page_size) +{ + return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size); +} + #endif /* MMU_GATHER_NO_GATHER */ =20 #ifdef CONFIG_MMU_GATHER_TABLE_FREE diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3a..e5380d732c0d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -967,11 +967,17 @@ void release_pages(release_pages_arg arg, int nr) unsigned int lock_batch; =20 for (i =3D 0; i < nr; i++) { + unsigned int nr_refs =3D 1; struct folio *folio; =20 /* Turn any of the argument types into a folio */ folio =3D page_folio(encoded_page_ptr(encoded[i])); =20 + /* Is our next entry actually "nr_pages" -> "nr_refs" ? 
*/ + if (unlikely(encoded_page_flags(encoded[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_refs =3D encoded_nr_pages(encoded[++i]); + /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -990,14 +996,14 @@ void release_pages(release_pages_arg arg, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec =3D NULL; } - if (put_devmap_managed_page(&folio->page)) + if (put_devmap_managed_page_refs(&folio->page, nr_refs)) continue; - if (folio_put_testzero(folio)) + if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_page(&folio->page); continue; } =20 - if (!folio_put_testzero(folio)) + if (!folio_ref_sub_and_test(folio, nr_refs)) continue; =20 if (folio_test_large(folio)) { diff --git a/mm/swap_state.c b/mm/swap_state.c index 7255c01a1e4e..2f540748f7c0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -311,8 +311,19 @@ void free_page_and_swap_cache(struct page *page) void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { lru_add_drain(); - for (int i =3D 0; i < nr; i++) - free_swap_cache(encoded_page_ptr(pages[i])); + for (int i =3D 0; i < nr; i++) { + struct page *page =3D encoded_page_ptr(pages[i]); + + /* + * Skip over the "nr_pages" entry. It's sufficient to call + * free_swap_cache() only once per folio. + */ + if (unlikely(encoded_page_flags(pages[i]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + i++; + + free_swap_cache(page); + } release_pages(pages, nr); } =20 --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.129.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B63BE13B799 for ; Wed, 14 Feb 2024 20:45:23 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.129.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943525; cv=none; b=hEoErhyAKu4/88/L03wGy/s0dyT7HOrIuns2JsrpvHC0/vRG4zDLob1tB1tBx1Q1swhfWoA7I7ZeZ7Q5jujWNmZpFskRV54z+823mPk/IdgdP6Y71HpLMsG8FIYw5Wv8ZCwNHgnOsa+m8p0mmB7IqSRafW3cZPV8OZYNeNQlN3U= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943525; c=relaxed/simple; bh=NUzLgoVlOyf2LRkuaXGl5RfztDVrEk1Ld5b6CnFv0Uk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=tKemtU61hFOQtTOSbbLzA1ukk0v5g6Ia0GOgHcWvZy9kvu2PvhyTZJSQOvYhC+eDFszexz+St2o+rhr0xi+tSNgkdoHCiA/B6ZilUEm+xeNf1Xg2u4QdVLHXoBNjp0jDkB4XiobkaLk56AQUU6muTZBjEPLqHc8EAwoW0caF4RQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=er1KdVDD; arc=none smtp.client-ip=170.10.129.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="er1KdVDD" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943522; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; 
From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 09/10] mm/mmu_gather: improve cond_resched() handling with large folios and expensive page freeing Date: Wed, 14 Feb 2024 21:44:34 +0100 Message-ID: <20240214204435.167852-10-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com>

In tlb_batch_pages_flush(), we can end up freeing up to 512 pages, or now up to 256 folio fragments that span more than one page, before we conditionally reschedule.

It's a pain that we have to handle cond_resched() in tlb_batch_pages_flush() manually and cannot simply handle it in release_pages() -- release_pages() can be called from atomic context. Well, in a perfect world we wouldn't have to make our code more complicated at all.

With page poisoning and init_on_free, we might now run into soft lockups when we free a lot of rather large folio fragments, because page freeing time then depends on the actual memory size we are freeing instead of on the number of folios that are involved. In the absolute (unlikely) worst case, on arm64 with 64k pages we will be able to free up to 256 folio fragments that each span 512 MiB: zeroing out 128 GiB does sound like it might take a while. But instead of ignoring this unlikely case, let's just handle it.

So, let's teach tlb_batch_pages_flush() that there are some configurations where page freeing is horribly slow, and let's reschedule more frequently -- much as we effectively did before large folio fragments were possible. Avoid yet another loop over all encoded pages in the common case by handling that separately.
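To sanity-check the worst-case numbers above, a standalone back-of-the-envelope snippet (plain user-space arithmetic, not kernel code):

#include <stdio.h>

int main(void)
{
	const unsigned long long page_size = 64ULL << 10;	/* 64 KiB base pages */
	const unsigned long long thp_size  = 512ULL << 20;	/* 512 MiB THP */
	/* Each folio fragment occupies two batch entries, so ~256 of them fit
	 * where 512 single pages used to. */
	const unsigned long long fragments = 256;

	printf("pages per THP:        %llu\n", thp_size / page_size);	/* 8192 */
	printf("worst-case free size: %llu GiB\n",
	       fragments * thp_size / (1ULL << 30));			/* 128 GiB */
	return 0;
}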
Note that with page poisoning/zeroing, we might now end up freeing only a single folio fragment at a time that might exceed the old 512 pages limit: but if we cannot even free a single MAX_ORDER page on a system without running into soft lockups, something else is already completely bogus. Freeing a PMD-mapped THP would similarly cause trouble. In theory, we might even free 511 order-0 pages + a single MAX_ORDER page, effectively having to zero out 8703 pages on arm64 with 64k, translating to ~544 MiB of memory: however, if 512 MiB doesn't result in soft lockups, 544 MiB is unlikely to result in soft lockups, so we won't care about that for the time being. In the future, we might want to detect if handling cond_resched() is required at all, and just not do any of that with full preemption enabled. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- mm/mmu_gather.c | 58 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index d175c0f1e2c8..99b3e9408aa0 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -91,18 +91,21 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_= area_struct *vma) } #endif =20 -static void tlb_batch_pages_flush(struct mmu_gather *tlb) -{ - struct mmu_gather_batch *batch; +/* + * We might end up freeing a lot of pages. Reschedule on a regular + * basis to avoid soft lockups in configurations without full + * preemption enabled. The magic number of 512 folios seems to work. + */ +#define MAX_NR_FOLIOS_PER_FREE 512 =20 - for (batch =3D &tlb->local; batch && batch->nr; batch =3D batch->next) { - struct encoded_page **pages =3D batch->encoded_pages; +static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch) +{ + struct encoded_page **pages =3D batch->encoded_pages; + unsigned int nr, nr_pages; =20 - while (batch->nr) { - /* - * limit free batch count when PAGE_SIZE > 4K - */ - unsigned int nr =3D min(512U, batch->nr); + while (batch->nr) { + if (!page_poisoning_enabled_static() && !want_init_on_free()) { + nr =3D min(MAX_NR_FOLIOS_PER_FREE, batch->nr); =20 /* * Make sure we cover page + nr_pages, and don't leave @@ -111,14 +114,39 @@ static void tlb_batch_pages_flush(struct mmu_gather *= tlb) if (unlikely(encoded_page_flags(pages[nr - 1]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) nr++; + } else { + /* + * With page poisoning and init_on_free, the time it + * takes to free memory grows proportionally with the + * actual memory size. Therefore, limit based on the + * actual memory size and not the number of involved + * folios. 
+ */ + for (nr =3D 0, nr_pages =3D 0; + nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE; + nr++) { + if (unlikely(encoded_page_flags(pages[nr]) & + ENCODED_PAGE_BIT_NR_PAGES_NEXT)) + nr_pages +=3D encoded_nr_pages(pages[++nr]); + else + nr_pages++; + } + } =20 - free_pages_and_swap_cache(pages, nr); - pages +=3D nr; - batch->nr -=3D nr; + free_pages_and_swap_cache(pages, nr); + pages +=3D nr; + batch->nr -=3D nr; =20 - cond_resched(); - } + cond_resched(); } +} + +static void tlb_batch_pages_flush(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + for (batch =3D &tlb->local; batch && batch->nr; batch =3D batch->next) + __tlb_batch_free_encoded_pages(batch); tlb->active =3D &tlb->local; } =20 --=20 2.43.0 From nobody Sun Dec 14 06:19:13 2025 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6EA8C1420C4 for ; Wed, 14 Feb 2024 20:45:28 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=170.10.133.124 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943530; cv=none; b=sgU6PYml6j2ytLM4u57UoEePQ9o283fIFLUliSs2IXHTCZ9xsjEOxPcIXolrgTpFw57EkdjoCbnBPAZsBTffDUE9mMXPqC1681jH4tdvu9a60uqlvBHelDCk+RnEW5uq055ZOZzGvVC874fITqOZ/sWi2zxbppY+sH5XYVzi7OU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1707943530; c=relaxed/simple; bh=GjDbS8TDvRp1vIA00Uk2HuEPKSBZHcrjP/rtVt+0Wq4=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=OOCg2kv1HjTmFlNQzwPFL8Kb/7iV5HT/D7cHR/Xe+jXUvQIzlUwRtDYYbtlC09XwnOZPdgj2TnzHT4+bGIZHKdxpbllwskAcfBbMu5y/xpUQD1QNvF19utCyDqsj5CwROtu1/Hgid3/Fa8HDu4h/lL+mKyk8buLxQks5gPjo5es= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com; spf=pass smtp.mailfrom=redhat.com; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b=cE2lImJR; arc=none smtp.client-ip=170.10.133.124 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=redhat.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=redhat.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (1024-bit key) header.d=redhat.com header.i=@redhat.com header.b="cE2lImJR" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1707943527; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=MyxDp/8zupvql86OoDnfzfcwBZWGRlpds50qO6uo6X4=; b=cE2lImJR1nycmbWzPDCzUGSfciDDuVOc74/NsbmOeLfBJAVRuiR24a0TWBg9FmqWD4aqOX b4Ftyhi3fLG/joVLAK4DD6o5Nte3BxgHDnPauLV0sP1R2CWlETRM3hPqRouQZ8nRht7WpN qAOPInMZr67lH1qRMXiUH3nV9RPr3ZU= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-588-wHfYfIO7M6uwQlj4FVq0Rg-1; Wed, 14 Feb 2024 15:45:22 -0500 X-MC-Unique: wHfYfIO7M6uwQlj4FVq0Rg-1 Received: from smtp.corp.redhat.com (int-mx07.intmail.prod.int.rdu2.redhat.com [10.11.54.7]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by 
mimecast-mx02.redhat.com (Postfix) with ESMTPS id 96991811E79; Wed, 14 Feb 2024 20:45:21 +0000 (UTC) Received: from t14s.fritz.box (unknown [10.39.194.174]) by smtp.corp.redhat.com (Postfix) with ESMTP id 8E8091C066A9; Wed, 14 Feb 2024 20:45:17 +0000 (UTC) From: David Hildenbrand To: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org, David Hildenbrand , Andrew Morton , Matthew Wilcox , Ryan Roberts , Catalin Marinas , Yin Fengwei , Michal Hocko , Will Deacon , "Aneesh Kumar K.V" , Nick Piggin , Peter Zijlstra , Michael Ellerman , Christophe Leroy , "Naveen N. Rao" , Heiko Carstens , Vasily Gorbik , Alexander Gordeev , Christian Borntraeger , Sven Schnelle , Arnd Bergmann , linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, linux-s390@vger.kernel.org Subject: [PATCH v3 10/10] mm/memory: optimize unmap/zap with PTE-mapped THP Date: Wed, 14 Feb 2024 21:44:35 +0100 Message-ID: <20240214204435.167852-11-david@redhat.com> In-Reply-To: <20240214204435.167852-1-david@redhat.com> References: <20240214204435.167852-1-david@redhat.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.7 Content-Type: text/plain; charset="utf-8" Similar to how we optimized fork(), let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio. Most infrastructure we need for batching (mmu gather, rmap) is already there. We only have to add get_and_clear_full_ptes() and clear_full_ptes(). Similarly, extend zap_install_uffd_wp_if_needed() to process a PTE range. We won't bother sanity-checking the mapcount of all subpages, but only check the mapcount of the first subpage we process. If there is a real problem hiding somewhere, we can trigger it simply by using small folios, or when we zap single pages of a large folio. Ideally, we had that check in rmap code (including for delayed rmap), but then we cannot print the PTE. Let's keep it simple for now. If we ever have a cheap folio_mapcount(), we might just want to check for underflows there. To keep small folios as fast as possible force inlining of a specialized variant using __always_inline with nr=3D1. Reviewed-by: Ryan Roberts Signed-off-by: David Hildenbrand --- include/linux/pgtable.h | 70 +++++++++++++++++++++++++++++++ mm/memory.c | 92 +++++++++++++++++++++++++++++------------ 2 files changed, 136 insertions(+), 26 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index aab227e12493..49ab1f73b5c2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -580,6 +580,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_= struct *mm, } #endif =20 +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages= of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simp= le + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into t= he + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For exa= mple, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. 
The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte =3D ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr +=3D PAGE_SIZE; + tmp_pte =3D ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte =3D pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte =3D pte_mkyoung(pte); + } + return pte; +} +#endif + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the = same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simp= le + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For exa= mple, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long add= r, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr =3D=3D 0) + break; + ptep++; + addr +=3D PAGE_SIZE; + } +} +#endif =20 /* * If two threads concurrently fault at the same page, the thread that diff --git a/mm/memory.c b/mm/memory.c index a3efc4da258a..3b8e56eb08a3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1515,7 +1515,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_d= etails *details) */ static inline void zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte, + unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { /* Zap on anonymous always means dropping everything */ @@ -1525,20 +1525,27 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct= *vma, if (zap_drop_file_uffd_wp(details)) return; =20 - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + for (;;) { + /* the PFN in the PTE is irrelevant. 
*/ + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (--nr =3D=3D 0) + break; + pte++; + addr +=3D PAGE_SIZE; + } } =20 -static inline void zap_present_folio_pte(struct mmu_gather *tlb, +static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, - struct page *page, pte_t *pte, pte_t ptent, unsigned long addr, - struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, + unsigned long addr, struct zap_details *details, int *rss, + bool *force_flush, bool *force_break) { struct mm_struct *mm =3D tlb->mm; bool delay_rmap =3D false; =20 if (!folio_test_anon(folio)) { - ptent =3D ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); + ptent =3D get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { @@ -1548,36 +1555,49 @@ static inline void zap_present_folio_pte(struct mmu= _gather *tlb, } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); - rss[mm_counter(folio)]--; + rss[mm_counter(folio)] -=3D nr; } else { /* We don't need up-to-date accessed/dirty bits. */ - ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); - rss[MM_ANONPAGES]--; + clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); + rss[MM_ANONPAGES] -=3D nr; } + /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, + ptent); =20 if (!delay_rmap) { - folio_remove_rmap_pte(folio, page, vma); + folio_remove_rmap_ptes(folio, page, nr, vma); + + /* Only sanity-check the first page in a batch. */ if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); } - if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { + if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush =3D true; *force_break =3D true; } } =20 -static inline void zap_present_pte(struct mmu_gather *tlb, +/* + * Zap or skip at least one present PTE, trying to batch-process subsequent + * PTEs that map consecutive pages of the same folio. + * + * Returns the number of processed (skipped or zapped) PTEs (at least 1). 
+ */ +static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, - unsigned long addr, struct zap_details *details, - int *rss, bool *force_flush, bool *force_break) + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *force_flush, + bool *force_break) { + const fpb_t fpb_flags =3D FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm =3D tlb->mm; struct folio *folio; struct page *page; + int nr; =20 page =3D vm_normal_page(vma, addr, ptent); if (!page) { @@ -1587,14 +1607,29 @@ static inline void zap_present_pte(struct mmu_gathe= r *tlb, tlb_remove_tlb_entry(tlb, pte, addr); VM_WARN_ON_ONCE(userfaultfd_wp(vma)); ksm_might_unmap_zero_page(mm, ptent); - return; + return 1; } =20 folio =3D page_folio(page); if (unlikely(!should_zap_folio(details, folio))) - return; - zap_present_folio_pte(tlb, vma, folio, page, pte, ptent, addr, details, - rss, force_flush, force_break); + return 1; + + /* + * Make sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(folio_test_large(folio) && max_nr !=3D 1)) { + nr =3D folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + NULL); + + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, + addr, details, rss, force_flush, + force_break); + return nr; + } + zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, + details, rss, force_flush, force_break); + return 1; } =20 static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1609,6 +1644,7 @@ static unsigned long zap_pte_range(struct mmu_gather = *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; + int nr; =20 tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); @@ -1622,7 +1658,9 @@ static unsigned long zap_pte_range(struct mmu_gather = *tlb, pte_t ptent =3D ptep_get(pte); struct folio *folio; struct page *page; + int max_nr; =20 + nr =3D 1; if (pte_none(ptent)) continue; =20 @@ -1630,10 +1668,12 @@ static unsigned long zap_pte_range(struct mmu_gathe= r *tlb, break; =20 if (pte_present(ptent)) { - zap_present_pte(tlb, vma, pte, ptent, addr, details, - rss, &force_flush, &force_break); + max_nr =3D (end - addr) / PAGE_SIZE; + nr =3D zap_present_ptes(tlb, vma, pte, ptent, max_nr, + addr, details, rss, &force_flush, + &force_break); if (unlikely(force_break)) { - addr +=3D PAGE_SIZE; + addr +=3D nr * PAGE_SIZE; break; } continue; @@ -1687,8 +1727,8 @@ static unsigned long zap_pte_range(struct mmu_gather = *tlb, WARN_ON_ONCE(1); } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - } while (pte++, addr +=3D PAGE_SIZE, addr !=3D end); + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); + } while (pte +=3D nr, addr +=3D PAGE_SIZE * nr, addr !=3D end); =20 add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); --=20 2.43.0
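The generic get_and_clear_full_ptes() fallback introduced above boils down to clearing a run of PTEs while folding their dirty/accessed bits into a single representative value. A minimal self-contained sketch of that accumulation, with plain integers standing in for pte_t and simplified helper names (an illustration only, not the kernel code):

#include <stdio.h>

#define PTE_PRESENT	0x1u
#define PTE_DIRTY	0x2u
#define PTE_YOUNG	0x4u

typedef unsigned int pte_t;	/* stand-in for the real pte_t */

/* Clear one entry and return its old value, like ptep_get_and_clear_full(). */
static pte_t pte_get_and_clear(pte_t *ptep)
{
	pte_t pte = *ptep;

	*ptep = 0;
	return pte;
}

/* Clear @nr consecutive entries, merging dirty/young into the returned PTE. */
static pte_t get_and_clear_ptes(pte_t *ptep, unsigned int nr)
{
	pte_t pte = pte_get_and_clear(ptep);

	while (--nr) {
		pte_t tmp = pte_get_and_clear(++ptep);

		if (tmp & PTE_DIRTY)
			pte |= PTE_DIRTY;
		if (tmp & PTE_YOUNG)
			pte |= PTE_YOUNG;
	}
	return pte;
}

int main(void)
{
	/* Four PTEs mapping one folio; only the third one is dirty. */
	pte_t ptes[4] = {
		PTE_PRESENT | PTE_YOUNG,
		PTE_PRESENT,
		PTE_PRESENT | PTE_DIRTY,
		PTE_PRESENT,
	};
	pte_t pte = get_and_clear_ptes(ptes, 4);

	printf("folio dirty: %d, young: %d\n",
	       !!(pte & PTE_DIRTY), !!(pte & PTE_YOUNG));
	return 0;
}

With pte_mkdirty()/pte_mkyoung() in place of the plain bit ORs, this is essentially the loop that the generic fallback in include/linux/pgtable.h performs, which is why a single folio_mark_dirty()/folio_mark_accessed() call per batch is sufficient in zap_present_folio_ptes().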