From nobody Mon Apr 6 09:08:52 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F1C912E65D; Thu, 19 Mar 2026 23:37:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773963471; cv=none; b=MZqIjK7/7dVcvg31Q0pm644LIEpUnY2TDClY6t1sO02zmcCgunPkrnlokJfGpAY/wweTuSd9cKfJuJot4NCdddsnt1S0xhEtbzVs5dsrOiJnajqOgPfCvq92MRKgqdA7k9iktQwk41T3kbHPWwnxLUYcX4PldhVhMV03Px3PQso= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1773963471; c=relaxed/simple; bh=aIr7TfKEnhjyUIbFo1gldXXuVIRtbjWJ7wqzkAXHWqw=; h=Date:From:To:Cc:Subject:Message-ID:MIME-Version:Content-Type: Content-Disposition; b=HnEZ8QeoiK2LzDMh97zbHJktFORan9QF4vhZrwzIh1/nFSxa9Zp7IMaYGhRzX5uzefucBZMZzK0ha33IFinCgxe4FwyNovP4pbnXJXVz5i8tYwSkLXaNdEJtOnFT3wrNjxEPB4vfal/USuUe8/SaViKeI/Y5ZHjryIOg3KrWKBE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=u/GyXuLe; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="u/GyXuLe" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 02E63C19424; Thu, 19 Mar 2026 23:37:47 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1773963470; bh=aIr7TfKEnhjyUIbFo1gldXXuVIRtbjWJ7wqzkAXHWqw=; h=Date:From:To:Cc:Subject:From; b=u/GyXuLeOODnQupuxN4gi/j2QHgINgQ1hNo5CTyrVLNGqs8vzjdHngIwAzSmsTUjA X72eakAHMuD+3AoMGMVZ4MgvUmiOMpByerI48Um1rss2NBXNVg3HMNjUVCgkzGoaOk RS3HY2PyB54xhHd+wfTk4n6r19xdB8kp1YK3Sipgqcf1eAQCEknBD6SjYxDfBjQ3zH JU9/3P7Ugr8++7Vd3OAparfJWfY5OzHRTGY/JkuKAXRJtkLaDfZJtCp7GlQT6yTBzO vG73YKDoCvyxtZxBQ/DhTz8jIhyig9Ru9KgewBCwe3Nc+a0yWKUISfp1xC/9/uhOkt z3z9+ALBkNgIA== Date: Thu, 19 Mar 2026 16:37:45 -0700 From: Nathan Chancellor To: Mathieu Desnoyers , Thomas =?iso-8859-1?Q?Wei=DFschuh?= , Michal Clapinski Cc: Andrew Morton , Thomas Gleixner , Steven Rostedt , Masami Hiramatsu , linux-mm@kvack.org, linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org Subject: NULL pointer dereference when booting ppc64_guest_defconfig in QEMU on -next Message-ID: <20260319233745.GA769346@ax162> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="oKQGQYgdjErEaAqZ" Content-Disposition: inline Content-Transfer-Encoding: 8bit Content-Transfer-Encoding: quoted-printable --oKQGQYgdjErEaAqZ Content-Disposition: inline MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 SGkgYWxsLAoKSSBhbSBub3QgcmVhbGx5IHN1cmUgd2hvc2UgYnVnIHRoaXMgaXMsIGFzIGl0IG9u bHkgYXBwZWFycyB3aGVuIHRocmVlCnNlZW1pbmdseSBpbmRlcGVuZGVudCBwYXRjaCBzZXJpZXMg YXJlIGFwcGxpZWQgdG9nZXRoZXIsIHNvIEkgaGF2ZSBhZGRlZAp0aGUgcGF0Y2ggYXV0aG9ycyBh bmQgdGhlaXIgY29tbWl0dGVycyAoYWxvbmcgd2l0aCB0aGUgdHJhY2luZwptYWludGFpbmVycykg dG8gdGhpcyB0aHJlYWQuIEZlZWwgZnJlZSB0byBleHBhbmQgb3IgcmVkdWNlIHRoYXQgbGlzdCBh cwpuZWNlc3NhcnkuCgpPdXIgY29udGludW91cyBpbnRlZ3JhdGlvbiBoYXMgbm90aWNlZCBhIGNy YXNoIHdoZW4gYm9vdGluZwpwcGM2NF9ndWVzdF9kZWZjb25maWcgaW4gUUVNVSBvbiB0aGUgcGFz dCBmZXcgLW5leHQgdmVyc2lvbnMuCgogIGh0dHBzOi8vZ2l0aHViLmNvbS9DbGFuZ0J1aWx0TGlu dXgvY29udGludW91cy1pbnRlZ3JhdGlvbjIvYWN0aW9ucy9ydW5zLzI9CjMzMTExNTQ0OTIvam9i LzY3ODExNTI3MTEyCgpUaGlzIGRvZXMgbm90IGFwcGVhciB0byBiZSBjbGFuZyByZWxhdGVkLCBh cyBpdCBjYW4gYmUgcmVwcm9kdWNlZCB3aXRoCkdDQyAxNS4yLjAgYXMgd2VsbC4gVGhyb3VnaCBt dWx0aXBsZSBiaXNlY3RzLCBJIHdhcyBhYmxlIHRvIGxhbmQgb24KYXBwbHlpbmc6CgogIG1tOiBp bXByb3ZlIFJTUyBjb3VudGVyIGFwcHJveGltYXRpb24gYWNjdXJhY3kgZm9yIHByb2MgaW50ZXJm YWNlcyBbMV0KICB2ZHNvL2RhdGFzdG9yZTogQWxsb2NhdGUgZGF0YSBwYWdlcyBkeW5hbWljYWxs eSBbMl0KICBraG86IGZpeCBkZWZlcnJlZCBpbml0IG9mIGtobyBzY3JhdGNoIFszXQoKYW5kIHRo ZWlyIGRlcGVuZGVudCBjaGFuZ2VzIG9uIHRvcCBvZiA3LjAtcmM0IGlzIGVub3VnaCB0byByZXBy b2R1Y2UKdGhpcyAoYXQgbGVhc3Qgb24gdHdvIG9mIG15IG1hY2hpbmVzIHdpdGggdGhlIHNhbWUg Y29tbWFuZHMpLiBJIGhhdmUKYXR0YWNoZWQgdGhlIGRpZmYgZnJvbSB0aGUgcmVzdWx0IG9mIHRo ZSBmb2xsb3dpbmcgJ2dpdCBhcHBseScgY29tbWFuZHMKYmVsb3csIGRvbmUgaW4gYSBsaW51eC1u ZXh0IGNoZWNrb3V0LgoKICAkIGdpdCBjaGVja291dCB2Ny4wLXJjNAogIEhFQUQgaXMgbm93IGF0 IGYzMzhlNzczODM3OCBMaW51eCA3LjAtcmM0CgogICMgWzFdCiAgJCBnaXQgZGlmZiA2MGRkZjNl ZWQ0OTk5YmFlNDQwZDFjZjllNTg2OGNjYjNmMzA4YjY0Xi4uMDg3ZGQ2ZDJjYzEyYzgyOTQ1YT0K Yjg1OTE5NGMzMmU4ZTk3N2RhYWUzIHwgZ2l0IGFwcGx5IC0zdgogIC4uLgoKICAjIFsyXQogICMg Rml4IHRyaXZpYWwgY29uZmxpY3QgaW4gaW5pdC9tYWluLmMgYXJvdW5kIGhlYWRlcnMKICAkIGdp dCBkaWZmIGRjNDMyYWI3MTMwYmIzOWY1YTM1MTI4MWEwMmQ0YmM2MWU4NWExNGFeLi4wNTk4OGRi YTExNzkxY2NiYjQ1PQo4MjU0NDg0ODI2YjMyZjE3ZjRhZDIgfCBnaXQgYXBwbHkgLTN2CiAgLi4u CgogICMgWzNdCiAgIyBGaXggY29uZmxpY3QgaW4ga2VybmVsL2xpdmV1cGRhdGUva2V4ZWNfaGFu ZG92ZXIuYyBkdWUgdG8gbGFjayBvZiBraG9fbT0KZW1fcmV0cmlldmUoKSwganVzdCBhZGQgcGZu X2lzX2tob19zY3JhdGNoKCkKICAkIGdpdCBzaG93IDRhNzg0NjdmZmI1Mzc0NjM0ODY5NjgyMzJk YWVmMWU4YTJmMTA1ZTMgfCBnaXQgYXBwbHkgLTN2CiAgLi4uCgogICQgbWFrZSAtc2tqIiQobnBy b2MpIiBBUkNIPTNEcG93ZXJwYyBDUk9TU19DT01QSUxFPTNEcG93ZXJwYzY0LWxpbnV4LSBtcnA9 CnJvcGVyIHBwYzY0X2d1ZXN0X2RlZmNvbmZpZyB2bWxpbnV4CgogICQgY3VybCAtTFNzIGh0dHBz Oi8vZ2l0aHViLmNvbS9DbGFuZ0J1aWx0TGludXgvYm9vdC11dGlscy9yZWxlYXNlcy9kb3dubG89 CmFkLzIwMjQxMTIwLTA0NDQzNC9wcGM2NC1yb290ZnMuY3Bpby56c3QgfCB6c3RkIC1kID5yb290 ZnMuY3BpbwoKICAkIHFlbXUtc3lzdGVtLXBwYzY0IFwKICAgICAgLWRpc3BsYXkgbm9uZSBcCiAg ICAgIC1ub2RlZmF1bHRzIFwKICAgICAgLWNwdSBwb3dlcjggXAogICAgICAtbWFjaGluZSBwc2Vy aWVzIFwKICAgICAgLXZnYSBub25lIFwKICAgICAgLWtlcm5lbCB2bWxpbnV4IFwKICAgICAgLWlu aXRyZCByb290ZnMuY3BpbyBcCiAgICAgIC1tIDFHIFwKICAgICAgLXNlcmlhbCBtb246c3RkaW8K ICAuLi4KICBbICAgIDAuMDAwMDAwXVsgICAgVDBdIExpbnV4IHZlcnNpb24gNy4wLjAtcmM0LWRp cnR5IChuYXRoYW5AZnJhbWV3b3JrLWFtPQpkLXJ5emVuLW1heHBsdXMtMzk1KSAocG93ZXJwYzY0 LWxpbnV4LWdjYyAoR0NDKSAxNS4yLjAsIEdOVSBsZCAoR05VIEJpbnV0aWw9CnMpIDIuNDUpICMx IFNNUCBQUkVFTVBUIFRodSBNYXIgMTkgMTU6NDU6NTMgTVNUIDIwMjYKICAuLi4KICBbICAgIDAu MjE2NzY0XVsgICAgVDFdIHZnYWFyYjogbG9hZGVkCiAgWyAgICAwLjIxNzU5MF1bICAgIFQxXSBj bG9ja3NvdXJjZTogU3dpdGNoZWQgdG8gY2xvY2tzb3VyY2UgdGltZWJhc2UKICBbICAgIDAuMjIx MDA3XVsgICBUMTJdIEJVRzogS2VybmVsIE5VTEwgcG9pbnRlciBkZXJlZmVyZW5jZSBhdCAweDAw MDAwMDEwCiAgWyAgICAwLjIyMTA0OV1bICAgVDEyXSBGYXVsdGluZyBpbnN0cnVjdGlvbiBhZGRy ZXNzOiAweGMwMDAwMDAwMDA0NDk0N2MKICBbICAgIDAuMjIxMjM3XVsgICBUMTJdIE9vcHM6IEtl cm5lbCBhY2Nlc3Mgb2YgYmFkIGFyZWEsIHNpZzogMTEgWyMxXQogIFsgICAgMC4yMjEyNzZdWyAg IFQxMl0gQkUgUEFHRV9TSVpFPTNENjRLIE1NVT0zREhhc2ggIFNNUCBOUl9DUFVTPTNEMjA0OCA9 Ck5VTUEgcFNlcmllcwogIFsgICAgMC4yMjEzNTldWyAgIFQxMl0gTW9kdWxlcyBsaW5rZWQgaW46 CiAgWyAgICAwLjIyMTU1Nl1bICAgVDEyXSBDUFU6IDAgVUlEOiAwIFBJRDogMTIgQ29tbToga3dv cmtlci91NDowIE5vdCB0YWludD0KZWQgNy4wLjAtcmM0LWRpcnR5ICMxIFBSRUVNUFRMQVpZCiAg WyAgICAwLjIyMTYzMV1bICAgVDEyXSBIYXJkd2FyZSBuYW1lOiBJQk0gcFNlcmllcyAoZW11bGF0 ZWQgYnkgcWVtdSkgUE9XRT0KUjggKGFyY2hpdGVjdGVkKSAweDRkMDIwMCAweGYwMDAwMDQgb2Y6 U0xPRixIRUFEIHBTZXJpZXMKICBbICAgIDAuMjIxNzY1XVsgICBUMTJdIFdvcmtxdWV1ZTogdHJh Y2VfaW5pdF93cSB0cmFjZXJfaW5pdF90cmFjZWZzX3dvcmtfPQpmdW5jCiAgWyAgICAwLjIyMjA2 NV1bICAgVDEyXSBOSVA6ICBjMDAwMDAwMDAwNDQ5NDdjIExSOiBjMDAwMDAwMDAwNDFhNTg0IENU UjogYz0KMDAwMDAwMDAwNTNhYTkwCiAgWyAgICAwLjIyMjA4NF1bICAgVDEyXSBSRUdTOiBjMDAw MDAwMDAzYmM3OTYwIFRSQVA6IDAzODAgICBOb3QgdGFpbnRlZCAgKD0KNy4wLjAtcmM0LWRpcnR5 KQogIFsgICAgMC4yMjIxMTFdWyAgIFQxMl0gTVNSOiAgODAwMDAwMDAwMDAwOTAzMiA8U0YsRUUs TUUsSVIsRFIsUkk+ICBDUjogNDQ9CjAwMDIwNCAgWEVSOiAwMDAwMDAwMAogIFsgICAgMC4yMjIy ODddWyAgIFQxMl0gQ0ZBUjogYzAwMDAwMDAwMDQ0OTQyMCBJUlFNQVNLOiAwCiAgWyAgICAwLjIy MjI4N11bICAgVDEyXSBHUFIwMDogYzAwMDAwMDAwMDQxYTU4NCBjMDAwMDAwMDAzYmM3YzAwIGMw MDAwMDAwMD0KMWMwODEwMCBjMDAwMDAwMDAyODkyZjIwCiAgWyAgICAwLjIyMjI4N11bICAgVDEy XSBHUFIwNDogYzAwMDAwMDAwMTljZmE2OCBjMDAwMDAwMDAxOWNmYTYwIDAwMDAwMDAwMD0KMDAw MDAwMSAwMDAwMDAwMDAwMDAwMDY0CiAgWyAgICAwLjIyMjI4N11bICAgVDEyXSBHUFIwODogMDAw MDAwMDAwMDAwMDAwMiAwMDAwMDAwMDAwMDAwMDAwIGMwMDAwMDAwMD0KM2JiYTAwMCAwMDAwMDAw MDAwMDAwMDEwCiAgWyAgICAwLjIyMjI4N11bICAgVDEyXSBHUFIxMjogYzAwMDAwMDAwMDUzYWE5 MCBjMDAwMDAwMDAyYzUwMDAwIGMwMDAwMDAwMD0KMWFiMjVmOCBjMDAwMDAwMDAxNjI2NjkwCiAg WyAgICAwLjIyMjI4N11bICAgVDEyXSBHUFIxNjogMDAwMDAwMDAwMDAwMDAwMCAwMDAwMDAwMDAw MDAwMDAwIDAwMDAwMDAwMD0KMDAwMDAwMCAwMDAwMDAwMDAwMDAwMDAwCiAgWyAgICAwLjIyMjI4 N11bICAgVDEyXSBHUFIyMDogYzAwMDAwMDAwMTYyNDg2OCBjMDAwMDAwMDAxYWIyNzA4IGMwMDAw MDAwMD0KMTljZmEwOCBjMDAwMDAwMDAxYTAwZDE4CiAgWyAgICAwLjIyMjI4N11bICAgVDEyXSBH UFIyNDogYzAwMDAwMDAwMTljZmExOCBmZmZmZmZmZmZmZmZmZWY3IGMwMDAwMDAwMD0KMzA1MTIw NSBjMDAwMDAwMDAxOWNmYTY4CiAgWyAgICAwLjIyMjI4N11bICAgVDEyXSBHUFIyODogMDAwMDAw MDAwMDAwMDAwMCBjMDAwMDAwMDAxOWNmYTYwIGMwMDAwMDAwMD0KMjg5NGU5MCAwMDAwMDAwMDAw MDAwMDAwCiAgWyAgICAwLjIyMjUyNl1bICAgVDEyXSBOSVAgW2MwMDAwMDAwMDA0NDk0N2NdIF9f ZmluZF9ldmVudF9maWxlKzB4OWMvMHgxMTAKICBbICAgIDAuMjIyNTcyXVsgICBUMTJdIExSIFtj MDAwMDAwMDAwNDFhNTg0XSBpbml0X3RyYWNlcl90cmFjZWZzKzB4Mjc0LzB4PQpjYzAKICBbICAg IDAuMjIyNjQzXVsgICBUMTJdIENhbGwgVHJhY2U6CiAgWyAgICAwLjIyMjY5MF1bICAgVDEyXSBb YzAwMDAwMDAwM2JjN2MwMF0gW2MwMDAwMDAwMDBiOTQzYjBdIHRyYWNlZnNfY3JlYT0KdGVfZmls ZSsweDFhMC8weDJiMCAodW5yZWxpYWJsZSkKICBbICAgIDAuMjIyNzY2XVsgICBUMTJdIFtjMDAw MDAwMDAzYmM3YzUwXSBbYzAwMDAwMDAwMDQxYTU4NF0gaW5pdF90cmFjZXJfPQp0cmFjZWZzKzB4 Mjc0LzB4Y2MwCiAgWyAgICAwLjIyMjc5MV1bICAgVDEyXSBbYzAwMDAwMDAwM2JjN2RjMF0gW2Mw MDAwMDAwMDIwNDZmMWNdIHRyYWNlcl9pbml0Xz0KdHJhY2Vmc193b3JrX2Z1bmMrMHg1MC8weDMy MAogIFsgICAgMC4yMjI4MDldWyAgIFQxMl0gW2MwMDAwMDAwMDNiYzdlNTBdIFtjMDAwMDAwMDAw Mjc2OTU4XSBwcm9jZXNzX29uZV89CndvcmsrMHgxYjgvMHg1MzAKICBbICAgIDAuMjIyODI4XVsg ICBUMTJdIFtjMDAwMDAwMDAzYmM3ZjEwXSBbYzAwMDAwMDAwMDI3Nzc4Y10gd29ya2VyX3RocmVh PQpkKzB4MWRjLzB4M2QwCiAgWyAgICAwLjIyMjg4M11bICAgVDEyXSBbYzAwMDAwMDAwM2JjN2Y5 MF0gW2MwMDAwMDAwMDAyODRjNDRdIGt0aHJlYWQrMHgxOT0KNC8weDFiMAogIFsgICAgMC4yMjI5 MDBdWyAgIFQxMl0gW2MwMDAwMDAwMDNiYzdmZTBdIFtjMDAwMDAwMDAwMDBjZjMwXSBzdGFydF9r ZXJuZWw9Cl90aHJlYWQrMHgxNC8weDE4CiAgWyAgICAwLjIyMjk2MV1bICAgVDEyXSBDb2RlOiA3 YzY5MWI3OCA3ZjYzZGI3OCAyYzA5MDAwMCA0MDgyMDAxOCBlODljMDAwMD0KIDQ5MTA3ZjIxIDYw MDAwMDAwIDJjMDMwMDAwIDQxODIwMDQ4IGViZmYwMDAwIDdjM2ZmMDQwIDQxODIwMDM4IDxlOTNm MDAxMD4gPQo3ZmEzZWI3OCA4MTQ5MDA1OCBlODg5MDAxOAogIFsgICAgMC4yMjMxOTBdWyAgIFQx Ml0gLS0tWyBlbmQgdHJhY2UgMDAwMDAwMDAwMDAwMDAwMCBdLS0tCiAgLi4uCgpJbnRlcmVzdGlu Z2x5LCB0dXJuaW5nIG9uIENPTkZJR19LQVNBTiBhcHBlYXJzIHRvIGhpZGUgdGhpcywgbWF5YmUK cG9pbnRpbmcgdG8gc29tZSBzb3J0IG9mIG1lbW9yeSBjb3JydXB0aW9uIChvciBzb21ldGhpbmcg dGltaW5nCnJlbGF0ZWQpPyBJZiB0aGVyZSBpcyBhbnkgb3RoZXIgaW5mb3JtYXRpb24gSSBjYW4g cHJvdmlkZSwgSSBhbSBtb3JlCnRoYW4gaGFwcHkgdG8gZG8gc28uCgpbMV06IGh0dHBzOi8vbG9y ZS5rZXJuZWwub3JnLzIwMjYwMjI3MTUzNzMwLjE1NTY1NDItNC1tYXRoaWV1LmRlc25veWVyc0Bl ZmY9CmljaW9zLmNvbS8KWzJdOiBodHRwczovL2xvcmUua2VybmVsLm9yZy8yMDI2MDMwNC12ZHNv LXNwYXJjNjQtZ2VuZXJpYy0yLXY2LTMtZDhlYjNiMGUxPQo0MTBAbGludXRyb25peC5kZS8KWzNd OiBodHRwczovL2xvcmUua2VybmVsLm9yZy8yMDI2MDMxMTEyNTUzOS40MTIzNjcyLTItbWNsYXBp bnNraUBnb29nbGUuY29tLwoKQ2hlZXJzLApOYXRoYW4= --oKQGQYgdjErEaAqZ Content-Type: text/plain; charset="iso-8859-1" Content-Disposition: attachment; filename=diff Content-Transfer-Encoding: quoted-printable diff --git a/Documentation/core-api/percpu-counter-tree.rst b/Documentation= /core-api/percpu-counter-tree.rst new file mode 100644 index 000000000000..196da056e7b4 --- /dev/null +++ b/Documentation/core-api/percpu-counter-tree.rst @@ -0,0 +1,75 @@ +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D +The Hierarchical Per-CPU Counters (HPCC) +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D + +:Author: Mathieu Desnoyers + +Introduction +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D + +Counters come in many varieties, each with their own trade offs: + + * A global atomic counter provides a fast read access to the current + sum, at the expense of cache-line bouncing on updates. This leads to + poor performance of frequent updates from various cores on large SMP + systems. + + * A per-cpu split counter provides fast updates to per-cpu counters, + at the expense of a slower aggregation (sum). The sum operation needs + to iterate over all per-cpu counters to calculate the current total. + +The hierarchical per-cpu counters attempt to provide the best of both +worlds (fast updates, and fast sum) by relaxing requirements on the sum +accuracy. It allows quickly querying an approximated sum value, along +with the possible min/max ranges of the associated precise sum. The +exact precise sum can still be calculated with an iteration on all +per-cpu counter, but the availability of an approximated sum value with +possible precise sum min/max ranges allows eliminating candidates which +are certainly outside of a known target range without the overhead of +precise sums. + +Overview +=3D=3D=3D=3D=3D=3D=3D=3D + +The herarchical per-cpu counters are organized as a tree with the tree +root at the bottom (last level) and the first level of the tree +consisting of per-cpu counters. + +The intermediate tree levels contain carry propagation counters. When +reaching a threshold (batch size), the carry is propagated down the +tree. + +This allows reading an approximated value at the root, which has a +bounded accuracy (minimum/maximum possible precise sum range) determined +by the tree topology. + +Use Cases +=3D=3D=3D=3D=3D=3D=3D=3D=3D + +Use cases HPCC is meant to handle invove tracking resources which are +used across many CPUs to quickly sum as feedback for decision making to +apply throttling, quota limits, sort tasks, and perform memory or task +migration decisions. When considering approximated sums within the +accuracy range of the decision threshold, the user can either: + + * Be conservative and fast: Consider that the sum has reached the + limit as soon as the given limit is within the approximation range. + + * Be aggressive and fast: Consider that the sum is over the + limit only when the approximation range is over the given limit. + + * Be precise and slow: Do a precise comparison with the limit, which + requires a precise sum when the limit is within the approximated + range. + +One use-case for these hierarchical counters is to implement a two-pass +algorithm to speed up sorting picking a maximum/minimunm sum value from +a set. A first pass compares the approximated values, and then a second +pass only needs the precise sum for counter trees which are within the +possible precise sum range of the counter tree chosen by the first pass. + +Functions and structures +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D + +.. kernel-doc:: include/linux/percpu_counter_tree.h +.. kernel-doc:: lib/percpu_counter_tree.c diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index ac4129d1d741..612a6da6127a 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -35,6 +35,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *prese= rvation); int kho_add_subtree(const char *name, void *fdt); void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); +bool pfn_is_kho_scratch(unsigned long pfn); =20 void kho_memory_init(void); =20 @@ -109,6 +110,11 @@ static inline int kho_retrieve_subtree(const char *nam= e, phys_addr_t *phys) return -EOPNOTSUPP; } =20 +static inline bool pfn_is_kho_scratch(unsigned long pfn) +{ + return false; +} + static inline void kho_memory_init(void) { } =20 static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6ec5e9ac0699..3e217414e12d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -614,11 +614,9 @@ static inline void memtest_report_meminfo(struct seq_f= ile *m) { } #ifdef CONFIG_MEMBLOCK_KHO_SCRATCH void memblock_set_kho_scratch_only(void); void memblock_clear_kho_scratch_only(void); -void memmap_init_kho_scratch_pages(void); #else static inline void memblock_set_kho_scratch_only(void) { } static inline void memblock_clear_kho_scratch_only(void) { } -static inline void memmap_init_kho_scratch_pages(void) {} #endif =20 #endif /* _LINUX_MEMBLOCK_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index abb4963c1f06..b2e478b14c87 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,38 +3057,47 @@ static inline bool get_user_page_fast_only(unsigned= long addr, { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) =3D=3D 1; } + +static inline struct percpu_counter_tree_level_item *get_rss_stat_items(st= ruct mm_struct *mm) +{ + unsigned long ptr =3D (unsigned long)mm; + + ptr +=3D offsetof(struct mm_struct, flexible_array); + return (struct percpu_counter_tree_level_item *)ptr; +} + /* * per-process(per-mm_struct) statistics. */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int membe= r) { - return percpu_counter_read_positive(&mm->rss_stat[member]); + return percpu_counter_tree_approximate_sum_positive(&mm->rss_stat[member]= ); } =20 static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int m= ember) { - return percpu_counter_sum_positive(&mm->rss_stat[member]); + return percpu_counter_tree_precise_sum_positive(&mm->rss_stat[member]); } =20 void mm_trace_rss_stat(struct mm_struct *mm, int member); =20 static inline void add_mm_counter(struct mm_struct *mm, int member, long v= alue) { - percpu_counter_add(&mm->rss_stat[member], value); + percpu_counter_tree_add(&mm->rss_stat[member], value); =20 mm_trace_rss_stat(mm, member); } =20 static inline void inc_mm_counter(struct mm_struct *mm, int member) { - percpu_counter_inc(&mm->rss_stat[member]); + percpu_counter_tree_add(&mm->rss_stat[member], 1); =20 mm_trace_rss_stat(mm, member); } =20 static inline void dec_mm_counter(struct mm_struct *mm, int member) { - percpu_counter_dec(&mm->rss_stat[member]); + percpu_counter_tree_add(&mm->rss_stat[member], -1); =20 mm_trace_rss_stat(mm, member); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cc8ae722886..1a808d78245d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -1118,6 +1118,19 @@ typedef struct { DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } __private mm_flags_t; =20 +/* + * The alignment of the mm_struct flexible array is based on the largest + * alignment of its content: + * __alignof__(struct percpu_counter_tree_level_item) provides a + * cacheline aligned alignment on SMP systems, else alignment on + * unsigned long on UP systems. + */ +#ifdef CONFIG_SMP +# define __mm_struct_flexible_array_aligned __aligned(__alignof__(struct p= ercpu_counter_tree_level_item)) +#else +# define __mm_struct_flexible_array_aligned __aligned(__alignof__(unsigned= long)) +#endif + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1263,7 +1276,7 @@ struct mm_struct { unsigned long saved_e_flags; #endif =20 - struct percpu_counter rss_stat[NR_MM_COUNTERS]; + struct percpu_counter_tree rss_stat[NR_MM_COUNTERS]; =20 struct linux_binfmt *binfmt; =20 @@ -1374,10 +1387,13 @@ struct mm_struct { } __randomize_layout; =20 /* - * The mm_cpumask needs to be at the end of mm_struct, because it - * is dynamically sized based on nr_cpu_ids. + * The rss hierarchical counter items, mm_cpumask, and mm_cid + * masks need to be at the end of mm_struct, because they are + * dynamically sized based on nr_cpu_ids. + * The content of the flexible array needs to be placed in + * decreasing alignment requirement order. */ - char flexible_array[] __aligned(__alignof__(unsigned long)); + char flexible_array[] __mm_struct_flexible_array_aligned; }; =20 /* Copy value to the first system word of mm flags, non-atomically. */ @@ -1414,24 +1430,30 @@ static inline void __mm_flags_set_mask_bits_word(st= ruct mm_struct *mm, MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; =20 -#define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ -{ \ - [0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] =3D 0 \ +#define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ +{ \ + [0 ... (PERCPU_COUNTER_TREE_ITEMS_STATIC_SIZE * NR_MM_COUNTERS) + sizeof(= cpumask_t) + MM_CID_STATIC_SIZE - 1] =3D 0 \ } =20 -/* Pointer magic because the dynamic array size confuses some compilers. */ -static inline void mm_init_cpumask(struct mm_struct *mm) +static inline size_t get_rss_stat_items_size(void) { - unsigned long cpu_bitmap =3D (unsigned long)mm; - - cpu_bitmap +=3D offsetof(struct mm_struct, flexible_array); - cpumask_clear((struct cpumask *)cpu_bitmap); + return percpu_counter_tree_items_size() * NR_MM_COUNTERS; } =20 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { - return (struct cpumask *)&mm->flexible_array; + unsigned long ptr =3D (unsigned long)mm; + + ptr +=3D offsetof(struct mm_struct, flexible_array); + /* Skip RSS stats counters. */ + ptr +=3D get_rss_stat_items_size(); + return (struct cpumask *)ptr; +} + +static inline void mm_init_cpumask(struct mm_struct *mm) +{ + cpumask_clear((struct cpumask *)mm_cpumask(mm)); } =20 #ifdef CONFIG_LRU_GEN @@ -1523,6 +1545,8 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_st= ruct *mm) unsigned long bitmap =3D (unsigned long)mm; =20 bitmap +=3D offsetof(struct mm_struct, flexible_array); + /* Skip RSS stats counters. */ + bitmap +=3D get_rss_stat_items_size(); /* Skip cpu_bitmap */ bitmap +=3D cpumask_size(); return (struct cpumask *)bitmap; diff --git a/include/linux/percpu_counter_tree.h b/include/linux/percpu_cou= nter_tree.h new file mode 100644 index 000000000000..828c763edd4a --- /dev/null +++ b/include/linux/percpu_counter_tree.h @@ -0,0 +1,367 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR MIT */ +/* SPDX-FileCopyrightText: 2025 Mathieu Desnoyers */ + +#ifndef _PERCPU_COUNTER_TREE_H +#define _PERCPU_COUNTER_TREE_H + +#include +#include +#include + +#ifdef CONFIG_SMP + +#if NR_CPUS =3D=3D (1U << 0) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 0 +#elif NR_CPUS <=3D (1U << 1) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 1 +#elif NR_CPUS <=3D (1U << 2) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 3 +#elif NR_CPUS <=3D (1U << 3) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 7 +#elif NR_CPUS <=3D (1U << 4) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 7 +#elif NR_CPUS <=3D (1U << 5) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 11 +#elif NR_CPUS <=3D (1U << 6) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 21 +#elif NR_CPUS <=3D (1U << 7) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 21 +#elif NR_CPUS <=3D (1U << 8) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 37 +#elif NR_CPUS <=3D (1U << 9) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 73 +#elif NR_CPUS <=3D (1U << 10) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 149 +#elif NR_CPUS <=3D (1U << 11) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 293 +#elif NR_CPUS <=3D (1U << 12) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 585 +#elif NR_CPUS <=3D (1U << 13) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 1173 +#elif NR_CPUS <=3D (1U << 14) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 2341 +#elif NR_CPUS <=3D (1U << 15) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 4681 +#elif NR_CPUS <=3D (1U << 16) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 4681 +#elif NR_CPUS <=3D (1U << 17) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 8777 +#elif NR_CPUS <=3D (1U << 18) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 17481 +#elif NR_CPUS <=3D (1U << 19) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 34953 +#elif NR_CPUS <=3D (1U << 20) +# define PERCPU_COUNTER_TREE_STATIC_NR_ITEMS 69905 +#else +# error "Unsupported number of CPUs." +#endif + +struct percpu_counter_tree_level_item { + atomic_long_t count; /* + * Count the number of carry for this tree item. + * The carry counter is kept at the order of the + * carry accounted for at this tree level. + */ +} ____cacheline_aligned_in_smp; + +#define PERCPU_COUNTER_TREE_ITEMS_STATIC_SIZE \ + (PERCPU_COUNTER_TREE_STATIC_NR_ITEMS * sizeof(struct percpu_counter_tree_= level_item)) + +struct percpu_counter_tree { + /* Fast-path fields. */ + unsigned long __percpu *level0; /* Pointer to per-CPU split counters (tre= e level 0). */ + unsigned long level0_bit_mask; /* Bit mask to apply to detect carry propa= gation from tree level 0. */ + union { + unsigned long *i; /* Approximate sum for single-CPU topology. */ + atomic_long_t *a; /* Approximate sum for SMP topology. */ + } approx_sum; + long bias; /* Bias to apply to counter precise and approximate values. = */ + + /* Slow-path fields. */ + struct percpu_counter_tree_level_item *items; /* Array of tree items for = levels 1 to N. */ + unsigned long batch_size; /* + * The batch size is the increment step at level 0 which + * triggers a carry propagation. The batch size is required + * to be greater than 1, and a power of 2. + */ + /* + * The tree approximate sum is guaranteed to be within this accuracy rang= e: + * (precise_sum - approx_accuracy_range.under) <=3D approx_sum <=3D (prec= ise_sum + approx_accuracy_range.over). + * This accuracy is derived from the hardware topology and the tree batch= _size. + * The "under" accuracy is larger than the "over" accuracy because the ne= gative range of a + * two's complement signed integer is one unit larger than the positive r= ange. This delta + * is summed for each tree item, which leads to a significantly larger "u= nder" accuracy range + * compared to the "over" accuracy range. + */ + struct { + unsigned long under; + unsigned long over; + } approx_accuracy_range; +}; + +size_t percpu_counter_tree_items_size(void); +int percpu_counter_tree_init_many(struct percpu_counter_tree *counters, st= ruct percpu_counter_tree_level_item *items, + unsigned int nr_counters, unsigned long batch_size, gfp_t gfp_flags); +int percpu_counter_tree_init(struct percpu_counter_tree *counter, struct p= ercpu_counter_tree_level_item *items, + unsigned long batch_size, gfp_t gfp_flags); +void percpu_counter_tree_destroy_many(struct percpu_counter_tree *counter,= unsigned int nr_counters); +void percpu_counter_tree_destroy(struct percpu_counter_tree *counter); +void percpu_counter_tree_add(struct percpu_counter_tree *counter, long inc= ); +long percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter); +int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a,= struct percpu_counter_tree *b); +int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tr= ee *counter, long v); +int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, str= uct percpu_counter_tree *b); +int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *= counter, long v); +void percpu_counter_tree_set(struct percpu_counter_tree *counter, long v); +int percpu_counter_tree_subsystem_init(void); + +/** + * percpu_counter_tree_approximate_sum() - Return approximate counter sum. + * @counter: The counter to sum. + * + * Querying the approximate sum is fast, but it is only accurate within + * the bounds delimited by percpu_counter_tree_approximate_accuracy_range(= ). + * This is meant to be used when speed is preferred over accuracy. + * + * Return: The current approximate counter sum. + */ +static inline +long percpu_counter_tree_approximate_sum(struct percpu_counter_tree *count= er) +{ + unsigned long v; + + if (!counter->level0_bit_mask) + v =3D READ_ONCE(*counter->approx_sum.i); + else + v =3D atomic_long_read(counter->approx_sum.a); + return (long) (v + (unsigned long)READ_ONCE(counter->bias)); +} + +/** + * percpu_counter_tree_approximate_accuracy_range - Query the accuracy ran= ge for a counter tree. + * @counter: Counter to query. + * @under: Pointer to a variable to be incremented of the approximation + * accuracy range below the precise sum. + * @over: Pointer to a variable to be incremented of the approximation + * accuracy range above the precise sum. + * + * Query the accuracy range limits for the counter. + * Because of two's complement binary representation, the "under" range is= typically + * slightly larger than the "over" range. + * Those values are derived from the hardware topology and the counter tre= e batch size. + * They are invariant for a given counter tree. + * Using this function should not be typically required, see the following= functions instead: + * * percpu_counter_tree_approximate_compare(), + * * percpu_counter_tree_approximate_compare_value(), + * * percpu_counter_tree_precise_compare(), + * * percpu_counter_tree_precise_compare_value(). + */ +static inline +void percpu_counter_tree_approximate_accuracy_range(struct percpu_counter_= tree *counter, + unsigned long *under, unsigned long *over) +{ + *under +=3D counter->approx_accuracy_range.under; + *over +=3D counter->approx_accuracy_range.over; +} + +#else /* !CONFIG_SMP */ + +#define PERCPU_COUNTER_TREE_ITEMS_STATIC_SIZE 0 + +struct percpu_counter_tree_level_item; + +struct percpu_counter_tree { + atomic_long_t count; +}; + +static inline +size_t percpu_counter_tree_items_size(void) +{ + return 0; +} + +static inline +int percpu_counter_tree_init_many(struct percpu_counter_tree *counters, st= ruct percpu_counter_tree_level_item *items, + unsigned int nr_counters, unsigned long batch_size, gfp_t gfp_flags) +{ + for (unsigned int i =3D 0; i < nr_counters; i++) + atomic_long_set(&counters[i].count, 0); + return 0; +} + +static inline +int percpu_counter_tree_init(struct percpu_counter_tree *counter, struct p= ercpu_counter_tree_level_item *items, + unsigned long batch_size, gfp_t gfp_flags) +{ + return percpu_counter_tree_init_many(counter, items, 1, batch_size, gfp_f= lags); +} + +static inline +void percpu_counter_tree_destroy_many(struct percpu_counter_tree *counter,= unsigned int nr_counters) +{ +} + +static inline +void percpu_counter_tree_destroy(struct percpu_counter_tree *counter) +{ +} + +static inline +long percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter) +{ + return atomic_long_read(&counter->count); +} + +static inline +int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, str= uct percpu_counter_tree *b) +{ + long count_a =3D percpu_counter_tree_precise_sum(a), + count_b =3D percpu_counter_tree_precise_sum(b); + + if (count_a =3D=3D count_b) + return 0; + if (count_a < count_b) + return -1; + return 1; +} + +static inline +int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *= counter, long v) +{ + long count =3D percpu_counter_tree_precise_sum(counter); + + if (count =3D=3D v) + return 0; + if (count < v) + return -1; + return 1; +} + +static inline +int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a,= struct percpu_counter_tree *b) +{ + return percpu_counter_tree_precise_compare(a, b); +} + +static inline +int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tr= ee *counter, long v) +{ + return percpu_counter_tree_precise_compare_value(counter, v); +} + +static inline +void percpu_counter_tree_set(struct percpu_counter_tree *counter, long v) +{ + atomic_long_set(&counter->count, v); +} + +static inline +void percpu_counter_tree_approximate_accuracy_range(struct percpu_counter_= tree *counter, + unsigned long *under, unsigned long *over) +{ +} + +static inline +void percpu_counter_tree_add(struct percpu_counter_tree *counter, long inc) +{ + atomic_long_add(inc, &counter->count); +} + +static inline +long percpu_counter_tree_approximate_sum(struct percpu_counter_tree *count= er) +{ + return percpu_counter_tree_precise_sum(counter); +} + +static inline +int percpu_counter_tree_subsystem_init(void) +{ + return 0; +} + +#endif /* CONFIG_SMP */ + +/** + * percpu_counter_tree_approximate_sum_positive() - Return a positive appr= oximate counter sum. + * @counter: The counter to sum. + * + * Return an approximate counter sum which is guaranteed to be greater + * or equal to 0. + * + * Return: The current positive approximate counter sum. + */ +static inline +long percpu_counter_tree_approximate_sum_positive(struct percpu_counter_tr= ee *counter) +{ + long v =3D percpu_counter_tree_approximate_sum(counter); + return v > 0 ? v : 0; +} + +/** + * percpu_counter_tree_precise_sum_positive() - Return a positive precise = counter sum. + * @counter: The counter to sum. + * + * Return a precise counter sum which is guaranteed to be greater + * or equal to 0. + * + * Return: The current positive precise counter sum. + */ +static inline +long percpu_counter_tree_precise_sum_positive(struct percpu_counter_tree *= counter) +{ + long v =3D percpu_counter_tree_precise_sum(counter); + return v > 0 ? v : 0; +} + +/** + * percpu_counter_tree_approximate_min_max_range() - Return the approximat= ion min and max precise values. + * @approx_sum: Approximated sum. + * @under: Tree accuracy range (under). + * @over: Tree accuracy range (over). + * @precise_min: Minimum possible value for precise sum (output). + * @precise_max: Maximum possible value for precise sum (output). + * + * Calculate the minimum and maximum precise values for a given + * approximation and (under, over) accuracy range. + * + * The range of the approximation as a function of the precise sum is expr= essed as: + * + * approx_sum >=3D precise_sum - approx_accuracy_range.under + * approx_sum <=3D precise_sum + approx_accuracy_range.over + * + * Therefore, the range of the precise sum as a function of the approximat= ion is expressed as: + * + * precise_sum <=3D approx_sum + approx_accuracy_range.under + * precise_sum >=3D approx_sum - approx_accuracy_range.over + */ +static inline +void percpu_counter_tree_approximate_min_max_range(long approx_sum, unsign= ed long under, unsigned long over, + long *precise_min, long *precise_max) +{ + *precise_min =3D approx_sum - over; + *precise_max =3D approx_sum + under; +} + +/** + * percpu_counter_tree_approximate_min_max() - Return the tree approximati= on, min and max possible precise values. + * @counter: The counter to sum. + * @approx_sum: Approximate sum (output). + * @precise_min: Minimum possible value for precise sum (output). + * @precise_max: Maximum possible value for precise sum (output). + * + * Return the approximate sum, minimum and maximum precise values for + * a counter. + */ +static inline +void percpu_counter_tree_approximate_min_max(struct percpu_counter_tree *c= ounter, + long *approx_sum, long *precise_min, long *precise_max) +{ + unsigned long under =3D 0, over =3D 0; + long v =3D percpu_counter_tree_approximate_sum(counter); + + percpu_counter_tree_approximate_accuracy_range(counter, &under, &over); + percpu_counter_tree_approximate_min_max_range(v, under, over, precise_min= , precise_max); + *approx_sum =3D v; +} + +#endif /* _PERCPU_COUNTER_TREE_H */ diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h index a91fa24b06e0..0b530428db71 100644 --- a/include/linux/vdso_datastore.h +++ b/include/linux/vdso_datastore.h @@ -2,9 +2,15 @@ #ifndef _LINUX_VDSO_DATASTORE_H #define _LINUX_VDSO_DATASTORE_H =20 +#ifdef CONFIG_HAVE_GENERIC_VDSO #include =20 extern const struct vm_special_mapping vdso_vvar_mapping; struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, uns= igned long addr); =20 +void __init vdso_setup_data_pages(void); +#else /* !CONFIG_HAVE_GENERIC_VDSO */ +static inline void vdso_setup_data_pages(void) { } +#endif /* CONFIG_HAVE_GENERIC_VDSO */ + #endif /* _LINUX_VDSO_DATASTORE_H */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index cd7920c81f85..290ccb9fd25d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -448,7 +448,7 @@ TRACE_EVENT(rss_stat, */ __entry->curr =3D current->mm =3D=3D mm && !(current->flags & PF_KTHREAD= ); __entry->member =3D member; - __entry->size =3D (percpu_counter_sum_positive(&mm->rss_stat[member]) + __entry->size =3D (percpu_counter_tree_approximate_sum_positive(&mm->rss= _stat[member]) << PAGE_SHIFT); ), =20 diff --git a/init/main.c b/init/main.c index 1cb395dd94e4..453ac9dff2da 100644 --- a/init/main.c +++ b/init/main.c @@ -105,6 +105,8 @@ #include #include #include +#include +#include #include =20 #include @@ -1067,6 +1069,7 @@ void start_kernel(void) vfs_caches_init_early(); sort_main_extable(); trap_init(); + percpu_counter_tree_subsystem_init(); mm_core_init(); maple_tree_init(); poking_init(); @@ -1119,6 +1122,7 @@ void start_kernel(void) srcu_init(); hrtimers_init(); softirq_init(); + vdso_setup_data_pages(); timekeeping_init(); time_init(); =20 diff --git a/kernel/fork.c b/kernel/fork.c index bc2bf58b93b6..0de4c8727055 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -134,6 +134,11 @@ */ #define MAX_THREADS FUTEX_TID_MASK =20 +/* + * Batch size of rss stat approximation + */ +#define RSS_STAT_BATCH_SIZE 32 + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -627,14 +632,12 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); =20 for (i =3D 0; i < NR_MM_COUNTERS; i++) { - long x =3D percpu_counter_sum(&mm->rss_stat[i]); - - if (unlikely(x)) { + if (unlikely(percpu_counter_tree_precise_compare_value(&mm->rss_stat[i],= 0) !=3D 0)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:= %d\n", - mm, resident_page_types[i], x, + mm, resident_page_types[i], + percpu_counter_tree_precise_sum(&mm->rss_stat[i]), current->comm, task_pid_nr(current)); - } } =20 if (mm_pgtables_bytes(mm)) @@ -732,7 +735,7 @@ void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); - percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); + percpu_counter_tree_destroy_many(mm->rss_stat, NR_MM_COUNTERS); =20 free_mm(mm); } @@ -1125,8 +1128,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm= , struct task_struct *p, if (mm_alloc_cid(mm, p)) goto fail_cid; =20 - if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, - NR_MM_COUNTERS)) + if (percpu_counter_tree_init_many(mm->rss_stat, get_rss_stat_items(mm), + NR_MM_COUNTERS, RSS_STAT_BATCH_SIZE, + GFP_KERNEL_ACCOUNT)) goto fail_pcpu; =20 mm->user_ns =3D get_user_ns(user_ns); @@ -3008,7 +3012,7 @@ void __init mm_cache_init(void) * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ - mm_size =3D sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); + mm_size =3D sizeof(struct mm_struct) + cpumask_size() + mm_cid_size() + g= et_rss_stat_items_size(); =20 mm_cachep =3D kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_h= andover.c index cc68a3692905..ce2786faf044 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1333,6 +1333,23 @@ int kho_retrieve_subtree(const char *name, phys_addr= _t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); =20 +bool pfn_is_kho_scratch(unsigned long pfn) +{ + unsigned int i; + phys_addr_t scratch_start, scratch_end, phys =3D __pfn_to_phys(pfn); + + for (i =3D 0; i < kho_scratch_cnt; i++) { + scratch_start =3D kho_scratch[i].addr; + scratch_end =3D kho_scratch[i].addr + kho_scratch[i].size; + + if (scratch_start <=3D phys && phys < scratch_end) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(pfn_is_kho_scratch); + static __init int kho_out_fdt_setup(void) { void *root =3D kho_out.fdt; @@ -1421,12 +1438,27 @@ static __init int kho_init(void) } fs_initcall(kho_init); =20 +static void __init kho_init_scratch_pages(void) +{ + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return; + + for (int i =3D 0; i < kho_scratch_cnt; i++) { + unsigned long pfn =3D PFN_DOWN(kho_scratch[i].addr); + unsigned long end_pfn =3D PFN_UP(kho_scratch[i].addr + kho_scratch[i].si= ze); + int nid =3D early_pfn_to_nid(pfn); + + for (; pfn < end_pfn; pfn++) + init_deferred_page(pfn, nid); + } +} + static void __init kho_release_scratch(void) { phys_addr_t start, end; u64 i; =20 - memmap_init_kho_scratch_pages(); + kho_init_scratch_pages(); =20 /* * Mark scratch mem as CMA before we return it. That way we @@ -1453,6 +1485,7 @@ void __init kho_memory_init(void) kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); } else { kho_reserve_scratch(); + kho_init_scratch_pages(); } } =20 diff --git a/lib/Kconfig b/lib/Kconfig index 0f2fb9610647..0b8241e5b548 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -52,6 +52,18 @@ config PACKING_KUNIT_TEST =20 When in doubt, say N. =20 +config PERCPU_COUNTER_TREE_TEST + tristate "Hierarchical Per-CPU counter test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds Kunit tests for the hierarchical per-cpu counters. + + For more information on KUnit and unit tests in general, + please refer to the KUnit documentation in Documentation/dev-tools/kuni= t/. + + When in doubt, say N. + config BITREVERSE tristate =20 diff --git a/lib/Makefile b/lib/Makefile index 1b9ee167517f..abc32420b581 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -181,6 +181,7 @@ obj-$(CONFIG_TEXTSEARCH_KMP) +=3D ts_kmp.o obj-$(CONFIG_TEXTSEARCH_BM) +=3D ts_bm.o obj-$(CONFIG_TEXTSEARCH_FSM) +=3D ts_fsm.o obj-$(CONFIG_SMP) +=3D percpu_counter.o +obj-$(CONFIG_SMP) +=3D percpu_counter_tree.o obj-$(CONFIG_AUDIT_GENERIC) +=3D audit.o obj-$(CONFIG_AUDIT_COMPAT_GENERIC) +=3D compat_audit.o =20 diff --git a/lib/percpu_counter_tree.c b/lib/percpu_counter_tree.c new file mode 100644 index 000000000000..beb1144e6450 --- /dev/null +++ b/lib/percpu_counter_tree.c @@ -0,0 +1,702 @@ +// SPDX-License-Identifier: GPL-2.0+ OR MIT +// SPDX-FileCopyrightText: 2025 Mathieu Desnoyers + +/* + * Split Counters With Tree Approximation Propagation + * + * * Propagation diagram when reaching batch size thresholds (=B1 batch si= ze): + * + * Example diagram for 8 CPUs: + * + * log2(8) =3D 3 levels + * + * At each level, each pair propagates its values to the next level when + * reaching the batch size thresholds. + * + * Counters at levels 0, 1, 2 can be kept on a single byte ([-128 .. +127]= range), + * although it may be relevant to keep them on "long" counters for + * simplicity. (complexity vs memory footprint tradeoff) + * + * Counter at level 3 can be kept on a "long" counter. + * + * Level 0: 0 1 2 3 4 5 6 7 + * | / | / | / | / + * | / | / | / | / + * | / | / | / | / + * Level 1: 0 1 2 3 + * | / | / + * | / | / + * | / | / + * Level 2: 0 1 + * | / + * | / + * | / + * Level 3: 0 + * + * * Approximation accuracy: + * + * BATCH(level N): Level N batch size. + * + * Example for BATCH(level 0) =3D 32. + * + * BATCH(level 0) =3D 32 + * BATCH(level 1) =3D 64 + * BATCH(level 2) =3D 128 + * BATCH(level N) =3D BATCH(level 0) * 2^N + * + * per-counter global + * accuracy accuracy + * Level 0: [ -32 .. +31] =B1256 (8 * 32) + * Level 1: [ -64 .. +63] =B1256 (4 * 64) + * Level 2: [-128 .. +127] =B1256 (2 * 128) + * Total: ------ =B1768 (log2(nr_cpu_ids) * BATCH(level 0) *= nr_cpu_ids) + * + * Note that the global accuracy can be calculated more precisely + * by taking into account that the positive accuracy range is + * 31 rather than 32. + * + * ----- + * + * Approximate Sum Carry Propagation + * + * Let's define a number of counter bits for each level, e.g.: + * + * log2(BATCH(level 0)) =3D log2(32) =3D 5 + * Let's assume, for this example, a 32-bit architecture (sizeof(long) =3D= =3D 4). + * + * nr_bit value_mask range + * Level 0: 5 bits v 0 .. +31 + * Level 1: 1 bit (v & ~((1UL << 5) - 1)) 0 .. +63 + * Level 2: 1 bit (v & ~((1UL << 6) - 1)) 0 .. +127 + * Level 3: 25 bits (v & ~((1UL << 7) - 1)) 0 .. 2^32-1 + * + * Note: Use a "long" per-cpu counter at level 0 to allow precise sum. + * + * Note: Use cacheline aligned counters at levels above 0 to prevent false= sharing. + * If memory footprint is an issue, a specialized allocator could be= used + * to eliminate padding. + * + * Example with expanded values: + * + * counter_add(counter, inc): + * + * if (!inc) + * return; + * + * res =3D percpu_add_return(counter @ Level 0, inc); + * orig =3D res - inc; + * if (inc < 0) { + * inc =3D -(-inc & ~0b00011111); // Clear used bits + * // xor bit 5: underflow + * if ((inc ^ orig ^ res) & 0b00100000) + * inc -=3D 0b00100000; + * } else { + * inc &=3D ~0b00011111; // Clear used bits + * // xor bit 5: overflow + * if ((inc ^ orig ^ res) & 0b00100000) + * inc +=3D 0b00100000; + * } + * if (!inc) + * return; + * + * res =3D atomic_long_add_return(counter @ Level 1, inc); + * orig =3D res - inc; + * if (inc < 0) { + * inc =3D -(-inc & ~0b00111111); // Clear used bits + * // xor bit 6: underflow + * if ((inc ^ orig ^ res) & 0b01000000) + * inc -=3D 0b01000000; + * } else { + * inc &=3D ~0b00111111; // Clear used bits + * // xor bit 6: overflow + * if ((inc ^ orig ^ res) & 0b01000000) + * inc +=3D 0b01000000; + * } + * if (!inc) + * return; + * + * res =3D atomic_long_add_return(counter @ Level 2, inc); + * orig =3D res - inc; + * if (inc < 0) { + * inc =3D -(-inc & ~0b01111111); // Clear used bits + * // xor bit 7: underflow + * if ((inc ^ orig ^ res) & 0b10000000) + * inc -=3D 0b10000000; + * } else { + * inc &=3D ~0b01111111; // Clear used bits + * // xor bit 7: overflow + * if ((inc ^ orig ^ res) & 0b10000000) + * inc +=3D 0b10000000; + * } + * if (!inc) + * return; + * + * atomic_long_add(counter @ Level 3, inc); + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_NR_LEVELS 5 + +/* + * The counter configuration is selected at boot time based on the + * hardware topology. + */ +struct counter_config { + unsigned int nr_items; /* + * nr_items is the number of items in the tree for levels 1 + * up to and including the final level (approximate sum). + * It excludes the level 0 per-CPU counters. + */ + unsigned char nr_levels; /* + * nr_levels is the number of hierarchical counter tree levels. + * It excludes the final level (approximate sum). + */ + unsigned char n_arity_order[MAX_NR_LEVELS]; /* + * n-arity of tree nodes for each level from + * 0 to (nr_levels - 1). + */ +}; + +static const struct counter_config per_nr_cpu_order_config[] =3D { + [0] =3D { .nr_items =3D 0, .nr_levels =3D 0, .n_arity_order =3D { 0 } }, + [1] =3D { .nr_items =3D 1, .nr_levels =3D 1, .n_arity_order =3D { 1 } }, + [2] =3D { .nr_items =3D 3, .nr_levels =3D 2, .n_arity_order =3D { 1, 1 }= }, + [3] =3D { .nr_items =3D 7, .nr_levels =3D 3, .n_arity_order =3D { 1, 1, = 1 } }, + [4] =3D { .nr_items =3D 7, .nr_levels =3D 3, .n_arity_order =3D { 2, 1, = 1 } }, + [5] =3D { .nr_items =3D 11, .nr_levels =3D 3, .n_arity_order =3D { 2, 2,= 1 } }, + [6] =3D { .nr_items =3D 21, .nr_levels =3D 3, .n_arity_order =3D { 2, 2,= 2 } }, + [7] =3D { .nr_items =3D 21, .nr_levels =3D 3, .n_arity_order =3D { 3, 2,= 2 } }, + [8] =3D { .nr_items =3D 37, .nr_levels =3D 3, .n_arity_order =3D { 3, 3,= 2 } }, + [9] =3D { .nr_items =3D 73, .nr_levels =3D 3, .n_arity_order =3D { 3, 3,= 3 } }, + [10] =3D { .nr_items =3D 149, .nr_levels =3D 4, .n_arity_order =3D { 3, = 3, 2, 2 } }, + [11] =3D { .nr_items =3D 293, .nr_levels =3D 4, .n_arity_order =3D { 3, = 3, 3, 2 } }, + [12] =3D { .nr_items =3D 585, .nr_levels =3D 4, .n_arity_order =3D { 3, = 3, 3, 3 } }, + [13] =3D { .nr_items =3D 1173, .nr_levels =3D 5, .n_arity_order =3D { 3,= 3, 3, 2, 2 } }, + [14] =3D { .nr_items =3D 2341, .nr_levels =3D 5, .n_arity_order =3D { 3,= 3, 3, 3, 2 } }, + [15] =3D { .nr_items =3D 4681, .nr_levels =3D 5, .n_arity_order =3D { 3,= 3, 3, 3, 3 } }, + [16] =3D { .nr_items =3D 4681, .nr_levels =3D 5, .n_arity_order =3D { 4,= 3, 3, 3, 3 } }, + [17] =3D { .nr_items =3D 8777, .nr_levels =3D 5, .n_arity_order =3D { 4,= 4, 3, 3, 3 } }, + [18] =3D { .nr_items =3D 17481, .nr_levels =3D 5, .n_arity_order =3D { 4= , 4, 4, 3, 3 } }, + [19] =3D { .nr_items =3D 34953, .nr_levels =3D 5, .n_arity_order =3D { 4= , 4, 4, 4, 3 } }, + [20] =3D { .nr_items =3D 69905, .nr_levels =3D 5, .n_arity_order =3D { 4= , 4, 4, 4, 4 } }, +}; + +static const struct counter_config *counter_config; /* Hierarchical counte= r configuration for the hardware topology. */ +static unsigned int nr_cpus_order; /* Order of nr_cpu_ids. */ +static unsigned long accuracy_multiplier; /* Calculate accuracy for a giv= en batch size (multiplication factor). */ + +static +int __percpu_counter_tree_init(struct percpu_counter_tree *counter, + unsigned long batch_size, gfp_t gfp_flags, + unsigned long __percpu *level0, + struct percpu_counter_tree_level_item *items) +{ + /* Batch size must be greater than 1, and a power of 2. */ + if (WARN_ON(batch_size <=3D 1 || (batch_size & (batch_size - 1)))) + return -EINVAL; + counter->batch_size =3D batch_size; + counter->bias =3D 0; + counter->level0 =3D level0; + counter->items =3D items; + if (!nr_cpus_order) { + counter->approx_sum.i =3D per_cpu_ptr(counter->level0, 0); + counter->level0_bit_mask =3D 0; + } else { + counter->approx_sum.a =3D &counter->items[counter_config->nr_items - 1].= count; + counter->level0_bit_mask =3D 1UL << get_count_order(batch_size); + } + /* + * Each tree item signed integer has a negative range which is + * one unit greater than the positive range. + */ + counter->approx_accuracy_range.under =3D batch_size * accuracy_multiplier; + counter->approx_accuracy_range.over =3D (batch_size - 1) * accuracy_multi= plier; + return 0; +} + +/** + * percpu_counter_tree_init_many() - Initialize many per-CPU counter trees. + * @counters: An array of @nr_counters counters to initialize. + * Their memory is provided by the caller. + * @items: Pointer to memory area where to store tree items. + * This memory is provided by the caller. + * Its size needs to be at least @nr_counters * percpu_counter_tree_ite= ms_size(). + * @nr_counters: The number of counter trees to initialize + * @batch_size: The batch size is the increment step at level 0 which trig= gers a + * carry propagation. + * The batch size is required to be greater than 1, and a power of 2. + * @gfp_flags: gfp flags to pass to the per-CPU allocator. + * + * Initialize many per-CPU counter trees using a single per-CPU + * allocator invocation for @nr_counters counters. + * + * Return: + * * %0: Success + * * %-EINVAL: - Invalid @batch_size argument + * * %-ENOMEM: - Out of memory + */ +int percpu_counter_tree_init_many(struct percpu_counter_tree *counters, st= ruct percpu_counter_tree_level_item *items, + unsigned int nr_counters, unsigned long batch_size, gfp_t gfp_flags) +{ + void __percpu *level0, *level0_iter; + size_t counter_size =3D sizeof(*counters->level0), + items_size =3D percpu_counter_tree_items_size(); + void *items_iter; + unsigned int i; + int ret; + + memset(items, 0, items_size * nr_counters); + level0 =3D __alloc_percpu_gfp(nr_counters * counter_size, + __alignof__(*counters->level0), gfp_flags); + if (!level0) + return -ENOMEM; + level0_iter =3D level0; + items_iter =3D items; + for (i =3D 0; i < nr_counters; i++) { + ret =3D __percpu_counter_tree_init(&counters[i], batch_size, gfp_flags, = level0_iter, items_iter); + if (ret) + goto free_level0; + level0_iter +=3D counter_size; + items_iter +=3D items_size; + } + return 0; + +free_level0: + free_percpu(level0); + return ret; +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_init_many); + +/** + * percpu_counter_tree_init() - Initialize one per-CPU counter tree. + * @counter: Counter to initialize. + * Its memory is provided by the caller. + * @items: Pointer to memory area where to store tree items. + * This memory is provided by the caller. + * Its size needs to be at least percpu_counter_tree_items_size(). + * @batch_size: The batch size is the increment step at level 0 which trig= gers a + * carry propagation. + * The batch size is required to be greater than 1, and a power of 2. + * @gfp_flags: gfp flags to pass to the per-CPU allocator. + * + * Initialize one per-CPU counter tree. + * + * Return: + * * %0: Success + * * %-EINVAL: - Invalid @batch_size argument + * * %-ENOMEM: - Out of memory + */ +int percpu_counter_tree_init(struct percpu_counter_tree *counter, struct p= ercpu_counter_tree_level_item *items, + unsigned long batch_size, gfp_t gfp_flags) +{ + return percpu_counter_tree_init_many(counter, items, 1, batch_size, gfp_f= lags); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_init); + +/** + * percpu_counter_tree_destroy_many() - Destroy many per-CPU counter trees. + * @counters: Array of counters trees to destroy. + * @nr_counters: The number of counter trees to destroy. + * + * Release internal resources allocated for @nr_counters per-CPU counter t= rees. + */ + +void percpu_counter_tree_destroy_many(struct percpu_counter_tree *counters= , unsigned int nr_counters) +{ + free_percpu(counters->level0); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_destroy_many); + +/** + * percpu_counter_tree_destroy() - Destroy one per-CPU counter tree. + * @counter: Counter to destroy. + * + * Release internal resources allocated for one per-CPU counter tree. + */ +void percpu_counter_tree_destroy(struct percpu_counter_tree *counter) +{ + return percpu_counter_tree_destroy_many(counter, 1); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_destroy); + +static +long percpu_counter_tree_carry(long orig, long res, long inc, unsigned lon= g bit_mask) +{ + if (inc < 0) { + inc =3D -(-inc & ~(bit_mask - 1)); + /* + * xor bit_mask: underflow. + * + * If inc has bit set, decrement an additional bit if + * there is _no_ bit transition between orig and res. + * Else, inc has bit cleared, decrement an additional + * bit if there is a bit transition between orig and + * res. + */ + if ((inc ^ orig ^ res) & bit_mask) + inc -=3D bit_mask; + } else { + inc &=3D ~(bit_mask - 1); + /* + * xor bit_mask: overflow. + * + * If inc has bit set, increment an additional bit if + * there is _no_ bit transition between orig and res. + * Else, inc has bit cleared, increment an additional + * bit if there is a bit transition between orig and + * res. + */ + if ((inc ^ orig ^ res) & bit_mask) + inc +=3D bit_mask; + } + return inc; +} + +/* + * It does not matter through which path the carry propagates up the + * tree, therefore there is no need to disable preemption because the + * cpu number is only used to favor cache locality. + */ +static +void percpu_counter_tree_add_slowpath(struct percpu_counter_tree *counter,= long inc) +{ + unsigned int level_items, nr_levels =3D counter_config->nr_levels, + level, n_arity_order; + unsigned long bit_mask; + struct percpu_counter_tree_level_item *item =3D counter->items; + unsigned int cpu =3D raw_smp_processor_id(); + + WARN_ON_ONCE(!nr_cpus_order); /* Should never be called for 1 cpu. */ + + n_arity_order =3D counter_config->n_arity_order[0]; + bit_mask =3D counter->level0_bit_mask << n_arity_order; + level_items =3D 1U << (nr_cpus_order - n_arity_order); + + for (level =3D 1; level < nr_levels; level++) { + /* + * For the purpose of carry propagation, the + * intermediate level counters only need to keep track + * of the bits relevant for carry propagation. We + * therefore don't care about higher order bits. + * Note that this optimization is unwanted if the + * intended use is to track counters within intermediate + * levels of the topology. + */ + if (abs(inc) & (bit_mask - 1)) { + atomic_long_t *count =3D &item[cpu & (level_items - 1)].count; + unsigned long orig, res; + + res =3D atomic_long_add_return_relaxed(inc, count); + orig =3D res - inc; + inc =3D percpu_counter_tree_carry(orig, res, inc, bit_mask); + if (likely(!inc)) + return; + } + item +=3D level_items; + n_arity_order =3D counter_config->n_arity_order[level]; + level_items >>=3D n_arity_order; + bit_mask <<=3D n_arity_order; + } + atomic_long_add(inc, counter->approx_sum.a); +} + +/** + * percpu_counter_tree_add() - Add to a per-CPU counter tree. + * @counter: Counter added to. + * @inc: Increment value (either positive or negative). + * + * Add @inc to a per-CPU counter tree. This is a fast-path which will + * typically increment per-CPU counters as long as there is no carry + * greater or equal to the counter tree batch size. + */ +void percpu_counter_tree_add(struct percpu_counter_tree *counter, long inc) +{ + unsigned long bit_mask =3D counter->level0_bit_mask, orig, res; + + res =3D this_cpu_add_return(*counter->level0, inc); + orig =3D res - inc; + inc =3D percpu_counter_tree_carry(orig, res, inc, bit_mask); + if (likely(!inc)) + return; + percpu_counter_tree_add_slowpath(counter, inc); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_add); + +static +long percpu_counter_tree_precise_sum_unbiased(struct percpu_counter_tree *= counter) +{ + unsigned long sum =3D 0; + int cpu; + + for_each_possible_cpu(cpu) + sum +=3D *per_cpu_ptr(counter->level0, cpu); + return (long) sum; +} + +/** + * percpu_counter_tree_precise_sum() - Return precise counter sum. + * @counter: The counter to sum. + * + * Querying the precise sum is relatively expensive because it needs to + * iterate over all CPUs. + * This is meant to be used when accuracy is preferred over speed. + * + * Return: The current precise counter sum. + */ +long percpu_counter_tree_precise_sum(struct percpu_counter_tree *counter) +{ + return percpu_counter_tree_precise_sum_unbiased(counter) + READ_ONCE(coun= ter->bias); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_precise_sum); + +static +int compare_delta(long delta, unsigned long accuracy_pos, unsigned long ac= curacy_neg) +{ + if (delta >=3D 0) { + if (delta <=3D accuracy_pos) + return 0; + else + return 1; + } else { + if (-delta <=3D accuracy_neg) + return 0; + else + return -1; + } +} + +/** + * percpu_counter_tree_approximate_compare - Approximated comparison of tw= o counter trees. + * @a: First counter to compare. + * @b: Second counter to compare. + * + * Evaluate an approximate comparison of two counter trees. + * This approximation comparison is fast, and provides an accurate + * answer if the counters are found to be either less than or greater + * than the other. However, if the approximated comparison returns + * 0, the counters respective sums are found to be within the two + * counters accuracy range. + * + * Return: + * * %0 - Counters @a and @b do not differ by more than the sum of their = respective + * accuracy ranges. + * * %-1 - Counter @a less than counter @b. + * * %1 - Counter @a is greater than counter @b. + */ +int percpu_counter_tree_approximate_compare(struct percpu_counter_tree *a,= struct percpu_counter_tree *b) +{ + return compare_delta(percpu_counter_tree_approximate_sum(a) - percpu_coun= ter_tree_approximate_sum(b), + a->approx_accuracy_range.over + b->approx_accuracy_range.under, + a->approx_accuracy_range.under + b->approx_accuracy_range.over); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_approximate_compare); + +/** + * percpu_counter_tree_approximate_compare_value - Approximated comparison= of a counter tree against a given value. + * @counter: Counter to compare. + * @v: Value to compare. + * + * Evaluate an approximate comparison of a counter tree against a given va= lue. + * This approximation comparison is fast, and provides an accurate + * answer if the counter is found to be either less than or greater + * than the value. However, if the approximated comparison returns + * 0, the value is within the counter accuracy range. + * + * Return: + * * %0 - The value @v is within the accuracy range of the counter. + * * %-1 - The value @v is less than the counter. + * * %1 - The value @v is greater than the counter. + */ +int percpu_counter_tree_approximate_compare_value(struct percpu_counter_tr= ee *counter, long v) +{ + return compare_delta(v - percpu_counter_tree_approximate_sum(counter), + counter->approx_accuracy_range.under, + counter->approx_accuracy_range.over); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_approximate_compare_value); + +/** + * percpu_counter_tree_precise_compare - Precise comparison of two counter= trees. + * @a: First counter to compare. + * @b: Second counter to compare. + * + * Evaluate a precise comparison of two counter trees. + * As an optimization, it uses the approximate counter comparison + * to quickly compare counters which are far apart. Only cases where + * counter sums are within the accuracy range require precise counter + * sums. + * + * Return: + * * %0 - Counters are equal. + * * %-1 - Counter @a less than counter @b. + * * %1 - Counter @a is greater than counter @b. + */ +int percpu_counter_tree_precise_compare(struct percpu_counter_tree *a, str= uct percpu_counter_tree *b) +{ + long count_a =3D percpu_counter_tree_approximate_sum(a), + count_b =3D percpu_counter_tree_approximate_sum(b); + unsigned long accuracy_a, accuracy_b; + long delta =3D count_a - count_b; + int res; + + res =3D compare_delta(delta, + a->approx_accuracy_range.over + b->approx_accuracy_range.under, + a->approx_accuracy_range.under + b->approx_accuracy_range.over); + /* The values are distanced enough for an accurate approximated compariso= n. */ + if (res) + return res; + + /* + * The approximated comparison is within the accuracy range, therefore at= least one + * precise sum is needed. Sum the counter which has the largest accuracy = first. + */ + if (delta >=3D 0) { + accuracy_a =3D a->approx_accuracy_range.under; + accuracy_b =3D b->approx_accuracy_range.over; + } else { + accuracy_a =3D a->approx_accuracy_range.over; + accuracy_b =3D b->approx_accuracy_range.under; + } + if (accuracy_b < accuracy_a) { + count_a =3D percpu_counter_tree_precise_sum(a); + res =3D compare_delta(count_a - count_b, + b->approx_accuracy_range.under, + b->approx_accuracy_range.over); + if (res) + return res; + /* Precise sum of second counter is required. */ + count_b =3D percpu_counter_tree_precise_sum(b); + } else { + count_b =3D percpu_counter_tree_precise_sum(b); + res =3D compare_delta(count_a - count_b, + a->approx_accuracy_range.over, + a->approx_accuracy_range.under); + if (res) + return res; + /* Precise sum of second counter is required. */ + count_a =3D percpu_counter_tree_precise_sum(a); + } + if (count_a - count_b < 0) + return -1; + if (count_a - count_b > 0) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_precise_compare); + +/** + * percpu_counter_tree_precise_compare_value - Precise comparison of a cou= nter tree against a given value. + * @counter: Counter to compare. + * @v: Value to compare. + * + * Evaluate a precise comparison of a counter tree against a given value. + * As an optimization, it uses the approximate counter comparison + * to quickly identify whether the counter and value are far apart. + * Only cases where the value is within the counter accuracy range + * require a precise counter sum. + * + * Return: + * * %0 - The value @v is equal to the counter. + * * %-1 - The value @v is less than the counter. + * * %1 - The value @v is greater than the counter. + */ +int percpu_counter_tree_precise_compare_value(struct percpu_counter_tree *= counter, long v) +{ + long count =3D percpu_counter_tree_approximate_sum(counter); + int res; + + res =3D compare_delta(v - count, + counter->approx_accuracy_range.under, + counter->approx_accuracy_range.over); + /* The values are distanced enough for an accurate approximated compariso= n. */ + if (res) + return res; + + /* Precise sum is required. */ + count =3D percpu_counter_tree_precise_sum(counter); + if (v - count < 0) + return -1; + if (v - count > 0) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_precise_compare_value); + +static +void percpu_counter_tree_set_bias(struct percpu_counter_tree *counter, lon= g bias) +{ + WRITE_ONCE(counter->bias, bias); +} + +/** + * percpu_counter_tree_set - Set the counter tree sum to a given value. + * @counter: Counter to set. + * @v: Value to set. + * + * Set the counter sum to a given value. It can be useful for instance + * to reset the counter sum to 0. Note that even after setting the + * counter sum to a given value, the counter sum approximation can + * return any value within the accuracy range around that value. + */ +void percpu_counter_tree_set(struct percpu_counter_tree *counter, long v) +{ + percpu_counter_tree_set_bias(counter, + v - percpu_counter_tree_precise_sum_unbiased(counter)); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_set); + +/* + * percpu_counter_tree_items_size - Query the size required for counter tr= ee items. + * + * Query the size of the memory area required to hold the counter tree + * items. This depends on the hardware topology and is invariant after + * boot. + * + * Return: Size required to hold tree items. + */ +size_t percpu_counter_tree_items_size(void) +{ + if (!nr_cpus_order) + return 0; + return counter_config->nr_items * sizeof(struct percpu_counter_tree_level= _item); +} +EXPORT_SYMBOL_GPL(percpu_counter_tree_items_size); + +static void __init calculate_accuracy_topology(void) +{ + unsigned int nr_levels =3D counter_config->nr_levels, level; + unsigned int level_items =3D 1U << nr_cpus_order; + unsigned long batch_size =3D 1; + + for (level =3D 0; level < nr_levels; level++) { + unsigned int n_arity_order =3D counter_config->n_arity_order[level]; + + /* + * The accuracy multiplier is derived from a batch size of 1 + * to speed up calculating the accuracy at tree initialization. + */ + accuracy_multiplier +=3D batch_size * level_items; + batch_size <<=3D n_arity_order; + level_items >>=3D n_arity_order; + } +} + +int __init percpu_counter_tree_subsystem_init(void) +{ + nr_cpus_order =3D get_count_order(nr_cpu_ids); + if (WARN_ON_ONCE(nr_cpus_order >=3D ARRAY_SIZE(per_nr_cpu_order_config)))= { + printk(KERN_ERR "Unsupported number of CPUs (%u)\n", nr_cpu_ids); + return -1; + } + counter_config =3D &per_nr_cpu_order_config[nr_cpus_order]; + calculate_accuracy_topology(); + return 0; +} diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 05f74edbc62b..d282aa23d273 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -56,4 +56,6 @@ obj-$(CONFIG_UTIL_MACROS_KUNIT) +=3D util_macros_kunit.o obj-$(CONFIG_RATELIMIT_KUNIT_TEST) +=3D test_ratelimit.o obj-$(CONFIG_UUID_KUNIT_TEST) +=3D uuid_kunit.o =20 +obj-$(CONFIG_PERCPU_COUNTER_TREE_TEST) +=3D percpu_counter_tree_kunit.o + obj-$(CONFIG_TEST_RUNTIME_MODULE) +=3D module/ diff --git a/lib/tests/percpu_counter_tree_kunit.c b/lib/tests/percpu_count= er_tree_kunit.c new file mode 100644 index 000000000000..a79176655c4b --- /dev/null +++ b/lib/tests/percpu_counter_tree_kunit.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0+ OR MIT +// SPDX-FileCopyrightText: 2026 Mathieu Desnoyers + +#include +#include +#include +#include +#include + +struct multi_thread_test_data { + long increment; + int nr_inc; + int counter_index; +}; + +#define NR_COUNTERS 2 + +/* Hierarchical per-CPU counter instances. */ +static struct percpu_counter_tree counter[NR_COUNTERS]; +static struct percpu_counter_tree_level_item *items; + +/* Global atomic counters for validation. */ +static atomic_long_t global_counter[NR_COUNTERS]; + +static DECLARE_WAIT_QUEUE_HEAD(kernel_threads_wq); +static atomic_t kernel_threads_to_run; + +static void complete_work(void) +{ + if (atomic_dec_and_test(&kernel_threads_to_run)) + wake_up(&kernel_threads_wq); +} + +static void hpcc_print_info(struct kunit *test) +{ + kunit_info(test, "Running test with %d CPUs\n", num_online_cpus()); +} + +static void add_to_counter(int counter_index, unsigned int nr_inc, long in= crement) +{ + unsigned int i; + + for (i =3D 0; i < nr_inc; i++) { + percpu_counter_tree_add(&counter[counter_index], increment); + atomic_long_add(increment, &global_counter[counter_index]); + } +} + +static void check_counters(struct kunit *test) +{ + int counter_index; + + /* Compare each counter with its global counter. */ + for (counter_index =3D 0; counter_index < NR_COUNTERS; counter_index++) { + long v =3D atomic_long_read(&global_counter[counter_index]); + long approx_sum =3D percpu_counter_tree_approximate_sum(&counter[counter= _index]); + unsigned long under_accuracy =3D 0, over_accuracy =3D 0; + long precise_min, precise_max; + + /* Precise comparison. */ + KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&counter[counter_i= ndex]), v); + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_precise_compare_value(&coun= ter[counter_index], v)); + + /* Approximate comparison. */ + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&= counter[counter_index], v)); + + /* Accuracy limits checks. */ + percpu_counter_tree_approximate_accuracy_range(&counter[counter_index], = &under_accuracy, &over_accuracy); + + KUNIT_EXPECT_GE(test, (long)(approx_sum - (v - under_accuracy)), 0); + KUNIT_EXPECT_LE(test, (long)(approx_sum - (v + over_accuracy)), 0); + KUNIT_EXPECT_GT(test, (long)(approx_sum - (v - under_accuracy - 1)), 0); + KUNIT_EXPECT_LT(test, (long)(approx_sum - (v + over_accuracy + 1)), 0); + + /* Precise min/max range check. */ + percpu_counter_tree_approximate_min_max_range(approx_sum, under_accuracy= , over_accuracy, &precise_min, &precise_max); + + KUNIT_EXPECT_GE(test, v - precise_min, 0); + KUNIT_EXPECT_LE(test, v - precise_max, 0); + KUNIT_EXPECT_GT(test, v - (precise_min - 1), 0); + KUNIT_EXPECT_LT(test, v - (precise_max + 1), 0); + } + /* Compare each counter with the second counter. */ + KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&counter[0]), percp= u_counter_tree_precise_sum(&counter[1])); + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_precise_compare(&counter[0],= &counter[1])); + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare(&counter= [0], &counter[1])); +} + +static int multi_thread_worker_fn(void *data) +{ + struct multi_thread_test_data *td =3D data; + + add_to_counter(td->counter_index, td->nr_inc, td->increment); + complete_work(); + kfree(td); + return 0; +} + +static void test_run_on_specific_cpu(struct kunit *test, int target_cpu, i= nt counter_index, unsigned int nr_inc, long increment) +{ + struct task_struct *task; + struct multi_thread_test_data *td =3D kzalloc(sizeof(struct multi_thread_= test_data), GFP_KERNEL); + + KUNIT_EXPECT_PTR_NE(test, td, NULL); + td->increment =3D increment; + td->nr_inc =3D nr_inc; + td->counter_index =3D counter_index; + atomic_inc(&kernel_threads_to_run); + task =3D kthread_run_on_cpu(multi_thread_worker_fn, td, target_cpu, "kuni= t_multi_thread_worker"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, task); +} + +static void init_kthreads(void) +{ + atomic_set(&kernel_threads_to_run, 1); +} + +static void fini_kthreads(void) +{ + /* Release our own reference. */ + complete_work(); + /* Wait for all others threads to run. */ + wait_event(kernel_threads_wq, (atomic_read(&kernel_threads_to_run) =3D=3D= 0)); +} + +static void test_sync_kthreads(void) +{ + fini_kthreads(); + init_kthreads(); +} + +static void init_counters(struct kunit *test, unsigned long batch_size) +{ + int i, ret; + + items =3D kzalloc(percpu_counter_tree_items_size() * NR_COUNTERS, GFP_KER= NEL); + KUNIT_EXPECT_PTR_NE(test, items, NULL); + ret =3D percpu_counter_tree_init_many(counter, items, NR_COUNTERS, batch_= size, GFP_KERNEL); + KUNIT_EXPECT_EQ(test, ret, 0); + + for (i =3D 0; i < NR_COUNTERS; i++) + atomic_long_set(&global_counter[i], 0); +} + +static void fini_counters(void) +{ + percpu_counter_tree_destroy_many(counter, NR_COUNTERS); + kfree(items); +} + +enum up_test_inc_type { + INC_ONE, + INC_MINUS_ONE, + INC_RANDOM, +}; + +/* + * Single-threaded tests. Those use many threads to run on various CPUs, + * but synchronize for completion of each thread before running the + * next, effectively making sure there are no concurrent updates. + */ +static void do_hpcc_test_single_thread(struct kunit *test, int _cpu0, int = _cpu1, enum up_test_inc_type type) +{ + unsigned long batch_size_order =3D 5; + int cpu0 =3D _cpu0; + int cpu1 =3D _cpu1; + int i; + + init_counters(test, 1UL << batch_size_order); + init_kthreads(); + for (i =3D 0; i < 10000; i++) { + long increment; + + switch (type) { + case INC_ONE: + increment =3D 1; + break; + case INC_MINUS_ONE: + increment =3D -1; + break; + case INC_RANDOM: + increment =3D (long) get_random_long() % 50000; + break; + } + if (_cpu0 < 0) + cpu0 =3D cpumask_any_distribute(cpu_online_mask); + if (_cpu1 < 0) + cpu1 =3D cpumask_any_distribute(cpu_online_mask); + test_run_on_specific_cpu(test, cpu0, 0, 1, increment); + test_sync_kthreads(); + test_run_on_specific_cpu(test, cpu1, 1, 1, increment); + test_sync_kthreads(); + check_counters(test); + } + fini_kthreads(); + fini_counters(); +} + +static void hpcc_test_single_thread_first(struct kunit *test) +{ + int cpu =3D cpumask_first(cpu_online_mask); + + do_hpcc_test_single_thread(test, cpu, cpu, INC_ONE); + do_hpcc_test_single_thread(test, cpu, cpu, INC_MINUS_ONE); + do_hpcc_test_single_thread(test, cpu, cpu, INC_RANDOM); +} + +static void hpcc_test_single_thread_first_random(struct kunit *test) +{ + int cpu =3D cpumask_first(cpu_online_mask); + + do_hpcc_test_single_thread(test, cpu, -1, INC_ONE); + do_hpcc_test_single_thread(test, cpu, -1, INC_MINUS_ONE); + do_hpcc_test_single_thread(test, cpu, -1, INC_RANDOM); +} + +static void hpcc_test_single_thread_random(struct kunit *test) +{ + do_hpcc_test_single_thread(test, -1, -1, INC_ONE); + do_hpcc_test_single_thread(test, -1, -1, INC_MINUS_ONE); + do_hpcc_test_single_thread(test, -1, -1, INC_RANDOM); +} + +/* Multi-threaded SMP tests. */ + +static void do_hpcc_multi_thread_increment_each_cpu(struct kunit *test, un= signed long batch_size, unsigned int nr_inc, long increment) +{ + int cpu; + + init_counters(test, batch_size); + init_kthreads(); + for_each_online_cpu(cpu) { + test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment); + test_run_on_specific_cpu(test, cpu, 1, nr_inc, increment); + } + fini_kthreads(); + check_counters(test); + fini_counters(); +} + +static void do_hpcc_multi_thread_increment_even_cpus(struct kunit *test, u= nsigned long batch_size, unsigned int nr_inc, long increment) +{ + int cpu; + + init_counters(test, batch_size); + init_kthreads(); + for_each_online_cpu(cpu) { + test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment); + test_run_on_specific_cpu(test, cpu & ~1, 1, nr_inc, increment); /* even = cpus. */ + } + fini_kthreads(); + check_counters(test); + fini_counters(); +} + +static void do_hpcc_multi_thread_increment_single_cpu(struct kunit *test, = unsigned long batch_size, unsigned int nr_inc, long increment) +{ + int cpu; + + init_counters(test, batch_size); + init_kthreads(); + for_each_online_cpu(cpu) { + test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment); + test_run_on_specific_cpu(test, cpumask_first(cpu_online_mask), 1, nr_inc= , increment); + } + fini_kthreads(); + check_counters(test); + fini_counters(); +} + +static void do_hpcc_multi_thread_increment_random_cpu(struct kunit *test, = unsigned long batch_size, unsigned int nr_inc, long increment) +{ + int cpu; + + init_counters(test, batch_size); + init_kthreads(); + for_each_online_cpu(cpu) { + test_run_on_specific_cpu(test, cpu, 0, nr_inc, increment); + test_run_on_specific_cpu(test, cpumask_any_distribute(cpu_online_mask), = 1, nr_inc, increment); + } + fini_kthreads(); + check_counters(test); + fini_counters(); +} + +static void hpcc_test_multi_thread_batch_increment(struct kunit *test) +{ + unsigned long batch_size_order; + + for (batch_size_order =3D 2; batch_size_order < 10; batch_size_order++) { + unsigned int nr_inc; + + for (nr_inc =3D 1; nr_inc < 1024; nr_inc *=3D 2) { + long increment; + + for (increment =3D 1; increment < 100000; increment *=3D 10) { + do_hpcc_multi_thread_increment_each_cpu(test, 1UL << batch_size_order,= nr_inc, increment); + do_hpcc_multi_thread_increment_even_cpus(test, 1UL << batch_size_order= , nr_inc, increment); + do_hpcc_multi_thread_increment_single_cpu(test, 1UL << batch_size_orde= r, nr_inc, increment); + do_hpcc_multi_thread_increment_random_cpu(test, 1UL << batch_size_orde= r, nr_inc, increment); + } + } + } +} + +static void hpcc_test_multi_thread_random_walk(struct kunit *test) +{ + unsigned long batch_size_order =3D 5; + int loop; + + for (loop =3D 0; loop < 100; loop++) { + int i; + + init_counters(test, 1UL << batch_size_order); + init_kthreads(); + for (i =3D 0; i < 1000; i++) { + long increment =3D (long) get_random_long() % 512; + unsigned int nr_inc =3D ((unsigned long) get_random_long()) % 1024; + + test_run_on_specific_cpu(test, cpumask_any_distribute(cpu_online_mask),= 0, nr_inc, increment); + test_run_on_specific_cpu(test, cpumask_any_distribute(cpu_online_mask),= 1, nr_inc, increment); + } + fini_kthreads(); + check_counters(test); + fini_counters(); + } +} + +static void hpcc_test_init_one(struct kunit *test) +{ + struct percpu_counter_tree pct; + struct percpu_counter_tree_level_item *counter_items; + int ret; + + counter_items =3D kzalloc(percpu_counter_tree_items_size(), GFP_KERNEL); + KUNIT_EXPECT_PTR_NE(test, counter_items, NULL); + ret =3D percpu_counter_tree_init(&pct, counter_items, 32, GFP_KERNEL); + KUNIT_EXPECT_EQ(test, ret, 0); + + percpu_counter_tree_destroy(&pct); + kfree(counter_items); +} + +static void hpcc_test_set(struct kunit *test) +{ + static long values[] =3D { + 5, 100, 127, 128, 255, 256, 4095, 4096, 500000, 0, + -5, -100, -127, -128, -255, -256, -4095, -4096, -500000, + }; + struct percpu_counter_tree pct; + struct percpu_counter_tree_level_item *counter_items; + int i, ret; + + counter_items =3D kzalloc(percpu_counter_tree_items_size(), GFP_KERNEL); + KUNIT_EXPECT_PTR_NE(test, counter_items, NULL); + ret =3D percpu_counter_tree_init(&pct, counter_items, 32, GFP_KERNEL); + KUNIT_EXPECT_EQ(test, ret, 0); + + for (i =3D 0; i < ARRAY_SIZE(values); i++) { + long v =3D values[i]; + + percpu_counter_tree_set(&pct, v); + KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&pct), v); + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&= pct, v)); + + percpu_counter_tree_add(&pct, v); + KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&pct), 2 * v); + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&= pct, 2 * v)); + + percpu_counter_tree_add(&pct, -2 * v); + KUNIT_EXPECT_EQ(test, percpu_counter_tree_precise_sum(&pct), 0); + KUNIT_EXPECT_EQ(test, 0, percpu_counter_tree_approximate_compare_value(&= pct, 0)); + } + + percpu_counter_tree_destroy(&pct); + kfree(counter_items); +} + +static struct kunit_case hpcc_test_cases[] =3D { + KUNIT_CASE(hpcc_print_info), + KUNIT_CASE(hpcc_test_single_thread_first), + KUNIT_CASE(hpcc_test_single_thread_first_random), + KUNIT_CASE(hpcc_test_single_thread_random), + KUNIT_CASE(hpcc_test_multi_thread_batch_increment), + KUNIT_CASE(hpcc_test_multi_thread_random_walk), + KUNIT_CASE(hpcc_test_init_one), + KUNIT_CASE(hpcc_test_set), + {} +}; + +static struct kunit_suite hpcc_test_suite =3D { + .name =3D "percpu_counter_tree", + .test_cases =3D hpcc_test_cases, +}; + +kunit_test_suite(hpcc_test_suite); + +MODULE_DESCRIPTION("Test cases for hierarchical per-CPU counters"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index a565c30c71a0..faebf5b7cd6e 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -1,64 +1,92 @@ // SPDX-License-Identifier: GPL-2.0-only =20 -#include -#include +#include +#include #include #include #include #include #include =20 -/* - * The vDSO data page. - */ +static u8 vdso_initdata[VDSO_NR_PAGES * PAGE_SIZE] __aligned(PAGE_SIZE) __= initdata =3D {}; + #ifdef CONFIG_GENERIC_GETTIMEOFDAY -static union { - struct vdso_time_data data; - u8 page[PAGE_SIZE]; -} vdso_time_data_store __page_aligned_data; -struct vdso_time_data *vdso_k_time_data =3D &vdso_time_data_store.data; -static_assert(sizeof(vdso_time_data_store) =3D=3D PAGE_SIZE); +struct vdso_time_data *vdso_k_time_data __refdata =3D + (void *)&vdso_initdata[VDSO_TIME_PAGE_OFFSET * PAGE_SIZE]; + +static_assert(sizeof(struct vdso_time_data) <=3D PAGE_SIZE); #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ =20 #ifdef CONFIG_VDSO_GETRANDOM -static union { - struct vdso_rng_data data; - u8 page[PAGE_SIZE]; -} vdso_rng_data_store __page_aligned_data; -struct vdso_rng_data *vdso_k_rng_data =3D &vdso_rng_data_store.data; -static_assert(sizeof(vdso_rng_data_store) =3D=3D PAGE_SIZE); +struct vdso_rng_data *vdso_k_rng_data __refdata =3D + (void *)&vdso_initdata[VDSO_RNG_PAGE_OFFSET * PAGE_SIZE]; + +static_assert(sizeof(struct vdso_rng_data) <=3D PAGE_SIZE); #endif /* CONFIG_VDSO_GETRANDOM */ =20 #ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA -static union { - struct vdso_arch_data data; - u8 page[VDSO_ARCH_DATA_SIZE]; -} vdso_arch_data_store __page_aligned_data; -struct vdso_arch_data *vdso_k_arch_data =3D &vdso_arch_data_store.data; +struct vdso_arch_data *vdso_k_arch_data __refdata =3D + (void *)&vdso_initdata[VDSO_ARCH_PAGES_START * PAGE_SIZE]; #endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */ =20 +void __init vdso_setup_data_pages(void) +{ + unsigned int order =3D get_order(VDSO_NR_PAGES * PAGE_SIZE); + struct page *pages; + + /* + * Allocate the data pages dynamically. SPARC does not support mapping + * static pages to be mapped into userspace. + * It is also a requirement for mlockall() support. + * + * Do not use folios. In time namespaces the pages are mapped in a differ= ent order + * to userspace, which is not handled by the folio optimizations in finis= h_fault(). + */ + pages =3D alloc_pages(GFP_KERNEL, order); + if (!pages) + panic("Unable to allocate VDSO storage pages"); + + /* The pages are mapped one-by-one into userspace and each one needs to b= e refcounted. */ + split_page(pages, order); + + /* Move the data already written by other subsystems to the new pages */ + memcpy(page_address(pages), vdso_initdata, VDSO_NR_PAGES * PAGE_SIZE); + + if (IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) + vdso_k_time_data =3D page_address(pages + VDSO_TIME_PAGE_OFFSET); + + if (IS_ENABLED(CONFIG_VDSO_GETRANDOM)) + vdso_k_rng_data =3D page_address(pages + VDSO_RNG_PAGE_OFFSET); + + if (IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) + vdso_k_arch_data =3D page_address(pages + VDSO_ARCH_PAGES_START); +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct page *timens_page =3D find_timens_vvar_page(vma); - unsigned long addr, pfn; - vm_fault_t err; + struct page *page, *timens_page; + + timens_page =3D find_timens_vvar_page(vma); =20 switch (vmf->pgoff) { case VDSO_TIME_PAGE_OFFSET: if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) return VM_FAULT_SIGBUS; - pfn =3D __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + page =3D virt_to_page(vdso_k_time_data); if (timens_page) { /* * Fault in VVAR page too, since it will be accessed * to get clock data anyway. */ + unsigned long addr; + vm_fault_t err; + addr =3D vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err =3D vmf_insert_pfn(vma, addr, pfn); + err =3D vmf_insert_page(vma, addr, page); if (unlikely(err & VM_FAULT_ERROR)) return err; - pfn =3D page_to_pfn(timens_page); + page =3D timens_page; } break; case VDSO_TIMENS_PAGE_OFFSET: @@ -71,24 +99,25 @@ static vm_fault_t vvar_fault(const struct vm_special_ma= pping *sm, */ if (!IS_ENABLED(CONFIG_TIME_NS) || !timens_page) return VM_FAULT_SIGBUS; - pfn =3D __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + page =3D virt_to_page(vdso_k_time_data); break; case VDSO_RNG_PAGE_OFFSET: if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM)) return VM_FAULT_SIGBUS; - pfn =3D __phys_to_pfn(__pa_symbol(vdso_k_rng_data)); + page =3D virt_to_page(vdso_k_rng_data); break; case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END: if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) return VM_FAULT_SIGBUS; - pfn =3D __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) + - vmf->pgoff - VDSO_ARCH_PAGES_START; + page =3D virt_to_page(vdso_k_arch_data) + vmf->pgoff - VDSO_ARCH_PAGES_S= TART; break; default: return VM_FAULT_SIGBUS; } =20 - return vmf_insert_pfn(vma, vmf->address, pfn); + get_page(page); + vmf->page =3D page; + return 0; } =20 const struct vm_special_mapping vdso_vvar_mapping =3D { @@ -100,7 +129,7 @@ struct vm_area_struct *vdso_install_vvar_mapping(struct= mm_struct *mm, unsigned { return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | - VM_PFNMAP | VM_SEALED_SYSMAP, + VM_MIXEDMAP | VM_SEALED_SYSMAP, &vdso_vvar_mapping); } =20 diff --git a/mm/memblock.c b/mm/memblock.c index b3ddfdec7a80..ae6a5af46bd7 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -959,28 +959,6 @@ __init void memblock_clear_kho_scratch_only(void) { kho_scratch_only =3D false; } - -__init void memmap_init_kho_scratch_pages(void) -{ - phys_addr_t start, end; - unsigned long pfn; - int nid; - u64 i; - - if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) - return; - - /* - * Initialize struct pages for free scratch memory. - * The struct pages for reserved scratch memory will be set up in - * reserve_bootmem_region() - */ - __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, - MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { - for (pfn =3D PFN_UP(start); pfn < PFN_DOWN(end); pfn++) - init_deferred_page(pfn, nid); - } -} #endif =20 /** diff --git a/mm/mm_init.c b/mm/mm_init.c index df34797691bd..7363b5b0d22a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -786,7 +786,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { struct page *page =3D pfn_to_page(pfn); =20 - __init_deferred_page(pfn, nid); + if (!pfn_is_kho_scratch(pfn)) + __init_deferred_page(pfn, nid); =20 /* * no need for atomic set_bit because the struct @@ -1996,9 +1997,12 @@ static void __init deferred_free_pages(unsigned long= pfn, =20 /* Free a large naturally-aligned chunk if possible */ if (nr_pages =3D=3D MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { - for (i =3D 0; i < nr_pages; i +=3D pageblock_nr_pages) + for (i =3D 0; i < nr_pages; i +=3D pageblock_nr_pages) { + if (pfn_is_kho_scratch(page_to_pfn(page + i))) + continue; init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, false); + } __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return; } @@ -2007,7 +2011,7 @@ static void __init deferred_free_pages(unsigned long = pfn, accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); =20 for (i =3D 0; i < nr_pages; i++, page++, pfn++) { - if (pageblock_aligned(pfn)) + if (pageblock_aligned(pfn) && !pfn_is_kho_scratch(pfn)) init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); __free_pages_core(page, 0, MEMINIT_EARLY); @@ -2078,9 +2082,11 @@ deferred_init_memmap_chunk(unsigned long start_pfn, = unsigned long end_pfn, unsigned long mo_pfn =3D ALIGN(spfn + 1, MAX_ORDER_NR_PAGES); unsigned long chunk_end =3D min(mo_pfn, epfn); =20 - nr_pages +=3D deferred_init_pages(zone, spfn, chunk_end); - deferred_free_pages(spfn, chunk_end - spfn); + // KHO scratch is MAX_ORDER_NR_PAGES aligned. + if (!pfn_is_kho_scratch(spfn)) + deferred_init_pages(zone, spfn, chunk_end); =20 + deferred_free_pages(spfn, chunk_end - spfn); spfn =3D chunk_end; =20 if (can_resched) @@ -2088,6 +2094,7 @@ deferred_init_memmap_chunk(unsigned long start_pfn, u= nsigned long end_pfn, else touch_nmi_watchdog(); } + nr_pages +=3D epfn - spfn; } =20 return nr_pages; --oKQGQYgdjErEaAqZ--