[PATCH v7 27/31] gpu: nova-core: Hopper/Blackwell: larger WPR2 (GSP) heap

John Hubbard posted 31 patches 2 weeks, 5 days ago
There is a newer version of this series
[PATCH v7 27/31] gpu: nova-core: Hopper/Blackwell: larger WPR2 (GSP) heap
Posted by John Hubbard 2 weeks, 5 days ago
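Hopper, Blackwell and later GPUs require a larger heap for WPR2. Select
the GSP heap parameters by architecture: use a 14MB base RM size, a 142MB
client allocation size and a 170MB minimum heap size on Hopper/Blackwell,
and keep the existing values for the other architectures.

client_alloc_size() now returns a Result, so a failure to align the size
is reported as EINVAL instead of being saturated to u64::MAX.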

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 drivers/gpu/nova-core/gsp/fw.rs | 61 +++++++++++++++++++++++++--------
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/nova-core/gsp/fw.rs b/drivers/gpu/nova-core/gsp/fw.rs
index 4a8ba2721dd1..c2eee984bd4d 100644
--- a/drivers/gpu/nova-core/gsp/fw.rs
+++ b/drivers/gpu/nova-core/gsp/fw.rs
@@ -121,21 +121,41 @@ enum GspFwHeapParams {}
 /// Minimum required alignment for the GSP heap.
 const GSP_HEAP_ALIGNMENT: Alignment = Alignment::new::<{ 1 << 20 }>();
 
+// These constants override the generated bindings for architecture-specific heap sizing.
+// See Open RM: kgspCalculateGspFwHeapSize and related functions.
+//
+// 14MB for Hopper/Blackwell+.
+const GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100: u64 = 14 * num::usize_as_u64(SZ_1M);
+// 142MB client allocation size for Hopper/Blackwell+ (~188MB total).
+const GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE_GH100: u64 = 142 * num::usize_as_u64(SZ_1M);
+// Hopper/Blackwell+ minimum heap size: 170MB (88 + 12 + 70).
+// See Open RM: GSP_FW_HEAP_SIZE_OVERRIDE_LIBOS3_BAREMETAL_MIN_MB for the base 88MB,
+// plus Hopper+ additions in kgspCalculateGspFwHeapSize_GH100.
+const GSP_FW_HEAP_SIZE_OVERRIDE_LIBOS3_BAREMETAL_MIN_MB_HOPPER: u64 = 170;
+
 impl GspFwHeapParams {
     /// Returns the amount of GSP-RM heap memory used during GSP-RM boot and initialization (up to
     /// and including the first client subdevice allocation).
-    fn base_rm_size(_chipset: Chipset) -> u64 {
-        // TODO: this needs to be updated to return the correct value for Hopper+ once support for
-        // them is added:
-        // u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100)
-        u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X)
+    fn base_rm_size(chipset: Chipset) -> u64 {
+        use crate::gpu::Architecture;
+        match chipset.arch() {
+            Architecture::Hopper | Architecture::Blackwell => {
+                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
+            }
+            _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
+        }
     }
 
     /// Returns the amount of heap memory required to support a single channel allocation.
-    fn client_alloc_size() -> u64 {
-        u64::from(bindings::GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE)
-            .align_up(GSP_HEAP_ALIGNMENT)
-            .unwrap_or(u64::MAX)
+    fn client_alloc_size(chipset: Chipset) -> Result<u64> {
+        use crate::gpu::Architecture;
+        let size = match chipset.arch() {
+            Architecture::Hopper | Architecture::Blackwell => {
+                GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE_GH100
+            }
+            _ => u64::from(bindings::GSP_FW_HEAP_PARAM_CLIENT_ALLOC_SIZE),
+        };
+        size.align_up(GSP_HEAP_ALIGNMENT).ok_or(EINVAL)
     }
 
     /// Returns the amount of memory to reserve for management purposes for a framebuffer of size
@@ -179,12 +199,25 @@ impl LibosParams {
                 * num::usize_as_u64(SZ_1M),
     };
 
+    /// Hopper/Blackwell+ GPUs need a larger minimum heap size than the bindings specify.
+    /// The r570 bindings set LIBOS3_BAREMETAL_MIN_MB to 88MB, but Hopper/Blackwell+ GPUs
+    /// actually require 170MB (88 + 12 + 70).
+    const LIBOS_HOPPER: LibosParams = LibosParams {
+        carveout_size: num::u32_as_u64(bindings::GSP_FW_HEAP_PARAM_OS_SIZE_LIBOS3_BAREMETAL),
+        allowed_heap_size: GSP_FW_HEAP_SIZE_OVERRIDE_LIBOS3_BAREMETAL_MIN_MB_HOPPER
+            * num::usize_as_u64(SZ_1M)
+            ..num::u32_as_u64(bindings::GSP_FW_HEAP_SIZE_OVERRIDE_LIBOS3_BAREMETAL_MAX_MB)
+                * num::usize_as_u64(SZ_1M),
+    };
+
     /// Returns the libos parameters corresponding to `chipset`.
     pub(crate) fn from_chipset(chipset: Chipset) -> &'static LibosParams {
-        if chipset < Chipset::GA102 {
-            &Self::LIBOS2
-        } else {
-            &Self::LIBOS3
+        use crate::gpu::Architecture;
+        match chipset.arch() {
+            Architecture::Turing => &Self::LIBOS2,
+            Architecture::Ampere if chipset == Chipset::GA100 => &Self::LIBOS2,
+            Architecture::Ampere | Architecture::Ada => &Self::LIBOS3,
+            Architecture::Hopper | Architecture::Blackwell => &Self::LIBOS_HOPPER,
         }
     }
 
@@ -198,7 +231,7 @@ pub(crate) fn wpr_heap_size(&self, chipset: Chipset, fb_size: u64) -> Result<u64
             // RM boot working memory,
             .saturating_add(GspFwHeapParams::base_rm_size(chipset))
             // One RM client,
-            .saturating_add(GspFwHeapParams::client_alloc_size())
+            .saturating_add(GspFwHeapParams::client_alloc_size(chipset)?)
             // Overhead for memory management.
             .saturating_add(GspFwHeapParams::management_overhead(fb_size)?)
             // Clamp to the supported heap sizes.
-- 
2.53.0
Re: [PATCH v7 27/31] gpu: nova-core: Hopper/Blackwell: larger WPR2 (GSP) heap
Posted by kernel test robot 2 weeks, 5 days ago
Hi John,

kernel test robot noticed the following build errors:

[auto build test ERROR on d19ab42867ae7c68be84ed957d95712b7934773f]

url:    https://github.com/intel-lab-lkp/linux/commits/John-Hubbard/gpu-nova-core-Hopper-Blackwell-basic-GPU-identification/20260318-203344
base:   d19ab42867ae7c68be84ed957d95712b7934773f
patch link:    https://lore.kernel.org/r/20260317225355.549853-28-jhubbard%40nvidia.com
patch subject: [PATCH v7 27/31] gpu: nova-core: Hopper/Blackwell: larger WPR2 (GSP) heap
config: x86_64-rhel-9.4-rust (https://download.01.org/0day-ci/archive/20260318/202603181742.8HLcTchk-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
rustc: rustc 1.88.0 (6b00bc388 2025-06-23)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260318/202603181742.8HLcTchk-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603181742.8HLcTchk-lkp@intel.com/

All errors (new ones prefixed by >>):

   PATH=/opt/cross/clang-20/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
   INFO PATH=/opt/cross/rustc-1.88.0-bindgen-0.72.1/cargo/bin:/opt/cross/clang-20/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
   /usr/bin/timeout -k 100 12h /usr/bin/make KCFLAGS=\ -fno-crash-diagnostics\ -Wno-error=return-type\ -Wreturn-type\ -funsigned-char\ -Wundef\ -falign-functions=64 W=1 --keep-going LLVM=1 -j32 -C source O=/kbuild/obj/consumer/x86_64-rhel-9.4-rust ARCH=x86_64 SHELL=/bin/bash rustfmtcheck 
   make: Entering directory '/kbuild/src/consumer'
   make[1]: Entering directory '/kbuild/obj/consumer/x86_64-rhel-9.4-rust'
>> Diff in drivers/gpu/nova-core/gsp/fw.rs:139:
        fn base_rm_size(chipset: Chipset) -> u64 {
            use crate::gpu::Architecture;
            match chipset.arch() {
   -            Architecture::Hopper | Architecture::Blackwell => {
   -                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
   -            }
   +            Architecture::Hopper | Architecture::Blackwell => GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100,
                _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
            }
        }
>> Diff in drivers/gpu/nova-core/gsp/fw.rs:139:
        fn base_rm_size(chipset: Chipset) -> u64 {
            use crate::gpu::Architecture;
            match chipset.arch() {
   -            Architecture::Hopper | Architecture::Blackwell => {
   -                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
   -            }
   +            Architecture::Hopper | Architecture::Blackwell => GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100,
                _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
            }
        }
>> Diff in drivers/gpu/nova-core/gsp/fw.rs:139:
        fn base_rm_size(chipset: Chipset) -> u64 {
            use crate::gpu::Architecture;
            match chipset.arch() {
   -            Architecture::Hopper | Architecture::Blackwell => {
   -                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
   -            }
   +            Architecture::Hopper | Architecture::Blackwell => GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100,
                _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
            }
        }
   make[2]: *** [Makefile:1916: rustfmt] Error 123
   make[2]: Target 'rustfmtcheck' not remade because of errors.
   make[1]: Leaving directory '/kbuild/obj/consumer/x86_64-rhel-9.4-rust'
   make[1]: *** [Makefile:248: __sub-make] Error 2
   make[1]: Target 'rustfmtcheck' not remade because of errors.
   make: *** [Makefile:248: __sub-make] Error 2
   make: Target 'rustfmtcheck' not remade because of errors.
   make: Leaving directory '/kbuild/src/consumer'

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v7 27/31] gpu: nova-core: Hopper/Blackwell: larger WPR2 (GSP) heap
Posted by John Hubbard 2 weeks, 5 days ago
On 3/18/26 9:12 AM, kernel test robot wrote:
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202603181742.8HLcTchk-lkp@intel.com/

OK, some rustfmtcheck failures. This has revealed a major gap in my
build flow: although I have rustfmt(1) set up to run upon saving
files in my code editor, not all of my scripts run it.

The format-on-save approach worked so well that I completely forgot
about "make rustfmt" and "make rustfmtcheck". So now those are
part of all scripts and testing here.

Both issues (this one in gsp/fw.rs, and the one in gsp/boot.rs
reported against patch 31/31) are fixed in v8.

Sorry about the failures.

thanks,
-- 
John Hubbard

> 
> All errors (new ones prefixed by >>):
> 
>    PATH=/opt/cross/clang-20/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
>    INFO PATH=/opt/cross/rustc-1.88.0-bindgen-0.72.1/cargo/bin:/opt/cross/clang-20/bin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
>    /usr/bin/timeout -k 100 12h /usr/bin/make KCFLAGS=\ -fno-crash-diagnostics\ -Wno-error=return-type\ -Wreturn-type\ -funsigned-char\ -Wundef\ -falign-functions=64 W=1 --keep-going LLVM=1 -j32 -C source O=/kbuild/obj/consumer/x86_64-rhel-9.4-rust ARCH=x86_64 SHELL=/bin/bash rustfmtcheck 
>    make: Entering directory '/kbuild/src/consumer'
>    make[1]: Entering directory '/kbuild/obj/consumer/x86_64-rhel-9.4-rust'
>>> Diff in drivers/gpu/nova-core/gsp/fw.rs:139:
>         fn base_rm_size(chipset: Chipset) -> u64 {
>             use crate::gpu::Architecture;
>             match chipset.arch() {
>    -            Architecture::Hopper | Architecture::Blackwell => {
>    -                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
>    -            }
>    +            Architecture::Hopper | Architecture::Blackwell => GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100,
>                 _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
>             }
>         }
>>> Diff in drivers/gpu/nova-core/gsp/fw.rs:139:
>         fn base_rm_size(chipset: Chipset) -> u64 {
>             use crate::gpu::Architecture;
>             match chipset.arch() {
>    -            Architecture::Hopper | Architecture::Blackwell => {
>    -                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
>    -            }
>    +            Architecture::Hopper | Architecture::Blackwell => GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100,
>                 _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
>             }
>         }
>>> Diff in drivers/gpu/nova-core/gsp/fw.rs:139:
>         fn base_rm_size(chipset: Chipset) -> u64 {
>             use crate::gpu::Architecture;
>             match chipset.arch() {
>    -            Architecture::Hopper | Architecture::Blackwell => {
>    -                GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100
>    -            }
>    +            Architecture::Hopper | Architecture::Blackwell => GSP_FW_HEAP_PARAM_BASE_RM_SIZE_GH100,
>                 _ => u64::from(bindings::GSP_FW_HEAP_PARAM_BASE_RM_SIZE_TU10X),
>             }
>         }
>    make[2]: *** [Makefile:1916: rustfmt] Error 123
>    make[2]: Target 'rustfmtcheck' not remade because of errors.
>    make[1]: Leaving directory '/kbuild/obj/consumer/x86_64-rhel-9.4-rust'
>    make[1]: *** [Makefile:248: __sub-make] Error 2
>    make[1]: Target 'rustfmtcheck' not remade because of errors.
>    make: *** [Makefile:248: __sub-make] Error 2
>    make: Target 'rustfmtcheck' not remade because of errors.
>    make: Leaving directory '/kbuild/src/consumer'
>