[v9] gpu: nova-core: firmware: Hopper/Blackwell support

[PATCH v9 05/31] gpu: nova-core: set DMA mask width based on GPU architecture

Posted by John Hubbard 1 week ago

Replace the hardcoded 47-bit DMA mask with per-architecture values.
Add Architecture::dma_mask() with an exhaustive match, so new
architectures get a compile-time reminder to specify their width.

Set the DMA mask in Gpu::new(). Gpu owns all DMA allocations for
the device, so no concurrent allocations can exist while the
constructor is still running.

Move Spec creation into probe() so the dev_info is printed early,
and pass Spec into Gpu::new().

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 drivers/gpu/nova-core/driver.rs | 24 ++++++--------------
 drivers/gpu/nova-core/gpu.rs    | 39 +++++++++++++++++++++++----------
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs
index 84b0e1703150..bb82e63af044 100644
--- a/drivers/gpu/nova-core/driver.rs
+++ b/drivers/gpu/nova-core/driver.rs
@@ -4,8 +4,6 @@
     auxiliary,
     device::Core,
     devres::Devres,
-    dma::Device,
-    dma::DmaMask,
     pci,
     pci::{
         Class,
@@ -23,7 +21,10 @@
     },
 };
 
-use crate::gpu::Gpu;
+use crate::gpu::{
+    Gpu,
+    Spec, //
+};
 
 /// Counter for generating unique auxiliary device IDs.
 static AUXILIARY_ID_COUNTER: Atomic<u32> = Atomic::new(0);
@@ -38,14 +39,6 @@ pub(crate) struct NovaCore {
 
 const BAR0_SIZE: usize = SZ_16M;
 
-// For now we only support Ampere which can use up to 47-bit DMA addresses.
-//
-// TODO: Add an abstraction for this to support newer GPUs which may support
-// larger DMA addresses. Limiting these GPUs to smaller address widths won't
-// have any adverse affects, unless installed on systems which require larger
-// DMA addresses. These systems should be quite rare.
-const GPU_DMA_BITS: u32 = 47;
-
 pub(crate) type Bar0 = pci::Bar<BAR0_SIZE>;
 
 kernel::pci_device_table!(
@@ -84,18 +77,15 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
             pdev.enable_device_mem()?;
             pdev.set_master();
 
-            // SAFETY: No concurrent DMA allocations or mappings can be made because
-            // the device is still being probed and therefore isn't being used by
-            // other threads of execution.
-            unsafe { pdev.dma_set_mask_and_coherent(DmaMask::new::<GPU_DMA_BITS>())? };
-
             let bar = Arc::pin_init(
                 pdev.iomap_region_sized::<BAR0_SIZE>(0, c"nova-core/bar0"),
                 GFP_KERNEL,
             )?;
+            let spec = Spec::new(pdev.as_ref(), bar.access(pdev.as_ref())?)?;
+            dev_info!(pdev, "NVIDIA ({})\n", spec);
 
             Ok(try_pin_init!(Self {
-                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?),
+                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?, spec),
                 _reg <- auxiliary::Registration::new(
                     pdev.as_ref(),
                     c"nova-drm",
diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index 685ae4c81268..f70bfbda1614 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -3,6 +3,10 @@
 use kernel::{
     device,
     devres::Devres,
+    dma::{
+        Device,
+        DmaMask, //
+    },
     fmt,
     pci,
     prelude::*,
@@ -160,6 +164,16 @@ pub(crate) enum Architecture {
     BlackwellGB20x = 0x1b,
 }
 
+impl Architecture {
+    /// Returns the DMA mask supported by this architecture.
+    pub(crate) const fn dma_mask(&self) -> DmaMask {
+        match self {
+            Self::Turing | Self::Ampere | Self::Ada => DmaMask::new::<47>(),
+            Self::Hopper | Self::BlackwellGB10x | Self::BlackwellGB20x => DmaMask::new::<52>(),
+        }
+    }
+}
+
 impl TryFrom<u8> for Architecture {
     type Error = Error;
 
@@ -212,7 +226,7 @@ pub(crate) struct Spec {
 }
 
 impl Spec {
-    fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
+    pub(crate) fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
         // Some brief notes about boot0 and boot42, in chronological order:
         //
         // NV04 through NV50:
@@ -244,7 +258,6 @@ fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
     }
 
     /// Returns this GPU's chipset.
-    #[expect(dead_code)]
     pub(crate) fn chipset(self) -> Chipset {
         self.chipset
     }
@@ -292,36 +305,40 @@ pub(crate) struct Gpu {
 
 impl Gpu {
     pub(crate) fn new<'a>(
-        pdev: &'a pci::Device<device::Bound>,
+        pdev: &'a pci::Device<device::Core>,
         devres_bar: Arc<Devres<Bar0>>,
         bar: &'a Bar0,
+        spec: Spec,
     ) -> impl PinInit<Self, Error> + 'a {
-        try_pin_init!(Self {
-            spec: Spec::new(pdev.as_ref(), bar).inspect(|spec| {
-                dev_info!(pdev,"NVIDIA ({})\n", spec);
-            })?,
+        let dma_mask = spec.chipset().arch().dma_mask();
 
+        try_pin_init!(Self {
             // We must wait for GFW_BOOT completion before doing any significant setup on the GPU.
             _: {
+                // SAFETY: `Gpu` owns all DMA allocations for this device, and we are
+                // still constructing it, so no concurrent DMA allocations can exist.
+                unsafe { pdev.dma_set_mask_and_coherent(dma_mask)? };
+
                 gfw::wait_gfw_boot_completion(bar)
                     .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
             },
 
-            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset)?,
+            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset())?,
 
             gsp_falcon: Falcon::new(
                 pdev.as_ref(),
-                spec.chipset,
+                spec.chipset(),
             )
             .inspect(|falcon| falcon.clear_swgen0_intr(bar))?,
 
-            sec2_falcon: Falcon::new(pdev.as_ref(), spec.chipset)?,
+            sec2_falcon: Falcon::new(pdev.as_ref(), spec.chipset())?,
 
             gsp <- Gsp::new(pdev),
 
-            _: { gsp.boot(pdev, bar, spec.chipset, gsp_falcon, sec2_falcon)? },
+            _: { gsp.boot(pdev, bar, spec.chipset(), gsp_falcon, sec2_falcon)? },
 
             bar: devres_bar,
+            spec,
         })
     }
 
-- 
2.53.0

Re: [PATCH v9 05/31] gpu: nova-core: set DMA mask width based on GPU architecture

Posted by Alexandre Courbot 2 days, 11 hours ago

On Thu Mar 26, 2026 at 10:38 AM JST, John Hubbard wrote:
> Replace the hardcoded 47-bit DMA mask with per-architecture values.
> Add Architecture::dma_mask() with an exhaustive match, so new
> architectures get a compile-time reminder to specify their width.
>
> Set the DMA mask in Gpu::new(). Gpu owns all DMA allocations for
> the device, so no concurrent allocations can exist while the
> constructor is still running.
>
> Move Spec creation into probe() so the dev_info is printed early,
> and pass Spec into Gpu::new().
>
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> ---
>  drivers/gpu/nova-core/driver.rs | 24 ++++++--------------
>  drivers/gpu/nova-core/gpu.rs    | 39 +++++++++++++++++++++++----------
>  2 files changed, 35 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs
> index 84b0e1703150..bb82e63af044 100644
> --- a/drivers/gpu/nova-core/driver.rs
> +++ b/drivers/gpu/nova-core/driver.rs
> @@ -4,8 +4,6 @@
>      auxiliary,
>      device::Core,
>      devres::Devres,
> -    dma::Device,
> -    dma::DmaMask,
>      pci,
>      pci::{
>          Class,
> @@ -23,7 +21,10 @@
>      },
>  };
>  
> -use crate::gpu::Gpu;
> +use crate::gpu::{
> +    Gpu,
> +    Spec, //
> +};
>  
>  /// Counter for generating unique auxiliary device IDs.
>  static AUXILIARY_ID_COUNTER: Atomic<u32> = Atomic::new(0);
> @@ -38,14 +39,6 @@ pub(crate) struct NovaCore {
>  
>  const BAR0_SIZE: usize = SZ_16M;
>  
> -// For now we only support Ampere which can use up to 47-bit DMA addresses.
> -//
> -// TODO: Add an abstraction for this to support newer GPUs which may support
> -// larger DMA addresses. Limiting these GPUs to smaller address widths won't
> -// have any adverse affects, unless installed on systems which require larger
> -// DMA addresses. These systems should be quite rare.
> -const GPU_DMA_BITS: u32 = 47;
> -
>  pub(crate) type Bar0 = pci::Bar<BAR0_SIZE>;
>  
>  kernel::pci_device_table!(
> @@ -84,18 +77,15 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
>              pdev.enable_device_mem()?;
>              pdev.set_master();
>  
> -            // SAFETY: No concurrent DMA allocations or mappings can be made because
> -            // the device is still being probed and therefore isn't being used by
> -            // other threads of execution.
> -            unsafe { pdev.dma_set_mask_and_coherent(DmaMask::new::<GPU_DMA_BITS>())? };
> -
>              let bar = Arc::pin_init(
>                  pdev.iomap_region_sized::<BAR0_SIZE>(0, c"nova-core/bar0"),
>                  GFP_KERNEL,
>              )?;
> +            let spec = Spec::new(pdev.as_ref(), bar.access(pdev.as_ref())?)?;
> +            dev_info!(pdev, "NVIDIA ({})\n", spec);

Now that we have moved to DMA mask into `Gpu::new`, what is the point of
creating `spec` here? Its sole purpose is to be passed to `Gpu::new`,
and that doesn't change by the end of the series.

>  
>              Ok(try_pin_init!(Self {
> -                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?),
> +                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?, spec),
>                  _reg <- auxiliary::Registration::new(
>                      pdev.as_ref(),
>                      c"nova-drm",
> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
> index 685ae4c81268..f70bfbda1614 100644
> --- a/drivers/gpu/nova-core/gpu.rs
> +++ b/drivers/gpu/nova-core/gpu.rs
> @@ -3,6 +3,10 @@
>  use kernel::{
>      device,
>      devres::Devres,
> +    dma::{
> +        Device,
> +        DmaMask, //
> +    },
>      fmt,
>      pci,
>      prelude::*,
> @@ -160,6 +164,16 @@ pub(crate) enum Architecture {
>      BlackwellGB20x = 0x1b,
>  }
>  
> +impl Architecture {
> +    /// Returns the DMA mask supported by this architecture.
> +    pub(crate) const fn dma_mask(&self) -> DmaMask {
> +        match self {
> +            Self::Turing | Self::Ampere | Self::Ada => DmaMask::new::<47>(),
> +            Self::Hopper | Self::BlackwellGB10x | Self::BlackwellGB20x => DmaMask::new::<52>(),
> +        }
> +    }
> +}
> +
>  impl TryFrom<u8> for Architecture {
>      type Error = Error;
>  
> @@ -212,7 +226,7 @@ pub(crate) struct Spec {
>  }
>  
>  impl Spec {
> -    fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
> +    pub(crate) fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
>          // Some brief notes about boot0 and boot42, in chronological order:
>          //
>          // NV04 through NV50:
> @@ -244,7 +258,6 @@ fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
>      }
>  
>      /// Returns this GPU's chipset.
> -    #[expect(dead_code)]
>      pub(crate) fn chipset(self) -> Chipset {
>          self.chipset
>      }
> @@ -292,36 +305,40 @@ pub(crate) struct Gpu {
>  
>  impl Gpu {
>      pub(crate) fn new<'a>(
> -        pdev: &'a pci::Device<device::Bound>,
> +        pdev: &'a pci::Device<device::Core>,
>          devres_bar: Arc<Devres<Bar0>>,
>          bar: &'a Bar0,
> +        spec: Spec,
>      ) -> impl PinInit<Self, Error> + 'a {
> -        try_pin_init!(Self {
> -            spec: Spec::new(pdev.as_ref(), bar).inspect(|spec| {
> -                dev_info!(pdev,"NVIDIA ({})\n", spec);
> -            })?,
> +        let dma_mask = spec.chipset().arch().dma_mask();
>  
> +        try_pin_init!(Self {
>              // We must wait for GFW_BOOT completion before doing any significant setup on the GPU.
>              _: {
> +                // SAFETY: `Gpu` owns all DMA allocations for this device, and we are
> +                // still constructing it, so no concurrent DMA allocations can exist.
> +                unsafe { pdev.dma_set_mask_and_coherent(dma_mask)? };
> +
>                  gfw::wait_gfw_boot_completion(bar)
>                      .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
>              },
>  
> -            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset)?,
> +            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset())?,
>  
>              gsp_falcon: Falcon::new(
>                  pdev.as_ref(),
> -                spec.chipset,
> +                spec.chipset(),
>              )
>              .inspect(|falcon| falcon.clear_swgen0_intr(bar))?,
>  
> -            sec2_falcon: Falcon::new(pdev.as_ref(), spec.chipset)?,
> +            sec2_falcon: Falcon::new(pdev.as_ref(), spec.chipset())?,

Ok, so you've removed the local variable as per my v8 feedback - now you
can also access `chipset` (the member, not the method - which should not
be needed anyway as `spec` doesn't need to be created in `driver.rs`)
directly, and remove some noise from the diff.

Re: [PATCH v9 05/31] gpu: nova-core: set DMA mask width based on GPU architecture

Posted by John Hubbard 2 days, 4 hours ago

On 3/30/26 7:32 AM, Alexandre Courbot wrote:
> On Thu Mar 26, 2026 at 10:38 AM JST, John Hubbard wrote:
...
>>  kernel::pci_device_table!(
>> @@ -84,18 +77,15 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
>>              pdev.enable_device_mem()?;
>>              pdev.set_master();
>>  
>> -            // SAFETY: No concurrent DMA allocations or mappings can be made because
>> -            // the device is still being probed and therefore isn't being used by
>> -            // other threads of execution.
>> -            unsafe { pdev.dma_set_mask_and_coherent(DmaMask::new::<GPU_DMA_BITS>())? };
>> -
>>              let bar = Arc::pin_init(
>>                  pdev.iomap_region_sized::<BAR0_SIZE>(0, c"nova-core/bar0"),
>>                  GFP_KERNEL,
>>              )?;
>> +            let spec = Spec::new(pdev.as_ref(), bar.access(pdev.as_ref())?)?;
>> +            dev_info!(pdev, "NVIDIA ({})\n", spec);
> 
> Now that we have moved to DMA mask into `Gpu::new`, what is the point of
> creating `spec` here? Its sole purpose is to be passed to `Gpu::new`,
> and that doesn't change by the end of the series.

Agreed. For v10, I've moved Spec creation back inside Gpu::new() where
it was before.

The original reason for pulling it into probe() was that the DMA mask
lived there (per your v6 feedback about the safety argument). Now that
the DMA mask moved into Gpu::new() with a better safety justification,
there is no reason for Spec to remain in probe().

...
>>  impl Gpu {
>>      pub(crate) fn new<'a>(
>> -        pdev: &'a pci::Device<device::Bound>,
>> +        pdev: &'a pci::Device<device::Core>,
>>          devres_bar: Arc<Devres<Bar0>>,
>>          bar: &'a Bar0,
>> +        spec: Spec,
>>      ) -> impl PinInit<Self, Error> + 'a {
>> -        try_pin_init!(Self {
>> -            spec: Spec::new(pdev.as_ref(), bar).inspect(|spec| {
>> -                dev_info!(pdev,"NVIDIA ({})\n", spec);
>> -            })?,
>> +        let dma_mask = spec.chipset().arch().dma_mask();
>>  
>> +        try_pin_init!(Self {
>>              // We must wait for GFW_BOOT completion before doing any significant setup on the GPU.
>>              _: {
>> +                // SAFETY: `Gpu` owns all DMA allocations for this device, and we are
>> +                // still constructing it, so no concurrent DMA allocations can exist.
>> +                unsafe { pdev.dma_set_mask_and_coherent(dma_mask)? };
>> +
>>                  gfw::wait_gfw_boot_completion(bar)
>>                      .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
>>              },
>>  
>> -            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset)?,
>> +            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset())?,
>>  
>>              gsp_falcon: Falcon::new(
>>                  pdev.as_ref(),
>> -                spec.chipset,
>> +                spec.chipset(),
>>              )
>>              .inspect(|falcon| falcon.clear_swgen0_intr(bar))?,
>>  
>> -            sec2_falcon: Falcon::new(pdev.as_ref(), spec.chipset)?,
>> +            sec2_falcon: Falcon::new(pdev.as_ref(), spec.chipset())?,
> 
> Ok, so you've removed the local variable as per my v8 feedback - now you
> can also access `chipset` (the member, not the method - which should not
> be needed anyway as `spec` doesn't need to be created in `driver.rs`)
> directly, and remove some noise from the diff.
> 

Done. With Spec back inside Gpu::new(), the try_pin_init! macro gives
direct field access to spec.chipset, so the diff no longer touches those
lines at all.

thanks,
-- 
John Hubbard