[PATCH v7 05/31] gpu: nova-core: set DMA mask width based on GPU architecture

John Hubbard posted 31 patches 2 weeks, 5 days ago
There is a newer version of this series
[PATCH v7 05/31] gpu: nova-core: set DMA mask width based on GPU architecture
Posted by John Hubbard 2 weeks, 5 days ago
Replace the hardcoded 47-bit DMA mask with per-architecture values.
Hopper and Blackwell support 52-bit DMA addresses, while Turing,
Ampere, and Ada use 47-bit.

Add Architecture::dma_mask() as a const method with an exhaustive
match, so new architectures get a compile-time reminder to specify
their DMA mask width. Move Spec creation into probe() so the
architecture is known before setting the DMA mask, and pass the Spec
into Gpu::new().

Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
---
 drivers/gpu/nova-core/driver.rs | 28 +++++++--------
 drivers/gpu/nova-core/gpu.rs    | 60 +++++++++++++++++++--------------
 2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs
index 84b0e1703150..41227d29934e 100644
--- a/drivers/gpu/nova-core/driver.rs
+++ b/drivers/gpu/nova-core/driver.rs
@@ -5,7 +5,6 @@
     device::Core,
     devres::Devres,
     dma::Device,
-    dma::DmaMask,
     pci,
     pci::{
         Class,
@@ -23,7 +22,10 @@
     },
 };
 
-use crate::gpu::Gpu;
+use crate::gpu::{
+    Gpu,
+    Spec, //
+};
 
 /// Counter for generating unique auxiliary device IDs.
 static AUXILIARY_ID_COUNTER: Atomic<u32> = Atomic::new(0);
@@ -38,14 +40,6 @@ pub(crate) struct NovaCore {
 
 const BAR0_SIZE: usize = SZ_16M;
 
-// For now we only support Ampere which can use up to 47-bit DMA addresses.
-//
-// TODO: Add an abstraction for this to support newer GPUs which may support
-// larger DMA addresses. Limiting these GPUs to smaller address widths won't
-// have any adverse affects, unless installed on systems which require larger
-// DMA addresses. These systems should be quite rare.
-const GPU_DMA_BITS: u32 = 47;
-
 pub(crate) type Bar0 = pci::Bar<BAR0_SIZE>;
 
 kernel::pci_device_table!(
@@ -84,18 +78,20 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
             pdev.enable_device_mem()?;
             pdev.set_master();
 
-            // SAFETY: No concurrent DMA allocations or mappings can be made because
-            // the device is still being probed and therefore isn't being used by
-            // other threads of execution.
-            unsafe { pdev.dma_set_mask_and_coherent(DmaMask::new::<GPU_DMA_BITS>())? };
-
             let bar = Arc::pin_init(
                 pdev.iomap_region_sized::<BAR0_SIZE>(0, c"nova-core/bar0"),
                 GFP_KERNEL,
             )?;
+            let spec = Spec::new(pdev.as_ref(), bar.access(pdev.as_ref())?)?;
+            dev_info!(pdev, "NVIDIA ({})\n", spec);
+
+            // SAFETY: No concurrent DMA allocations or mappings can be made because
+            // the device is still being probed and therefore isn't being used by
+            // other threads of execution.
+            unsafe { pdev.dma_set_mask_and_coherent(spec.chipset().arch().dma_mask())? };
 
             Ok(try_pin_init!(Self {
-                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?),
+                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?, spec),
                 _reg <- auxiliary::Registration::new(
                     pdev.as_ref(),
                     c"nova-drm",
diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index 8f317d213908..9e140463603b 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -3,6 +3,7 @@
 use kernel::{
     device,
     devres::Devres,
+    dma::DmaMask,
     fmt,
     pci,
     prelude::*,
@@ -162,6 +163,19 @@ pub(crate) enum Architecture {
     Blackwell = 0x1b,
 }
 
+impl Architecture {
+    /// Returns the DMA mask supported by this architecture.
+    ///
+    /// Hopper and Blackwell support 52-bit DMA addresses, while earlier
+    /// architectures (Turing, Ampere, Ada) support 47-bit.
+    pub(crate) const fn dma_mask(&self) -> DmaMask {
+        match self {
+            Self::Turing | Self::Ampere | Self::Ada => DmaMask::new::<47>(),
+            Self::Hopper | Self::Blackwell => DmaMask::new::<52>(),
+        }
+    }
+}
+
 impl TryFrom<u8> for Architecture {
     type Error = Error;
 
@@ -211,7 +225,7 @@ pub(crate) struct Spec {
 }
 
 impl Spec {
-    fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
+    pub(crate) fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
         // Some brief notes about boot0 and boot42, in chronological order:
         //
         // NV04 through NV50:
@@ -292,38 +306,34 @@ pub(crate) fn new<'a>(
         pdev: &'a pci::Device<device::Bound>,
         devres_bar: Arc<Devres<Bar0>>,
         bar: &'a Bar0,
+        spec: Spec,
     ) -> impl PinInit<Self, Error> + 'a {
-        pin_init::pin_init_scope(move || {
-            let spec = Spec::new(pdev.as_ref(), bar)?;
-            dev_info!(pdev, "NVIDIA ({})\n", spec);
-
-            let chipset = spec.chipset();
+        let chipset = spec.chipset();
 
-            Ok(try_pin_init!(Self {
-                // We must wait for GFW_BOOT completion before doing any significant setup
-                // on the GPU.
-                _: {
-                    gfw::wait_gfw_boot_completion(bar)
-                        .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
-                },
+        try_pin_init!(Self {
+            // We must wait for GFW_BOOT completion before doing any significant setup
+            // on the GPU.
+            _: {
+                gfw::wait_gfw_boot_completion(bar)
+                    .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
+            },
 
-                sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, chipset)?,
+            sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, chipset)?,
 
-                gsp_falcon: Falcon::new(
-                    pdev.as_ref(),
-                    chipset,
-                )
-                .inspect(|falcon| falcon.clear_swgen0_intr(bar))?,
+            gsp_falcon: Falcon::new(
+                pdev.as_ref(),
+                chipset,
+            )
+            .inspect(|falcon| falcon.clear_swgen0_intr(bar))?,
 
-                sec2_falcon: Falcon::new(pdev.as_ref(), chipset)?,
+            sec2_falcon: Falcon::new(pdev.as_ref(), chipset)?,
 
-                gsp <- Gsp::new(pdev),
+            gsp <- Gsp::new(pdev),
 
-                _: { gsp.boot(pdev, bar, chipset, gsp_falcon, sec2_falcon)? },
+            _: { gsp.boot(pdev, bar, chipset, gsp_falcon, sec2_falcon)? },
 
-                bar: devres_bar,
-                spec,
-            }))
+            bar: devres_bar,
+            spec,
         })
     }
 
-- 
2.53.0
Re: [PATCH v7 05/31] gpu: nova-core: set DMA mask width based on GPU architecture
Posted by Alexandre Courbot 2 weeks ago
On Wed Mar 18, 2026 at 7:53 AM JST, John Hubbard wrote:
> Replace the hardcoded 47-bit DMA mask with per-architecture values.
> Hopper and Blackwell support 52-bit DMA addresses, while Turing,
> Ampere, and Ada use 47-bit.
>
> Add Architecture::dma_mask() as a const method with an exhaustive
> match, so new architectures get a compile-time reminder to specify
> their DMA mask width. Move Spec creation into probe() so the
> architecture is known before setting the DMA mask, and pass the Spec
> into Gpu::new().
>
> Cc: Danilo Krummrich <dakr@kernel.org>
> Cc: Gary Guo <gary@garyguo.net>

Why the Ccs? Some patches in the series seem to have random people Cc'd
to them.

> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
> ---
>  drivers/gpu/nova-core/driver.rs | 28 +++++++--------
>  drivers/gpu/nova-core/gpu.rs    | 60 +++++++++++++++++++--------------
>  2 files changed, 47 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs
> index 84b0e1703150..41227d29934e 100644
> --- a/drivers/gpu/nova-core/driver.rs
> +++ b/drivers/gpu/nova-core/driver.rs
> @@ -5,7 +5,6 @@
>      device::Core,
>      devres::Devres,
>      dma::Device,
> -    dma::DmaMask,
>      pci,
>      pci::{
>          Class,
> @@ -23,7 +22,10 @@
>      },
>  };
>  
> -use crate::gpu::Gpu;
> +use crate::gpu::{
> +    Gpu,
> +    Spec, //
> +};
>  
>  /// Counter for generating unique auxiliary device IDs.
>  static AUXILIARY_ID_COUNTER: Atomic<u32> = Atomic::new(0);
> @@ -38,14 +40,6 @@ pub(crate) struct NovaCore {
>  
>  const BAR0_SIZE: usize = SZ_16M;
>  
> -// For now we only support Ampere which can use up to 47-bit DMA addresses.
> -//
> -// TODO: Add an abstraction for this to support newer GPUs which may support
> -// larger DMA addresses. Limiting these GPUs to smaller address widths won't
> -// have any adverse affects, unless installed on systems which require larger
> -// DMA addresses. These systems should be quite rare.
> -const GPU_DMA_BITS: u32 = 47;
> -
>  pub(crate) type Bar0 = pci::Bar<BAR0_SIZE>;
>  
>  kernel::pci_device_table!(
> @@ -84,18 +78,20 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
>              pdev.enable_device_mem()?;
>              pdev.set_master();
>  
> -            // SAFETY: No concurrent DMA allocations or mappings can be made because
> -            // the device is still being probed and therefore isn't being used by
> -            // other threads of execution.
> -            unsafe { pdev.dma_set_mask_and_coherent(DmaMask::new::<GPU_DMA_BITS>())? };
> -
>              let bar = Arc::pin_init(
>                  pdev.iomap_region_sized::<BAR0_SIZE>(0, c"nova-core/bar0"),
>                  GFP_KERNEL,
>              )?;
> +            let spec = Spec::new(pdev.as_ref(), bar.access(pdev.as_ref())?)?;
> +            dev_info!(pdev, "NVIDIA ({})\n", spec);
> +
> +            // SAFETY: No concurrent DMA allocations or mappings can be made because
> +            // the device is still being probed and therefore isn't being used by
> +            // other threads of execution.
> +            unsafe { pdev.dma_set_mask_and_coherent(spec.chipset().arch().dma_mask())? };
>  
>              Ok(try_pin_init!(Self {
> -                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?),
> +                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?, spec),
>                  _reg <- auxiliary::Registration::new(
>                      pdev.as_ref(),
>                      c"nova-drm",
> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
> index 8f317d213908..9e140463603b 100644
> --- a/drivers/gpu/nova-core/gpu.rs
> +++ b/drivers/gpu/nova-core/gpu.rs
> @@ -3,6 +3,7 @@
>  use kernel::{
>      device,
>      devres::Devres,
> +    dma::DmaMask,
>      fmt,
>      pci,
>      prelude::*,
> @@ -162,6 +163,19 @@ pub(crate) enum Architecture {
>      Blackwell = 0x1b,
>  }
>  
> +impl Architecture {
> +    /// Returns the DMA mask supported by this architecture.
> +    ///
> +    /// Hopper and Blackwell support 52-bit DMA addresses, while earlier
> +    /// architectures (Turing, Ampere, Ada) support 47-bit.

This last sentence is unneeded, we describe what methods provide in
doccomments, not how they do it or what the result will be.

> +    pub(crate) const fn dma_mask(&self) -> DmaMask {
> +        match self {
> +            Self::Turing | Self::Ampere | Self::Ada => DmaMask::new::<47>(),
> +            Self::Hopper | Self::Blackwell => DmaMask::new::<52>(),
> +        }
> +    }
> +}

I see you introduce a `Gpu` HAL in the next patch. I think this should
also be part of the HAL - there is no benefit in having this method
const since the architecture is probed at runtime anyway.

> +
>  impl TryFrom<u8> for Architecture {
>      type Error = Error;
>  
> @@ -211,7 +225,7 @@ pub(crate) struct Spec {
>  }
>  
>  impl Spec {
> -    fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
> +    pub(crate) fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
>          // Some brief notes about boot0 and boot42, in chronological order:
>          //
>          // NV04 through NV50:
> @@ -292,38 +306,34 @@ pub(crate) fn new<'a>(
>          pdev: &'a pci::Device<device::Bound>,
>          devres_bar: Arc<Devres<Bar0>>,
>          bar: &'a Bar0,
> +        spec: Spec,
>      ) -> impl PinInit<Self, Error> + 'a {
> -        pin_init::pin_init_scope(move || {
> -            let spec = Spec::new(pdev.as_ref(), bar)?;
> -            dev_info!(pdev, "NVIDIA ({})\n", spec);
> -
> -            let chipset = spec.chipset();
> +        let chipset = spec.chipset();
>  
> -            Ok(try_pin_init!(Self {
> -                // We must wait for GFW_BOOT completion before doing any significant setup
> -                // on the GPU.
> -                _: {
> -                    gfw::wait_gfw_boot_completion(bar)
> -                        .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
> -                },
> +        try_pin_init!(Self {

What, and now we undo what we just did in patch 4? 0_o; What was that
all for?

I did a `git diff` between this step in the series and the state two
steps above, and it seems to confirm my intuition on patch 4: you just
need a few more `Copy` implementations. They can be added in this patch,
and patch 4 dropped altogether.
Re: [PATCH v7 05/31] gpu: nova-core: set DMA mask width based on GPU architecture
Posted by John Hubbard 1 week, 5 days ago
On 3/23/26 6:02 AM, Alexandre Courbot wrote:
> On Wed Mar 18, 2026 at 7:53 AM JST, John Hubbard wrote:
>> Replace the hardcoded 47-bit DMA mask with per-architecture values.
>> Hopper and Blackwell support 52-bit DMA addresses, while Turing,
>> Ampere, and Ada use 47-bit.
>>
>> Add Architecture::dma_mask() as a const method with an exhaustive
>> match, so new architectures get a compile-time reminder to specify
>> their DMA mask width. Move Spec creation into probe() so the
>> architecture is known before setting the DMA mask, and pass the Spec
>> into Gpu::new().
>>
>> Cc: Danilo Krummrich <dakr@kernel.org>
>> Cc: Gary Guo <gary@garyguo.net>
> 
> Why the Ccs? Some patches in the series seem to have random people Cc'd
> to them.

Many many months ago, I wanted to follow a convention of adding
people to Cc if they had responded with comments. Several versions
into it, I just gave up, it's too much trouble for a large patchset.

So I'll just remove all Cc's for v8.

I've fixed all of the other comments below, in v8.


thanks,
-- 
John Hubbard

> 
>> Signed-off-by: John Hubbard <jhubbard@nvidia.com>
>> ---
>>  drivers/gpu/nova-core/driver.rs | 28 +++++++--------
>>  drivers/gpu/nova-core/gpu.rs    | 60 +++++++++++++++++++--------------
>>  2 files changed, 47 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/gpu/nova-core/driver.rs b/drivers/gpu/nova-core/driver.rs
>> index 84b0e1703150..41227d29934e 100644
>> --- a/drivers/gpu/nova-core/driver.rs
>> +++ b/drivers/gpu/nova-core/driver.rs
>> @@ -5,7 +5,6 @@
>>      device::Core,
>>      devres::Devres,
>>      dma::Device,
>> -    dma::DmaMask,
>>      pci,
>>      pci::{
>>          Class,
>> @@ -23,7 +22,10 @@
>>      },
>>  };
>>  
>> -use crate::gpu::Gpu;
>> +use crate::gpu::{
>> +    Gpu,
>> +    Spec, //
>> +};
>>  
>>  /// Counter for generating unique auxiliary device IDs.
>>  static AUXILIARY_ID_COUNTER: Atomic<u32> = Atomic::new(0);
>> @@ -38,14 +40,6 @@ pub(crate) struct NovaCore {
>>  
>>  const BAR0_SIZE: usize = SZ_16M;
>>  
>> -// For now we only support Ampere which can use up to 47-bit DMA addresses.
>> -//
>> -// TODO: Add an abstraction for this to support newer GPUs which may support
>> -// larger DMA addresses. Limiting these GPUs to smaller address widths won't
>> -// have any adverse affects, unless installed on systems which require larger
>> -// DMA addresses. These systems should be quite rare.
>> -const GPU_DMA_BITS: u32 = 47;
>> -
>>  pub(crate) type Bar0 = pci::Bar<BAR0_SIZE>;
>>  
>>  kernel::pci_device_table!(
>> @@ -84,18 +78,20 @@ fn probe(pdev: &pci::Device<Core>, _info: &Self::IdInfo) -> impl PinInit<Self, E
>>              pdev.enable_device_mem()?;
>>              pdev.set_master();
>>  
>> -            // SAFETY: No concurrent DMA allocations or mappings can be made because
>> -            // the device is still being probed and therefore isn't being used by
>> -            // other threads of execution.
>> -            unsafe { pdev.dma_set_mask_and_coherent(DmaMask::new::<GPU_DMA_BITS>())? };
>> -
>>              let bar = Arc::pin_init(
>>                  pdev.iomap_region_sized::<BAR0_SIZE>(0, c"nova-core/bar0"),
>>                  GFP_KERNEL,
>>              )?;
>> +            let spec = Spec::new(pdev.as_ref(), bar.access(pdev.as_ref())?)?;
>> +            dev_info!(pdev, "NVIDIA ({})\n", spec);
>> +
>> +            // SAFETY: No concurrent DMA allocations or mappings can be made because
>> +            // the device is still being probed and therefore isn't being used by
>> +            // other threads of execution.
>> +            unsafe { pdev.dma_set_mask_and_coherent(spec.chipset().arch().dma_mask())? };
>>  
>>              Ok(try_pin_init!(Self {
>> -                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?),
>> +                gpu <- Gpu::new(pdev, bar.clone(), bar.access(pdev.as_ref())?, spec),
>>                  _reg <- auxiliary::Registration::new(
>>                      pdev.as_ref(),
>>                      c"nova-drm",
>> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
>> index 8f317d213908..9e140463603b 100644
>> --- a/drivers/gpu/nova-core/gpu.rs
>> +++ b/drivers/gpu/nova-core/gpu.rs
>> @@ -3,6 +3,7 @@
>>  use kernel::{
>>      device,
>>      devres::Devres,
>> +    dma::DmaMask,
>>      fmt,
>>      pci,
>>      prelude::*,
>> @@ -162,6 +163,19 @@ pub(crate) enum Architecture {
>>      Blackwell = 0x1b,
>>  }
>>  
>> +impl Architecture {
>> +    /// Returns the DMA mask supported by this architecture.
>> +    ///
>> +    /// Hopper and Blackwell support 52-bit DMA addresses, while earlier
>> +    /// architectures (Turing, Ampere, Ada) support 47-bit.
> 
> This last sentence is unneeded, we describe what methods provide in
> doccomments, not how they do it or what the result will be.
> 
>> +    pub(crate) const fn dma_mask(&self) -> DmaMask {
>> +        match self {
>> +            Self::Turing | Self::Ampere | Self::Ada => DmaMask::new::<47>(),
>> +            Self::Hopper | Self::Blackwell => DmaMask::new::<52>(),
>> +        }
>> +    }
>> +}
> 
> I see you introduce a `Gpu` HAL in the next patch. I think this should
> also be part of the HAL - there is no benefit in having this method
> const since the architecture is probed at runtime anyway.
> 
>> +
>>  impl TryFrom<u8> for Architecture {
>>      type Error = Error;
>>  
>> @@ -211,7 +225,7 @@ pub(crate) struct Spec {
>>  }
>>  
>>  impl Spec {
>> -    fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
>> +    pub(crate) fn new(dev: &device::Device, bar: &Bar0) -> Result<Spec> {
>>          // Some brief notes about boot0 and boot42, in chronological order:
>>          //
>>          // NV04 through NV50:
>> @@ -292,38 +306,34 @@ pub(crate) fn new<'a>(
>>          pdev: &'a pci::Device<device::Bound>,
>>          devres_bar: Arc<Devres<Bar0>>,
>>          bar: &'a Bar0,
>> +        spec: Spec,
>>      ) -> impl PinInit<Self, Error> + 'a {
>> -        pin_init::pin_init_scope(move || {
>> -            let spec = Spec::new(pdev.as_ref(), bar)?;
>> -            dev_info!(pdev, "NVIDIA ({})\n", spec);
>> -
>> -            let chipset = spec.chipset();
>> +        let chipset = spec.chipset();
>>  
>> -            Ok(try_pin_init!(Self {
>> -                // We must wait for GFW_BOOT completion before doing any significant setup
>> -                // on the GPU.
>> -                _: {
>> -                    gfw::wait_gfw_boot_completion(bar)
>> -                        .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
>> -                },
>> +        try_pin_init!(Self {
> 
> What, and now we undo what we just did in patch 4? 0_o; What was that
> all for?
> 
> I did a `git diff` between this step in the series and the state two
> steps above, and it seems to confirm my intuition on patch 4: you just
> need a few more `Copy` implementations. They can be added in this patch,
> and patch 4 dropped altogether.