[PATCH 11/16] gpu: nova-core: add falcon register definitions and base code

Alexandre Courbot posted 16 patches 9 months, 3 weeks ago
There is a newer version of this series
[PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Alexandre Courbot 9 months, 3 weeks ago
Add the common Falcon code and HAL for Ampere GPUs, and instantiate the
GSP and SEC2 Falcons that will be required to boot the GSP.

Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
---
 drivers/gpu/nova-core/falcon.rs           | 469 ++++++++++++++++++++++++++++++
 drivers/gpu/nova-core/falcon/gsp.rs       |  27 ++
 drivers/gpu/nova-core/falcon/hal.rs       |  54 ++++
 drivers/gpu/nova-core/falcon/hal/ga102.rs | 111 +++++++
 drivers/gpu/nova-core/falcon/sec2.rs      |   9 +
 drivers/gpu/nova-core/gpu.rs              |  16 +
 drivers/gpu/nova-core/nova_core.rs        |   1 +
 drivers/gpu/nova-core/regs.rs             | 189 ++++++++++++
 drivers/gpu/nova-core/timer.rs            |   3 -
 9 files changed, 876 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/nova-core/falcon.rs b/drivers/gpu/nova-core/falcon.rs
new file mode 100644
index 0000000000000000000000000000000000000000..71f374445ff3277eac628e183942c79f557366d5
--- /dev/null
+++ b/drivers/gpu/nova-core/falcon.rs
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Falcon microprocessor base support
+
+// To be removed when all code is used.
+#![allow(dead_code)]
+
+use core::hint::unreachable_unchecked;
+use core::time::Duration;
+use hal::FalconHal;
+use kernel::bindings;
+use kernel::devres::Devres;
+use kernel::sync::Arc;
+use kernel::{pci, prelude::*};
+
+use crate::driver::Bar0;
+use crate::gpu::Chipset;
+use crate::regs;
+use crate::timer::Timer;
+
+pub(crate) mod gsp;
+mod hal;
+pub(crate) mod sec2;
+
+#[repr(u8)]
+#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum FalconCoreRev {
+    #[default]
+    Rev1 = 1,
+    Rev2 = 2,
+    Rev3 = 3,
+    Rev4 = 4,
+    Rev5 = 5,
+    Rev6 = 6,
+    Rev7 = 7,
+}
+
+impl TryFrom<u32> for FalconCoreRev {
+    type Error = Error;
+
+    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
+        use FalconCoreRev::*;
+
+        let rev = match value {
+            1 => Rev1,
+            2 => Rev2,
+            3 => Rev3,
+            4 => Rev4,
+            5 => Rev5,
+            6 => Rev6,
+            7 => Rev7,
+            _ => return Err(EINVAL),
+        };
+
+        Ok(rev)
+    }
+}
+
+#[repr(u8)]
+#[derive(Debug, Default, Copy, Clone)]
+pub(crate) enum FalconSecurityModel {
+    #[default]
+    None = 0,
+    Light = 2,
+    Heavy = 3,
+}
+
+impl TryFrom<u32> for FalconSecurityModel {
+    type Error = Error;
+
+    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
+        use FalconSecurityModel::*;
+
+        let sec_model = match value {
+            0 => None,
+            2 => Light,
+            3 => Heavy,
+            _ => return Err(EINVAL),
+        };
+
+        Ok(sec_model)
+    }
+}
+
+#[repr(u8)]
+#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) enum FalconCoreRevSubversion {
+    #[default]
+    Subversion0 = 0,
+    Subversion1 = 1,
+    Subversion2 = 2,
+    Subversion3 = 3,
+}
+
+impl From<u32> for FalconCoreRevSubversion {
+    fn from(value: u32) -> Self {
+        use FalconCoreRevSubversion::*;
+
+        match value & 0b11 {
+            0 => Subversion0,
+            1 => Subversion1,
+            2 => Subversion2,
+            3 => Subversion3,
+            // SAFETY: the `0b11` mask limits the possible values to `0..=3`.
+            4..=u32::MAX => unsafe { unreachable_unchecked() },
+        }
+    }
+}
+
+#[repr(u8)]
+#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
+pub(crate) enum FalconModSelAlgo {
+    #[default]
+    Rsa3k = 1,
+}
+
+impl TryFrom<u32> for FalconModSelAlgo {
+    type Error = Error;
+
+    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
+        match value {
+            1 => Ok(FalconModSelAlgo::Rsa3k),
+            _ => Err(EINVAL),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum RiscvCoreSelect {
+    Falcon = 0,
+    Riscv = 1,
+}
+
+impl From<bool> for RiscvCoreSelect {
+    fn from(value: bool) -> Self {
+        match value {
+            false => RiscvCoreSelect::Falcon,
+            true => RiscvCoreSelect::Riscv,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum FalconMem {
+    Imem,
+    Dmem,
+}
+
+#[derive(Debug, Clone, Default)]
+pub(crate) enum FalconFbifTarget {
+    #[default]
+    LocalFb = 0,
+    CoherentSysmem = 1,
+    NoncoherentSysmem = 2,
+}
+
+impl TryFrom<u32> for FalconFbifTarget {
+    type Error = Error;
+
+    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
+        let res = match value {
+            0 => Self::LocalFb,
+            1 => Self::CoherentSysmem,
+            2 => Self::NoncoherentSysmem,
+            _ => return Err(EINVAL),
+        };
+
+        Ok(res)
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+pub(crate) enum FalconFbifMemType {
+    #[default]
+    Virtual = 0,
+    Physical = 1,
+}
+
+impl From<bool> for FalconFbifMemType {
+    fn from(value: bool) -> Self {
+        match value {
+            false => Self::Virtual,
+            true => Self::Physical,
+        }
+    }
+}
+
+/// Trait defining the parameters of a given Falcon instance.
+pub(crate) trait FalconEngine: Sync {
+    /// Base I/O address for the falcon, relative from which its registers are accessed.
+    const BASE: usize;
+}
+
+/// Represents a portion of the firmware to be loaded into a particular memory (e.g. IMEM or DMEM).
+#[derive(Debug)]
+pub(crate) struct FalconLoadTarget {
+    /// Offset from the start of the source object to copy from.
+    pub(crate) src_start: u32,
+    /// Offset from the start of the destination memory to copy into.
+    pub(crate) dst_start: u32,
+    /// Number of bytes to copy.
+    pub(crate) len: u32,
+}
+
+#[derive(Debug)]
+pub(crate) struct FalconBromParams {
+    pub(crate) pkc_data_offset: u32,
+    pub(crate) engine_id_mask: u16,
+    pub(crate) ucode_id: u8,
+}
+
+pub(crate) trait FalconFirmware {
+    type Target: FalconEngine;
+
+    /// Returns the DMA handle of the object containing the firmware.
+    fn dma_handle(&self) -> bindings::dma_addr_t;
+
+    /// Returns the load parameters for `IMEM`.
+    fn imem_load(&self) -> FalconLoadTarget;
+
+    /// Returns the load parameters for `DMEM`.
+    fn dmem_load(&self) -> FalconLoadTarget;
+
+    /// Returns the parameters to write into the BROM registers.
+    fn brom_params(&self) -> FalconBromParams;
+
+    /// Returns the start address of the firmware.
+    fn boot_addr(&self) -> u32;
+}
+
+/// Contains the base parameters common to all Falcon instances.
+pub(crate) struct Falcon<E: FalconEngine> {
+    pub hal: Arc<dyn FalconHal<E>>,
+}
+
+impl<E: FalconEngine + 'static> Falcon<E> {
+    pub(crate) fn new(
+        pdev: &pci::Device,
+        chipset: Chipset,
+        bar: &Devres<Bar0>,
+        need_riscv: bool,
+    ) -> Result<Self> {
+        let hwcfg1 = with_bar!(bar, |b| regs::FalconHwcfg1::read(b, E::BASE))?;
+        // Ensure that the revision and security model contain valid values.
+        let _rev = hwcfg1.core_rev()?;
+        let _sec_model = hwcfg1.security_model()?;
+
+        if need_riscv {
+            let hwcfg2 = with_bar!(bar, |b| regs::FalconHwcfg2::read(b, E::BASE))?;
+            if !hwcfg2.riscv() {
+                dev_err!(
+                    pdev.as_ref(),
+                    "riscv support requested on falcon that does not support it\n"
+                );
+                return Err(EINVAL);
+            }
+        }
+
+        Ok(Self {
+            hal: hal::create_falcon_hal(chipset)?,
+        })
+    }
+
+    fn reset_wait_mem_scrubbing(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
+        timer.wait_on(bar, Duration::from_millis(20), || {
+            bar.try_access_with(|b| regs::FalconHwcfg2::read(b, E::BASE))
+                .and_then(|r| if r.mem_scrubbing() { Some(()) } else { None })
+        })
+    }
+
+    fn reset_eng(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
+        let _ = with_bar!(bar, |b| regs::FalconHwcfg2::read(b, E::BASE))?;
+
+        // According to OpenRM's `kflcnPreResetWait_GA102` documentation, HW sometimes does not set
+        // RESET_READY so a non-failing timeout is used.
+        let _ = timer.wait_on(bar, Duration::from_micros(150), || {
+            bar.try_access_with(|b| regs::FalconHwcfg2::read(b, E::BASE))
+                .and_then(|r| if r.reset_ready() { Some(()) } else { None })
+        });
+
+        with_bar!(bar, |b| regs::FalconEngine::alter(b, E::BASE, |v| v
+            .set_reset(true)))?;
+
+        let _: Result<()> = timer.wait_on(bar, Duration::from_micros(10), || None);
+
+        with_bar!(bar, |b| regs::FalconEngine::alter(b, E::BASE, |v| v
+            .set_reset(false)))?;
+
+        self.reset_wait_mem_scrubbing(bar, timer)?;
+
+        Ok(())
+    }
+
+    pub(crate) fn reset(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
+        self.reset_eng(bar, timer)?;
+        self.hal.select_core(bar, timer)?;
+        self.reset_wait_mem_scrubbing(bar, timer)?;
+
+        with_bar!(bar, |b| {
+            regs::FalconRm::default()
+                .set_val(regs::Boot0::read(b).into())
+                .write(b, E::BASE)
+        })
+    }
+
+    fn dma_wr(
+        &self,
+        bar: &Devres<Bar0>,
+        timer: &Timer,
+        dma_handle: bindings::dma_addr_t,
+        target_mem: FalconMem,
+        load_offsets: FalconLoadTarget,
+        sec: bool,
+    ) -> Result<()> {
+        const DMA_LEN: u32 = 256;
+        const DMA_LEN_ILOG2_MINUS2: u8 = (DMA_LEN.ilog2() - 2) as u8;
+
+        // For IMEM, we want to use the start offset as a virtual address tag for each page, since
+        // code addresses in the firmware (and the boot vector) are virtual.
+        //
+        // For DMEM we can fold the start offset into the DMA handle.
+        let (src_start, dma_start) = match target_mem {
+            FalconMem::Imem => (load_offsets.src_start, dma_handle),
+            FalconMem::Dmem => (
+                0,
+                dma_handle + load_offsets.src_start as bindings::dma_addr_t,
+            ),
+        };
+        if dma_start % DMA_LEN as bindings::dma_addr_t > 0 {
+            pr_err!(
+                "DMA transfer start addresses must be a multiple of {}",
+                DMA_LEN
+            );
+            return Err(EINVAL);
+        }
+        if load_offsets.len % DMA_LEN > 0 {
+            pr_err!("DMA transfer length must be a multiple of {}", DMA_LEN);
+            return Err(EINVAL);
+        }
+
+        // Set up the base source DMA address.
+        with_bar!(bar, |b| {
+            regs::FalconDmaTrfBase::default()
+                .set_base((dma_start >> 8) as u32)
+                .write(b, E::BASE);
+            regs::FalconDmaTrfBase1::default()
+                .set_base((dma_start >> 40) as u16)
+                .write(b, E::BASE)
+        })?;
+
+        let cmd = regs::FalconDmaTrfCmd::default()
+            .set_size(DMA_LEN_ILOG2_MINUS2)
+            .set_imem(target_mem == FalconMem::Imem)
+            .set_sec(if sec { 1 } else { 0 });
+
+        for pos in (0..load_offsets.len).step_by(DMA_LEN as usize) {
+            // Perform a transfer of size `DMA_LEN`.
+            with_bar!(bar, |b| {
+                regs::FalconDmaTrfMOffs::default()
+                    .set_offs(load_offsets.dst_start + pos)
+                    .write(b, E::BASE);
+                regs::FalconDmaTrfBOffs::default()
+                    .set_offs(src_start + pos)
+                    .write(b, E::BASE);
+                cmd.write(b, E::BASE)
+            })?;
+
+            // Wait for the transfer to complete.
+            timer.wait_on(bar, Duration::from_millis(2000), || {
+                bar.try_access_with(|b| regs::FalconDmaTrfCmd::read(b, E::BASE))
+                    .and_then(|v| if v.idle() { Some(()) } else { None })
+            })?;
+        }
+
+        Ok(())
+    }
+
+    pub(crate) fn dma_load<F: FalconFirmware<Target = E>>(
+        &self,
+        bar: &Devres<Bar0>,
+        timer: &Timer,
+        fw: &F,
+    ) -> Result<()> {
+        let dma_handle = fw.dma_handle();
+
+        with_bar!(bar, |b| {
+            regs::FalconFbifCtl::alter(b, E::BASE, |v| v.set_allow_phys_no_ctx(true));
+            regs::FalconDmaCtl::default().write(b, E::BASE);
+            regs::FalconFbifTranscfg::alter(b, E::BASE, |v| {
+                v.set_target(FalconFbifTarget::CoherentSysmem)
+                    .set_mem_type(FalconFbifMemType::Physical)
+            });
+        })?;
+
+        self.dma_wr(
+            bar,
+            timer,
+            dma_handle,
+            FalconMem::Imem,
+            fw.imem_load(),
+            true,
+        )?;
+        self.dma_wr(
+            bar,
+            timer,
+            dma_handle,
+            FalconMem::Dmem,
+            fw.dmem_load(),
+            true,
+        )?;
+
+        self.hal.program_brom(bar, &fw.brom_params())?;
+
+        with_bar!(bar, |b| {
+            // Set `BootVec` to start of non-secure code.
+            regs::FalconBootVec::default()
+                .set_boot_vec(fw.boot_addr())
+                .write(b, E::BASE);
+        })?;
+
+        Ok(())
+    }
+
+    pub(crate) fn boot(
+        &self,
+        bar: &Devres<Bar0>,
+        timer: &Timer,
+        mbox0: Option<u32>,
+        mbox1: Option<u32>,
+    ) -> Result<(u32, u32)> {
+        with_bar!(bar, |b| {
+            if let Some(mbox0) = mbox0 {
+                regs::FalconMailbox0::default()
+                    .set_mailbox0(mbox0)
+                    .write(b, E::BASE);
+            }
+
+            if let Some(mbox1) = mbox1 {
+                regs::FalconMailbox1::default()
+                    .set_mailbox1(mbox1)
+                    .write(b, E::BASE);
+            }
+
+            match regs::FalconCpuCtl::read(b, E::BASE).alias_en() {
+                true => regs::FalconCpuCtlAlias::default()
+                    .set_start_cpu(true)
+                    .write(b, E::BASE),
+                false => regs::FalconCpuCtl::default()
+                    .set_start_cpu(true)
+                    .write(b, E::BASE),
+            }
+        })?;
+
+        timer.wait_on(bar, Duration::from_secs(2), || {
+            bar.try_access()
+                .map(|b| regs::FalconCpuCtl::read(&*b, E::BASE))
+                .and_then(|v| if v.halted() { Some(()) } else { None })
+        })?;
+
+        let (mbox0, mbox1) = with_bar!(bar, |b| {
+            let mbox0 = regs::FalconMailbox0::read(b, E::BASE).mailbox0();
+            let mbox1 = regs::FalconMailbox1::read(b, E::BASE).mailbox1();
+
+            (mbox0, mbox1)
+        })?;
+
+        Ok((mbox0, mbox1))
+    }
+}
diff --git a/drivers/gpu/nova-core/falcon/gsp.rs b/drivers/gpu/nova-core/falcon/gsp.rs
new file mode 100644
index 0000000000000000000000000000000000000000..44b8dc118eda1263eaede466efd55408c6e7cded
--- /dev/null
+++ b/drivers/gpu/nova-core/falcon/gsp.rs
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use kernel::devres::Devres;
+use kernel::prelude::*;
+
+use crate::{
+    driver::Bar0,
+    falcon::{Falcon, FalconEngine},
+    regs,
+};
+
+pub(crate) struct Gsp;
+impl FalconEngine for Gsp {
+    const BASE: usize = 0x00110000;
+}
+
+pub(crate) type GspFalcon = Falcon<Gsp>;
+
+impl Falcon<Gsp> {
+    /// Clears the SWGEN0 bit in the Falcon's IRQ status clear register to
+    /// allow GSP to signal CPU for processing new messages in message queue.
+    pub(crate) fn clear_swgen0_intr(&self, bar: &Devres<Bar0>) -> Result<()> {
+        with_bar!(bar, |b| regs::FalconIrqsclr::default()
+            .set_swgen0(true)
+            .write(b, Gsp::BASE))
+    }
+}
diff --git a/drivers/gpu/nova-core/falcon/hal.rs b/drivers/gpu/nova-core/falcon/hal.rs
new file mode 100644
index 0000000000000000000000000000000000000000..5ebf4e88f1f25a13cf47859a53507be53e795d34
--- /dev/null
+++ b/drivers/gpu/nova-core/falcon/hal.rs
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use kernel::devres::Devres;
+use kernel::prelude::*;
+use kernel::sync::Arc;
+
+use crate::driver::Bar0;
+use crate::falcon::{FalconBromParams, FalconEngine};
+use crate::gpu::Chipset;
+use crate::timer::Timer;
+
+mod ga102;
+
+/// Hardware Abstraction Layer for Falcon cores.
+///
+/// Implements chipset-specific low-level operations. The trait is generic against [`FalconEngine`]
+/// so its `BASE` parameter can be used in order to avoid runtime bound checks when accessing
+/// registers.
+pub(crate) trait FalconHal<E: FalconEngine>: Sync {
+    // Activates the Falcon core if the engine is a risvc/falcon dual engine.
+    fn select_core(&self, _bar: &Devres<Bar0>, _timer: &Timer) -> Result<()> {
+        Ok(())
+    }
+
+    fn get_signature_reg_fuse_version(
+        &self,
+        bar: &Devres<Bar0>,
+        engine_id_mask: u16,
+        ucode_id: u8,
+    ) -> Result<u32>;
+
+    // Program the BROM registers prior to starting a secure firmware.
+    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()>;
+}
+
+/// Returns a boxed falcon HAL adequate for the passed `chipset`.
+///
+/// We use this function and a heap-allocated trait object instead of statically defined trait
+/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
+/// requested HAL.
+///
+/// TODO: replace the return type with `KBox` once it gains the ability to host trait objects.
+pub(crate) fn create_falcon_hal<E: FalconEngine + 'static>(
+    chipset: Chipset,
+) -> Result<Arc<dyn FalconHal<E>>> {
+    let hal = match chipset {
+        Chipset::GA102 | Chipset::GA103 | Chipset::GA104 | Chipset::GA106 | Chipset::GA107 => {
+            Arc::new(ga102::Ga102::<E>::new(), GFP_KERNEL)? as Arc<dyn FalconHal<E>>
+        }
+        _ => return Err(ENOTSUPP),
+    };
+
+    Ok(hal)
+}
diff --git a/drivers/gpu/nova-core/falcon/hal/ga102.rs b/drivers/gpu/nova-core/falcon/hal/ga102.rs
new file mode 100644
index 0000000000000000000000000000000000000000..747b02ca671f7d4a97142665a9ba64807c87391e
--- /dev/null
+++ b/drivers/gpu/nova-core/falcon/hal/ga102.rs
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use core::marker::PhantomData;
+use core::time::Duration;
+
+use kernel::devres::Devres;
+use kernel::prelude::*;
+
+use crate::driver::Bar0;
+use crate::falcon::{FalconBromParams, FalconEngine, FalconModSelAlgo, RiscvCoreSelect};
+use crate::regs;
+use crate::timer::Timer;
+
+use super::FalconHal;
+
+fn select_core_ga102<E: FalconEngine>(bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
+    let bcr_ctrl = with_bar!(bar, |b| regs::RiscvBcrCtrl::read(b, E::BASE))?;
+    if bcr_ctrl.core_select() != RiscvCoreSelect::Falcon {
+        with_bar!(bar, |b| regs::RiscvBcrCtrl::default()
+            .set_core_select(RiscvCoreSelect::Falcon)
+            .write(b, E::BASE))?;
+
+        timer.wait_on(bar, Duration::from_millis(10), || {
+            bar.try_access_with(|b| regs::RiscvBcrCtrl::read(b, E::BASE))
+                .and_then(|v| if v.valid() { Some(()) } else { None })
+        })?;
+    }
+
+    Ok(())
+}
+
+fn get_signature_reg_fuse_version_ga102(
+    bar: &Devres<Bar0>,
+    engine_id_mask: u16,
+    ucode_id: u8,
+) -> Result<u32> {
+    // The ucode fuse versions are contained in the FUSE_OPT_FPF_<ENGINE>_UCODE<X>_VERSION
+    // registers, which are an array. Our register definition macros do not allow us to manage them
+    // properly, so we need to hardcode their addresses for now.
+
+    // Each engine has 16 ucode version registers numbered from 1 to 16.
+    if ucode_id == 0 || ucode_id > 16 {
+        pr_warn!("invalid ucode id {:#x}", ucode_id);
+        return Err(EINVAL);
+    }
+    let reg_fuse = if engine_id_mask & 0x0001 != 0 {
+        // NV_FUSE_OPT_FPF_SEC2_UCODE1_VERSION
+        0x824140
+    } else if engine_id_mask & 0x0004 != 0 {
+        // NV_FUSE_OPT_FPF_NVDEC_UCODE1_VERSION
+        0x824100
+    } else if engine_id_mask & 0x0400 != 0 {
+        // NV_FUSE_OPT_FPF_GSP_UCODE1_VERSION
+        0x8241c0
+    } else {
+        pr_warn!("unexpected engine_id_mask {:#x}", engine_id_mask);
+        return Err(EINVAL);
+    } + ((ucode_id - 1) as usize * core::mem::size_of::<u32>());
+
+    let reg_fuse_version = with_bar!(bar, |b| { b.read32(reg_fuse) })?;
+
+    // Equivalent of Find Last Set bit.
+    Ok(u32::BITS - reg_fuse_version.leading_zeros())
+}
+
+fn program_brom_ga102<E: FalconEngine>(
+    bar: &Devres<Bar0>,
+    params: &FalconBromParams,
+) -> Result<()> {
+    with_bar!(bar, |b| {
+        regs::FalconBromParaaddr0::default()
+            .set_addr(params.pkc_data_offset)
+            .write(b, E::BASE);
+        regs::FalconBromEngidmask::default()
+            .set_mask(params.engine_id_mask as u32)
+            .write(b, E::BASE);
+        regs::FalconBromCurrUcodeId::default()
+            .set_ucode_id(params.ucode_id as u32)
+            .write(b, E::BASE);
+        regs::FalconModSel::default()
+            .set_algo(FalconModSelAlgo::Rsa3k)
+            .write(b, E::BASE)
+    })
+}
+
+pub(super) struct Ga102<E: FalconEngine>(PhantomData<E>);
+
+impl<E: FalconEngine> Ga102<E> {
+    pub(super) fn new() -> Self {
+        Self(PhantomData)
+    }
+}
+
+impl<E: FalconEngine> FalconHal<E> for Ga102<E> {
+    fn select_core(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
+        select_core_ga102::<E>(bar, timer)
+    }
+
+    fn get_signature_reg_fuse_version(
+        &self,
+        bar: &Devres<Bar0>,
+        engine_id_mask: u16,
+        ucode_id: u8,
+    ) -> Result<u32> {
+        get_signature_reg_fuse_version_ga102(bar, engine_id_mask, ucode_id)
+    }
+
+    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()> {
+        program_brom_ga102::<E>(bar, params)
+    }
+}
diff --git a/drivers/gpu/nova-core/falcon/sec2.rs b/drivers/gpu/nova-core/falcon/sec2.rs
new file mode 100644
index 0000000000000000000000000000000000000000..85dda3e8380a3d31d34c92c4236c6f81c63ce772
--- /dev/null
+++ b/drivers/gpu/nova-core/falcon/sec2.rs
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use crate::falcon::{Falcon, FalconEngine};
+
+pub(crate) struct Sec2;
+impl FalconEngine for Sec2 {
+    const BASE: usize = 0x00840000;
+}
+pub(crate) type Sec2Falcon = Falcon<Sec2>;
diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index 1b3e43e0412e2a2ea178c7404ea647c9e38d4e04..ec4c648c6e8b4aa7d06c627ed59c0e66a08c679e 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -5,6 +5,8 @@
 use crate::devinit;
 use crate::dma::DmaObject;
 use crate::driver::Bar0;
+use crate::falcon::gsp::GspFalcon;
+use crate::falcon::sec2::Sec2Falcon;
 use crate::firmware::Firmware;
 use crate::regs;
 use crate::timer::Timer;
@@ -221,6 +223,20 @@ pub(crate) fn new(
 
         let timer = Timer::new();
 
+        let gsp_falcon = GspFalcon::new(
+            pdev,
+            spec.chipset,
+            &bar,
+            if spec.chipset > Chipset::GA100 {
+                true
+            } else {
+                false
+            },
+        )?;
+        gsp_falcon.clear_swgen0_intr(&bar)?;
+
+        let _sec2_falcon = Sec2Falcon::new(pdev, spec.chipset, &bar, true)?;
+
         Ok(pin_init!(Self {
             spec,
             bar,
diff --git a/drivers/gpu/nova-core/nova_core.rs b/drivers/gpu/nova-core/nova_core.rs
index df3468c92c6081b3e2db218d92fbe1c40a0a75c3..4dde8004d24882c60669b5acd6af9d6988c66a9c 100644
--- a/drivers/gpu/nova-core/nova_core.rs
+++ b/drivers/gpu/nova-core/nova_core.rs
@@ -23,6 +23,7 @@ macro_rules! with_bar {
 mod devinit;
 mod dma;
 mod driver;
+mod falcon;
 mod firmware;
 mod gpu;
 mod regs;
diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs
index f191cf4eb44c2b950e5cfcc6d04f95c122ce29d3..c76a16dc8e7267a4eb54cb71e1cca6fb9e00188f 100644
--- a/drivers/gpu/nova-core/regs.rs
+++ b/drivers/gpu/nova-core/regs.rs
@@ -6,6 +6,10 @@
 #[macro_use]
 mod macros;
 
+use crate::falcon::{
+    FalconCoreRev, FalconCoreRevSubversion, FalconFbifMemType, FalconFbifTarget, FalconModSelAlgo,
+    FalconSecurityModel, RiscvCoreSelect,
+};
 use crate::gpu::Chipset;
 
 register!(Boot0@0x00000000, "Basic revision information about the GPU";
@@ -44,3 +48,188 @@
 register!(Pgc6AonSecureScratchGroup05@0x00118234;
     31:0    value => as u32
 );
+
+/* PFALCON */
+
+register!(FalconIrqsclr@+0x00000004;
+    4:4     halt => as_bit bool;
+    6:6     swgen0 => as_bit bool;
+);
+
+register!(FalconIrqstat@+0x00000008;
+    4:4     halt => as_bit bool;
+    6:6     swgen0 => as_bit bool;
+);
+
+register!(FalconIrqmclr@+0x00000014;
+    31:0    val => as u32
+);
+
+register!(FalconIrqmask@+0x00000018;
+    31:0    val => as u32
+);
+
+register!(FalconRm@+0x00000084;
+    31:0    val => as u32
+);
+
+register!(FalconIrqdest@+0x0000001c;
+    31:0    val => as u32
+);
+
+register!(FalconMailbox0@+0x00000040;
+    31:0    mailbox0 => as u32
+);
+register!(FalconMailbox1@+0x00000044;
+    31:0    mailbox1 => as u32
+);
+
+register!(FalconHwcfg2@+0x000000f4;
+    10:10   riscv => as_bit bool;
+    12:12   mem_scrubbing => as_bit bool;
+    31:31   reset_ready => as_bit bool;
+);
+
+register!(FalconCpuCtl@+0x00000100;
+    1:1     start_cpu => as_bit bool;
+    4:4     halted => as_bit bool;
+    6:6     alias_en => as_bit bool;
+);
+
+register!(FalconBootVec@+0x00000104;
+    31:0    boot_vec => as u32
+);
+
+register!(FalconHwCfg@+0x00000108;
+    8:0     imem_size => as u32;
+    17:9    dmem_size => as u32;
+);
+
+register!(FalconDmaCtl@+0x0000010c;
+    0:0     require_ctx => as_bit bool;
+    1:1     dmem_scrubbing  => as_bit bool;
+    2:2     imem_scrubbing => as_bit bool;
+    6:3     dmaq_num => as_bit u8;
+    7:7     secure_stat => as_bit bool;
+);
+
+register!(FalconDmaTrfBase@+0x00000110;
+    31:0    base => as u32;
+);
+
+register!(FalconDmaTrfMOffs@+0x00000114;
+    23:0    offs => as u32;
+);
+
+register!(FalconDmaTrfCmd@+0x00000118;
+    0:0     full => as_bit bool;
+    1:1     idle => as_bit bool;
+    3:2     sec => as_bit u8;
+    4:4     imem => as_bit bool;
+    5:5     is_write => as_bit bool;
+    10:8    size => as u8;
+    14:12   ctxdma => as u8;
+    16:16   set_dmtag => as u8;
+);
+
+register!(FalconDmaTrfBOffs@+0x0000011c;
+    31:0    offs => as u32;
+);
+
+register!(FalconDmaTrfBase1@+0x00000128;
+    8:0     base => as u16;
+);
+
+register!(FalconHwcfg1@+0x0000012c;
+    3:0     core_rev => try_into FalconCoreRev, "core revision of the falcon";
+    5:4     security_model => try_into FalconSecurityModel, "security model of the falcon";
+    7:6     core_rev_subversion => into FalconCoreRevSubversion;
+    11:8    imem_ports => as u8;
+    15:12   dmem_ports => as u8;
+);
+
+register!(FalconCpuCtlAlias@+0x00000130;
+    1:1     start_cpu => as_bit bool;
+);
+
+/* TODO: this is an array of registers */
+register!(FalconImemC@+0x00000180;
+    7:2     offs => as u8;
+    23:8    blk => as u8;
+    24:24   aincw => as_bit bool;
+    25:25   aincr => as_bit bool;
+    28:28   secure => as_bit bool;
+    29:29   sec_atomic => as_bit bool;
+);
+
+register!(FalconImemD@+0x00000184;
+    31:0    data => as u32;
+);
+
+register!(FalconImemT@+0x00000188;
+    15:0    data => as u16;
+);
+
+register!(FalconDmemC@+0x000001c0;
+    7:2     offs => as u8;
+    23:0    addr => as u32;
+    23:8    blk => as u8;
+    24:24   aincw => as_bit bool;
+    25:25   aincr => as_bit bool;
+    26:26   settag => as_bit bool;
+    27:27   setlvl => as_bit bool;
+    28:28   va => as_bit bool;
+    29:29   miss => as_bit bool;
+);
+
+register!(FalconDmemD@+0x000001c4;
+    31:0    data => as u32;
+);
+
+register!(FalconModSel@+0x00001180;
+    7:0     algo => try_into FalconModSelAlgo;
+);
+register!(FalconBromCurrUcodeId@+0x00001198;
+    31:0    ucode_id => as u32;
+);
+register!(FalconBromEngidmask@+0x0000119c;
+    31:0    mask => as u32;
+);
+register!(FalconBromParaaddr0@+0x00001210;
+    31:0    addr => as u32;
+);
+
+register!(RiscvCpuctl@+0x00000388;
+    0:0     startcpu => as_bit bool;
+    4:4     halted => as_bit bool;
+    5:5     stopped => as_bit bool;
+    7:7     active_stat => as_bit bool;
+);
+
+register!(FalconEngine@+0x000003c0;
+    0:0     reset => as_bit bool;
+);
+
+register!(RiscvIrqmask@+0x00000528;
+    31:0    mask => as u32;
+);
+
+register!(RiscvIrqdest@+0x0000052c;
+    31:0    dest => as u32;
+);
+
+/* TODO: this is an array of registers */
+register!(FalconFbifTranscfg@+0x00000600;
+    1:0     target => try_into FalconFbifTarget;
+    2:2     mem_type => as_bit FalconFbifMemType;
+);
+
+register!(FalconFbifCtl@+0x00000624;
+    7:7     allow_phys_no_ctx => as_bit bool;
+);
+
+register!(RiscvBcrCtrl@+0x00001668;
+    0:0     valid => as_bit bool;
+    4:4     core_select => as_bit RiscvCoreSelect;
+    8:8     br_fetch => as_bit bool;
+);
diff --git a/drivers/gpu/nova-core/timer.rs b/drivers/gpu/nova-core/timer.rs
index 8987352f4192bc9b4b2fc0fb5f2e8e62ff27be68..c03a5c36d1230dfbf2bd6e02a793264280c6d509 100644
--- a/drivers/gpu/nova-core/timer.rs
+++ b/drivers/gpu/nova-core/timer.rs
@@ -2,9 +2,6 @@
 
 //! Nova Core Timer subdevice
 
-// To be removed when all code is used.
-#![allow(dead_code)]
-
 use core::fmt::Display;
 use core::ops::{Add, Sub};
 use core::time::Duration;

-- 
2.49.0
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Danilo Krummrich 9 months, 3 weeks ago
This patch could probably split up a bit, to make it more pleasant to review. :)

On Sun, Apr 20, 2025 at 09:19:43PM +0900, Alexandre Courbot wrote:
> 
> +#[repr(u8)]
> +#[derive(Debug, Default, Copy, Clone)]
> +pub(crate) enum FalconSecurityModel {
> +    #[default]
> +    None = 0,
> +    Light = 2,
> +    Heavy = 3,
> +}

Please add an explanation for the different security modules. Where are the
differences?

I think most of the structures, registers, abbreviations, etc. introduced in
this patch need some documentation.

Please see https://docs.kernel.org/gpu/nova/guidelines.html#documentation.

> +
> +impl TryFrom<u32> for FalconSecurityModel {
> +    type Error = Error;
> +
> +    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
> +        use FalconSecurityModel::*;
> +
> +        let sec_model = match value {
> +            0 => None,
> +            2 => Light,
> +            3 => Heavy,
> +            _ => return Err(EINVAL),
> +        };
> +
> +        Ok(sec_model)
> +    }
> +}
> +
> +#[repr(u8)]
> +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
> +pub(crate) enum FalconCoreRevSubversion {
> +    #[default]
> +    Subversion0 = 0,
> +    Subversion1 = 1,
> +    Subversion2 = 2,
> +    Subversion3 = 3,
> +}
> +
> +impl From<u32> for FalconCoreRevSubversion {
> +    fn from(value: u32) -> Self {
> +        use FalconCoreRevSubversion::*;
> +
> +        match value & 0b11 {
> +            0 => Subversion0,
> +            1 => Subversion1,
> +            2 => Subversion2,
> +            3 => Subversion3,
> +            // SAFETY: the `0b11` mask limits the possible values to `0..=3`.
> +            4..=u32::MAX => unsafe { unreachable_unchecked() },
> +        }

FalconCoreRev uses TryFrom to avoid unsafe code, I think FalconCoreRevSubversion
should do the same thing.

> +/// Trait defining the parameters of a given Falcon instance.
> +pub(crate) trait FalconEngine: Sync {
> +    /// Base I/O address for the falcon, relative from which its registers are accessed.
> +    const BASE: usize;
> +}
> +
> +/// Represents a portion of the firmware to be loaded into a particular memory (e.g. IMEM or DMEM).
> +#[derive(Debug)]
> +pub(crate) struct FalconLoadTarget {
> +    /// Offset from the start of the source object to copy from.
> +    pub(crate) src_start: u32,
> +    /// Offset from the start of the destination memory to copy into.
> +    pub(crate) dst_start: u32,
> +    /// Number of bytes to copy.
> +    pub(crate) len: u32,
> +}
> +
> +#[derive(Debug)]
> +pub(crate) struct FalconBromParams {
> +    pub(crate) pkc_data_offset: u32,
> +    pub(crate) engine_id_mask: u16,
> +    pub(crate) ucode_id: u8,
> +}
> +
> +pub(crate) trait FalconFirmware {
> +    type Target: FalconEngine;
> +
> +    /// Returns the DMA handle of the object containing the firmware.
> +    fn dma_handle(&self) -> bindings::dma_addr_t;
> +
> +    /// Returns the load parameters for `IMEM`.
> +    fn imem_load(&self) -> FalconLoadTarget;
> +
> +    /// Returns the load parameters for `DMEM`.
> +    fn dmem_load(&self) -> FalconLoadTarget;
> +
> +    /// Returns the parameters to write into the BROM registers.
> +    fn brom_params(&self) -> FalconBromParams;
> +
> +    /// Returns the start address of the firmware.
> +    fn boot_addr(&self) -> u32;
> +}
> +
> +/// Contains the base parameters common to all Falcon instances.
> +pub(crate) struct Falcon<E: FalconEngine> {
> +    pub hal: Arc<dyn FalconHal<E>>,

This should probably be private and instead should be exposed via Deref.

Also, please see my comment at create_falcon_hal() regarding the dynamic
dispatch.

> +}
> +
> +impl<E: FalconEngine + 'static> Falcon<E> {
> +    pub(crate) fn new(
> +        pdev: &pci::Device,
> +        chipset: Chipset,
> +        bar: &Devres<Bar0>,
> +        need_riscv: bool,
> +    ) -> Result<Self> {
> +        let hwcfg1 = with_bar!(bar, |b| regs::FalconHwcfg1::read(b, E::BASE))?;
> +        // Ensure that the revision and security model contain valid values.
> +        let _rev = hwcfg1.core_rev()?;
> +        let _sec_model = hwcfg1.security_model()?;
> +
> +        if need_riscv {
> +            let hwcfg2 = with_bar!(bar, |b| regs::FalconHwcfg2::read(b, E::BASE))?;
> +            if !hwcfg2.riscv() {
> +                dev_err!(
> +                    pdev.as_ref(),
> +                    "riscv support requested on falcon that does not support it\n"
> +                );
> +                return Err(EINVAL);
> +            }
> +        }
> +
> +        Ok(Self {
> +            hal: hal::create_falcon_hal(chipset)?,

I'd prefer to move the contents of create_falcon_hal() into this constructor.

> +        })
> +    }
> +
> +    fn reset_wait_mem_scrubbing(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
> +        timer.wait_on(bar, Duration::from_millis(20), || {
> +            bar.try_access_with(|b| regs::FalconHwcfg2::read(b, E::BASE))
> +                .and_then(|r| if r.mem_scrubbing() { Some(()) } else { None })
> +        })
> +    }
> +
> +    fn reset_eng(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
> +        let _ = with_bar!(bar, |b| regs::FalconHwcfg2::read(b, E::BASE))?;
> +
> +        // According to OpenRM's `kflcnPreResetWait_GA102` documentation, HW sometimes does not set
> +        // RESET_READY so a non-failing timeout is used.
> +        let _ = timer.wait_on(bar, Duration::from_micros(150), || {
> +            bar.try_access_with(|b| regs::FalconHwcfg2::read(b, E::BASE))
> +                .and_then(|r| if r.reset_ready() { Some(()) } else { None })
> +        });
> +
> +        with_bar!(bar, |b| regs::FalconEngine::alter(b, E::BASE, |v| v
> +            .set_reset(true)))?;
> +
> +        let _: Result<()> = timer.wait_on(bar, Duration::from_micros(10), || None);
> +
> +        with_bar!(bar, |b| regs::FalconEngine::alter(b, E::BASE, |v| v
> +            .set_reset(false)))?;
> +
> +        self.reset_wait_mem_scrubbing(bar, timer)?;
> +
> +        Ok(())
> +    }
> +
> +    pub(crate) fn reset(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
> +        self.reset_eng(bar, timer)?;
> +        self.hal.select_core(bar, timer)?;
> +        self.reset_wait_mem_scrubbing(bar, timer)?;
> +
> +        with_bar!(bar, |b| {
> +            regs::FalconRm::default()
> +                .set_val(regs::Boot0::read(b).into())
> +                .write(b, E::BASE)
> +        })
> +    }
> +
> +    fn dma_wr(
> +        &self,
> +        bar: &Devres<Bar0>,
> +        timer: &Timer,
> +        dma_handle: bindings::dma_addr_t,
> +        target_mem: FalconMem,
> +        load_offsets: FalconLoadTarget,
> +        sec: bool,
> +    ) -> Result<()> {
> +        const DMA_LEN: u32 = 256;
> +        const DMA_LEN_ILOG2_MINUS2: u8 = (DMA_LEN.ilog2() - 2) as u8;
> +
> +        // For IMEM, we want to use the start offset as a virtual address tag for each page, since
> +        // code addresses in the firmware (and the boot vector) are virtual.
> +        //
> +        // For DMEM we can fold the start offset into the DMA handle.
> +        let (src_start, dma_start) = match target_mem {
> +            FalconMem::Imem => (load_offsets.src_start, dma_handle),
> +            FalconMem::Dmem => (
> +                0,
> +                dma_handle + load_offsets.src_start as bindings::dma_addr_t,
> +            ),
> +        };
> +        if dma_start % DMA_LEN as bindings::dma_addr_t > 0 {
> +            pr_err!(
> +                "DMA transfer start addresses must be a multiple of {}",
> +                DMA_LEN
> +            );
> +            return Err(EINVAL);
> +        }
> +        if load_offsets.len % DMA_LEN > 0 {
> +            pr_err!("DMA transfer length must be a multiple of {}", DMA_LEN);
> +            return Err(EINVAL);
> +        }
> +
> +        // Set up the base source DMA address.
> +        with_bar!(bar, |b| {
> +            regs::FalconDmaTrfBase::default()
> +                .set_base((dma_start >> 8) as u32)
> +                .write(b, E::BASE);
> +            regs::FalconDmaTrfBase1::default()
> +                .set_base((dma_start >> 40) as u16)
> +                .write(b, E::BASE)
> +        })?;
> +
> +        let cmd = regs::FalconDmaTrfCmd::default()
> +            .set_size(DMA_LEN_ILOG2_MINUS2)
> +            .set_imem(target_mem == FalconMem::Imem)
> +            .set_sec(if sec { 1 } else { 0 });
> +
> +        for pos in (0..load_offsets.len).step_by(DMA_LEN as usize) {
> +            // Perform a transfer of size `DMA_LEN`.
> +            with_bar!(bar, |b| {
> +                regs::FalconDmaTrfMOffs::default()
> +                    .set_offs(load_offsets.dst_start + pos)
> +                    .write(b, E::BASE);
> +                regs::FalconDmaTrfBOffs::default()
> +                    .set_offs(src_start + pos)
> +                    .write(b, E::BASE);
> +                cmd.write(b, E::BASE)
> +            })?;
> +
> +            // Wait for the transfer to complete.
> +            timer.wait_on(bar, Duration::from_millis(2000), || {
> +                bar.try_access_with(|b| regs::FalconDmaTrfCmd::read(b, E::BASE))
> +                    .and_then(|v| if v.idle() { Some(()) } else { None })
> +            })?;
> +        }
> +
> +        Ok(())
> +    }
> +
> +    pub(crate) fn dma_load<F: FalconFirmware<Target = E>>(
> +        &self,
> +        bar: &Devres<Bar0>,
> +        timer: &Timer,
> +        fw: &F,
> +    ) -> Result<()> {
> +        let dma_handle = fw.dma_handle();
> +
> +        with_bar!(bar, |b| {
> +            regs::FalconFbifCtl::alter(b, E::BASE, |v| v.set_allow_phys_no_ctx(true));
> +            regs::FalconDmaCtl::default().write(b, E::BASE);
> +            regs::FalconFbifTranscfg::alter(b, E::BASE, |v| {
> +                v.set_target(FalconFbifTarget::CoherentSysmem)
> +                    .set_mem_type(FalconFbifMemType::Physical)
> +            });
> +        })?;
> +
> +        self.dma_wr(
> +            bar,
> +            timer,
> +            dma_handle,
> +            FalconMem::Imem,
> +            fw.imem_load(),
> +            true,
> +        )?;
> +        self.dma_wr(
> +            bar,
> +            timer,
> +            dma_handle,
> +            FalconMem::Dmem,
> +            fw.dmem_load(),
> +            true,
> +        )?;
> +
> +        self.hal.program_brom(bar, &fw.brom_params())?;
> +
> +        with_bar!(bar, |b| {
> +            // Set `BootVec` to start of non-secure code.
> +            regs::FalconBootVec::default()
> +                .set_boot_vec(fw.boot_addr())
> +                .write(b, E::BASE);
> +        })?;
> +
> +        Ok(())
> +    }
> +
> +    pub(crate) fn boot(
> +        &self,
> +        bar: &Devres<Bar0>,
> +        timer: &Timer,
> +        mbox0: Option<u32>,
> +        mbox1: Option<u32>,
> +    ) -> Result<(u32, u32)> {
> +        with_bar!(bar, |b| {
> +            if let Some(mbox0) = mbox0 {
> +                regs::FalconMailbox0::default()
> +                    .set_mailbox0(mbox0)
> +                    .write(b, E::BASE);
> +            }
> +
> +            if let Some(mbox1) = mbox1 {
> +                regs::FalconMailbox1::default()
> +                    .set_mailbox1(mbox1)
> +                    .write(b, E::BASE);
> +            }
> +
> +            match regs::FalconCpuCtl::read(b, E::BASE).alias_en() {
> +                true => regs::FalconCpuCtlAlias::default()
> +                    .set_start_cpu(true)
> +                    .write(b, E::BASE),
> +                false => regs::FalconCpuCtl::default()
> +                    .set_start_cpu(true)
> +                    .write(b, E::BASE),
> +            }
> +        })?;
> +
> +        timer.wait_on(bar, Duration::from_secs(2), || {
> +            bar.try_access()
> +                .map(|b| regs::FalconCpuCtl::read(&*b, E::BASE))
> +                .and_then(|v| if v.halted() { Some(()) } else { None })
> +        })?;
> +
> +        let (mbox0, mbox1) = with_bar!(bar, |b| {
> +            let mbox0 = regs::FalconMailbox0::read(b, E::BASE).mailbox0();
> +            let mbox1 = regs::FalconMailbox1::read(b, E::BASE).mailbox1();
> +
> +            (mbox0, mbox1)
> +        })?;
> +
> +        Ok((mbox0, mbox1))
> +    }
> +}
> diff --git a/drivers/gpu/nova-core/falcon/gsp.rs b/drivers/gpu/nova-core/falcon/gsp.rs
> new file mode 100644
> index 0000000000000000000000000000000000000000..44b8dc118eda1263eaede466efd55408c6e7cded
> --- /dev/null
> +++ b/drivers/gpu/nova-core/falcon/gsp.rs
> @@ -0,0 +1,27 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +use kernel::devres::Devres;
> +use kernel::prelude::*;
> +
> +use crate::{
> +    driver::Bar0,
> +    falcon::{Falcon, FalconEngine},
> +    regs,
> +};
> +
> +pub(crate) struct Gsp;
> +impl FalconEngine for Gsp {
> +    const BASE: usize = 0x00110000;
> +}
> +
> +pub(crate) type GspFalcon = Falcon<Gsp>;

Please drop this type alias, Falcon<Gsp> seems simple enough and is much more
obvious IMHO.

> +
> +impl Falcon<Gsp> {
> +    /// Clears the SWGEN0 bit in the Falcon's IRQ status clear register to
> +    /// allow GSP to signal CPU for processing new messages in message queue.
> +    pub(crate) fn clear_swgen0_intr(&self, bar: &Devres<Bar0>) -> Result<()> {
> +        with_bar!(bar, |b| regs::FalconIrqsclr::default()
> +            .set_swgen0(true)
> +            .write(b, Gsp::BASE))
> +    }
> +}
> diff --git a/drivers/gpu/nova-core/falcon/hal.rs b/drivers/gpu/nova-core/falcon/hal.rs
> new file mode 100644
> index 0000000000000000000000000000000000000000..5ebf4e88f1f25a13cf47859a53507be53e795d34
> --- /dev/null
> +++ b/drivers/gpu/nova-core/falcon/hal.rs
> @@ -0,0 +1,54 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +use kernel::devres::Devres;
> +use kernel::prelude::*;
> +use kernel::sync::Arc;
> +
> +use crate::driver::Bar0;
> +use crate::falcon::{FalconBromParams, FalconEngine};
> +use crate::gpu::Chipset;
> +use crate::timer::Timer;
> +
> +mod ga102;
> +
> +/// Hardware Abstraction Layer for Falcon cores.
> +///
> +/// Implements chipset-specific low-level operations. The trait is generic against [`FalconEngine`]
> +/// so its `BASE` parameter can be used in order to avoid runtime bound checks when accessing
> +/// registers.
> +pub(crate) trait FalconHal<E: FalconEngine>: Sync {
> +    // Activates the Falcon core if the engine is a risvc/falcon dual engine.
> +    fn select_core(&self, _bar: &Devres<Bar0>, _timer: &Timer) -> Result<()> {
> +        Ok(())
> +    }
> +
> +    fn get_signature_reg_fuse_version(
> +        &self,
> +        bar: &Devres<Bar0>,
> +        engine_id_mask: u16,
> +        ucode_id: u8,
> +    ) -> Result<u32>;
> +
> +    // Program the BROM registers prior to starting a secure firmware.
> +    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()>;
> +}
> +
> +/// Returns a boxed falcon HAL adequate for the passed `chipset`.
> +///
> +/// We use this function and a heap-allocated trait object instead of statically defined trait
> +/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
> +/// requested HAL.

Do we really need the dynamic dispatch? AFAICS, there's only E::BASE that is
relevant to FalconHal impls?

Can't we do something like I do in the following example [1]?

```
use std::marker::PhantomData;
use std::ops::Deref;

trait Engine {
    const BASE: u32;
}

trait Hal<E: Engine> {
    fn access(&self);
}

struct Gsp;

impl Engine for Gsp {
    const BASE: u32 = 0x1;
}

struct Sec2;

impl Engine for Sec2 {
    const BASE: u32 = 0x2;
}

struct GA100<E: Engine>(PhantomData<E>);

impl<E: Engine> Hal<E> for GA100<E> {
    fn access(&self) {
        println!("Base: {}", E::BASE);
    }
}

impl<E: Engine> GA100<E> {
    fn new() -> Self {
        Self(PhantomData)
    }
}

//struct Falcon<E: Engine>(GA100<E>);

struct Falcon<H: Hal<E>, E: Engine>(H, PhantomData<E>);

impl<H: Hal<E>, E: Engine> Falcon<H, E> {
    fn new(hal: H) -> Self {
        Self(hal, PhantomData)
    }
}

impl<H: Hal<E>, E: Engine> Deref for Falcon<H, E> {
    type Target = H;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

fn main() {
    let gsp = Falcon::new(GA100::<Gsp>::new());
    let sec2 = Falcon::new(GA100::<Sec2>::new());

    gsp.access();
    sec2.access();
}
```

[1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2

> +///
> +/// TODO: replace the return type with `KBox` once it gains the ability to host trait objects.
> +pub(crate) fn create_falcon_hal<E: FalconEngine + 'static>(
> +    chipset: Chipset,
> +) -> Result<Arc<dyn FalconHal<E>>> {
> +    let hal = match chipset {
> +        Chipset::GA102 | Chipset::GA103 | Chipset::GA104 | Chipset::GA106 | Chipset::GA107 => {
> +            Arc::new(ga102::Ga102::<E>::new(), GFP_KERNEL)? as Arc<dyn FalconHal<E>>
> +        }
> +        _ => return Err(ENOTSUPP),
> +    };
> +
> +    Ok(hal)
> +}
> diff --git a/drivers/gpu/nova-core/falcon/hal/ga102.rs b/drivers/gpu/nova-core/falcon/hal/ga102.rs
> new file mode 100644
> index 0000000000000000000000000000000000000000..747b02ca671f7d4a97142665a9ba64807c87391e
> --- /dev/null
> +++ b/drivers/gpu/nova-core/falcon/hal/ga102.rs
> @@ -0,0 +1,111 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +use core::marker::PhantomData;
> +use core::time::Duration;
> +
> +use kernel::devres::Devres;
> +use kernel::prelude::*;
> +
> +use crate::driver::Bar0;
> +use crate::falcon::{FalconBromParams, FalconEngine, FalconModSelAlgo, RiscvCoreSelect};
> +use crate::regs;
> +use crate::timer::Timer;
> +
> +use super::FalconHal;
> +
> +fn select_core_ga102<E: FalconEngine>(bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
> +    let bcr_ctrl = with_bar!(bar, |b| regs::RiscvBcrCtrl::read(b, E::BASE))?;
> +    if bcr_ctrl.core_select() != RiscvCoreSelect::Falcon {
> +        with_bar!(bar, |b| regs::RiscvBcrCtrl::default()
> +            .set_core_select(RiscvCoreSelect::Falcon)
> +            .write(b, E::BASE))?;
> +
> +        timer.wait_on(bar, Duration::from_millis(10), || {
> +            bar.try_access_with(|b| regs::RiscvBcrCtrl::read(b, E::BASE))
> +                .and_then(|v| if v.valid() { Some(()) } else { None })
> +        })?;
> +    }
> +
> +    Ok(())
> +}
> +
> +fn get_signature_reg_fuse_version_ga102(
> +    bar: &Devres<Bar0>,
> +    engine_id_mask: u16,
> +    ucode_id: u8,
> +) -> Result<u32> {
> +    // The ucode fuse versions are contained in the FUSE_OPT_FPF_<ENGINE>_UCODE<X>_VERSION
> +    // registers, which are an array. Our register definition macros do not allow us to manage them
> +    // properly, so we need to hardcode their addresses for now.
> +
> +    // Each engine has 16 ucode version registers numbered from 1 to 16.
> +    if ucode_id == 0 || ucode_id > 16 {
> +        pr_warn!("invalid ucode id {:#x}", ucode_id);
> +        return Err(EINVAL);
> +    }
> +    let reg_fuse = if engine_id_mask & 0x0001 != 0 {
> +        // NV_FUSE_OPT_FPF_SEC2_UCODE1_VERSION
> +        0x824140
> +    } else if engine_id_mask & 0x0004 != 0 {
> +        // NV_FUSE_OPT_FPF_NVDEC_UCODE1_VERSION
> +        0x824100
> +    } else if engine_id_mask & 0x0400 != 0 {
> +        // NV_FUSE_OPT_FPF_GSP_UCODE1_VERSION
> +        0x8241c0
> +    } else {
> +        pr_warn!("unexpected engine_id_mask {:#x}", engine_id_mask);
> +        return Err(EINVAL);
> +    } + ((ucode_id - 1) as usize * core::mem::size_of::<u32>());
> +
> +    let reg_fuse_version = with_bar!(bar, |b| { b.read32(reg_fuse) })?;
> +
> +    // Equivalent of Find Last Set bit.
> +    Ok(u32::BITS - reg_fuse_version.leading_zeros())
> +}
> +
> +fn program_brom_ga102<E: FalconEngine>(
> +    bar: &Devres<Bar0>,
> +    params: &FalconBromParams,
> +) -> Result<()> {
> +    with_bar!(bar, |b| {
> +        regs::FalconBromParaaddr0::default()
> +            .set_addr(params.pkc_data_offset)
> +            .write(b, E::BASE);
> +        regs::FalconBromEngidmask::default()
> +            .set_mask(params.engine_id_mask as u32)
> +            .write(b, E::BASE);
> +        regs::FalconBromCurrUcodeId::default()
> +            .set_ucode_id(params.ucode_id as u32)
> +            .write(b, E::BASE);
> +        regs::FalconModSel::default()
> +            .set_algo(FalconModSelAlgo::Rsa3k)
> +            .write(b, E::BASE)
> +    })
> +}
> +
> +pub(super) struct Ga102<E: FalconEngine>(PhantomData<E>);
> +
> +impl<E: FalconEngine> Ga102<E> {
> +    pub(super) fn new() -> Self {
> +        Self(PhantomData)
> +    }
> +}
> +
> +impl<E: FalconEngine> FalconHal<E> for Ga102<E> {
> +    fn select_core(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
> +        select_core_ga102::<E>(bar, timer)
> +    }
> +
> +    fn get_signature_reg_fuse_version(
> +        &self,
> +        bar: &Devres<Bar0>,
> +        engine_id_mask: u16,
> +        ucode_id: u8,
> +    ) -> Result<u32> {
> +        get_signature_reg_fuse_version_ga102(bar, engine_id_mask, ucode_id)
> +    }
> +
> +    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()> {
> +        program_brom_ga102::<E>(bar, params)
> +    }
> +}
> diff --git a/drivers/gpu/nova-core/falcon/sec2.rs b/drivers/gpu/nova-core/falcon/sec2.rs
> new file mode 100644
> index 0000000000000000000000000000000000000000..85dda3e8380a3d31d34c92c4236c6f81c63ce772
> --- /dev/null
> +++ b/drivers/gpu/nova-core/falcon/sec2.rs
> @@ -0,0 +1,9 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +use crate::falcon::{Falcon, FalconEngine};
> +
> +pub(crate) struct Sec2;
> +impl FalconEngine for Sec2 {
> +    const BASE: usize = 0x00840000;
> +}
> +pub(crate) type Sec2Falcon = Falcon<Sec2>;
> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
> index 1b3e43e0412e2a2ea178c7404ea647c9e38d4e04..ec4c648c6e8b4aa7d06c627ed59c0e66a08c679e 100644
> --- a/drivers/gpu/nova-core/gpu.rs
> +++ b/drivers/gpu/nova-core/gpu.rs
> @@ -5,6 +5,8 @@
>  use crate::devinit;
>  use crate::dma::DmaObject;
>  use crate::driver::Bar0;
> +use crate::falcon::gsp::GspFalcon;
> +use crate::falcon::sec2::Sec2Falcon;
>  use crate::firmware::Firmware;
>  use crate::regs;
>  use crate::timer::Timer;
> @@ -221,6 +223,20 @@ pub(crate) fn new(
>  
>          let timer = Timer::new();
>  
> +        let gsp_falcon = GspFalcon::new(
> +            pdev,
> +            spec.chipset,
> +            &bar,
> +            if spec.chipset > Chipset::GA100 {
> +                true
> +            } else {
> +                false
> +            },
> +        )?;
> +        gsp_falcon.clear_swgen0_intr(&bar)?;
> +
> +        let _sec2_falcon = Sec2Falcon::new(pdev, spec.chipset, &bar, true)?;
> +
>          Ok(pin_init!(Self {
>              spec,
>              bar,
> diff --git a/drivers/gpu/nova-core/nova_core.rs b/drivers/gpu/nova-core/nova_core.rs
> index df3468c92c6081b3e2db218d92fbe1c40a0a75c3..4dde8004d24882c60669b5acd6af9d6988c66a9c 100644
> --- a/drivers/gpu/nova-core/nova_core.rs
> +++ b/drivers/gpu/nova-core/nova_core.rs
> @@ -23,6 +23,7 @@ macro_rules! with_bar {
>  mod devinit;
>  mod dma;
>  mod driver;
> +mod falcon;
>  mod firmware;
>  mod gpu;
>  mod regs;
> diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs
> index f191cf4eb44c2b950e5cfcc6d04f95c122ce29d3..c76a16dc8e7267a4eb54cb71e1cca6fb9e00188f 100644
> --- a/drivers/gpu/nova-core/regs.rs
> +++ b/drivers/gpu/nova-core/regs.rs
> @@ -6,6 +6,10 @@
>  #[macro_use]
>  mod macros;
>  
> +use crate::falcon::{
> +    FalconCoreRev, FalconCoreRevSubversion, FalconFbifMemType, FalconFbifTarget, FalconModSelAlgo,
> +    FalconSecurityModel, RiscvCoreSelect,
> +};
>  use crate::gpu::Chipset;
>  
>  register!(Boot0@0x00000000, "Basic revision information about the GPU";
> @@ -44,3 +48,188 @@
>  register!(Pgc6AonSecureScratchGroup05@0x00118234;
>      31:0    value => as u32
>  );
> +
> +/* PFALCON */
> +
> +register!(FalconIrqsclr@+0x00000004;
> +    4:4     halt => as_bit bool;
> +    6:6     swgen0 => as_bit bool;
> +);
> +
> +register!(FalconIrqstat@+0x00000008;
> +    4:4     halt => as_bit bool;
> +    6:6     swgen0 => as_bit bool;
> +);
> +
> +register!(FalconIrqmclr@+0x00000014;
> +    31:0    val => as u32
> +);
> +
> +register!(FalconIrqmask@+0x00000018;
> +    31:0    val => as u32
> +);
> +
> +register!(FalconRm@+0x00000084;
> +    31:0    val => as u32
> +);
> +
> +register!(FalconIrqdest@+0x0000001c;
> +    31:0    val => as u32
> +);
> +
> +register!(FalconMailbox0@+0x00000040;
> +    31:0    mailbox0 => as u32
> +);
> +register!(FalconMailbox1@+0x00000044;
> +    31:0    mailbox1 => as u32
> +);
> +
> +register!(FalconHwcfg2@+0x000000f4;
> +    10:10   riscv => as_bit bool;
> +    12:12   mem_scrubbing => as_bit bool;
> +    31:31   reset_ready => as_bit bool;
> +);
> +
> +register!(FalconCpuCtl@+0x00000100;
> +    1:1     start_cpu => as_bit bool;
> +    4:4     halted => as_bit bool;
> +    6:6     alias_en => as_bit bool;
> +);
> +
> +register!(FalconBootVec@+0x00000104;
> +    31:0    boot_vec => as u32
> +);
> +
> +register!(FalconHwCfg@+0x00000108;
> +    8:0     imem_size => as u32;
> +    17:9    dmem_size => as u32;
> +);
> +
> +register!(FalconDmaCtl@+0x0000010c;
> +    0:0     require_ctx => as_bit bool;
> +    1:1     dmem_scrubbing  => as_bit bool;
> +    2:2     imem_scrubbing => as_bit bool;
> +    6:3     dmaq_num => as_bit u8;
> +    7:7     secure_stat => as_bit bool;
> +);
> +
> +register!(FalconDmaTrfBase@+0x00000110;
> +    31:0    base => as u32;
> +);
> +
> +register!(FalconDmaTrfMOffs@+0x00000114;
> +    23:0    offs => as u32;
> +);
> +
> +register!(FalconDmaTrfCmd@+0x00000118;
> +    0:0     full => as_bit bool;
> +    1:1     idle => as_bit bool;
> +    3:2     sec => as_bit u8;
> +    4:4     imem => as_bit bool;
> +    5:5     is_write => as_bit bool;
> +    10:8    size => as u8;
> +    14:12   ctxdma => as u8;
> +    16:16   set_dmtag => as u8;
> +);
> +
> +register!(FalconDmaTrfBOffs@+0x0000011c;
> +    31:0    offs => as u32;
> +);
> +
> +register!(FalconDmaTrfBase1@+0x00000128;
> +    8:0     base => as u16;
> +);
> +
> +register!(FalconHwcfg1@+0x0000012c;
> +    3:0     core_rev => try_into FalconCoreRev, "core revision of the falcon";
> +    5:4     security_model => try_into FalconSecurityModel, "security model of the falcon";
> +    7:6     core_rev_subversion => into FalconCoreRevSubversion;
> +    11:8    imem_ports => as u8;
> +    15:12   dmem_ports => as u8;
> +);
> +
> +register!(FalconCpuCtlAlias@+0x00000130;
> +    1:1     start_cpu => as_bit bool;
> +);
> +
> +/* TODO: this is an array of registers */
> +register!(FalconImemC@+0x00000180;
> +    7:2     offs => as u8;
> +    23:8    blk => as u8;
> +    24:24   aincw => as_bit bool;
> +    25:25   aincr => as_bit bool;
> +    28:28   secure => as_bit bool;
> +    29:29   sec_atomic => as_bit bool;
> +);
> +
> +register!(FalconImemD@+0x00000184;
> +    31:0    data => as u32;
> +);
> +
> +register!(FalconImemT@+0x00000188;
> +    15:0    data => as u16;
> +);
> +
> +register!(FalconDmemC@+0x000001c0;
> +    7:2     offs => as u8;
> +    23:0    addr => as u32;
> +    23:8    blk => as u8;
> +    24:24   aincw => as_bit bool;
> +    25:25   aincr => as_bit bool;
> +    26:26   settag => as_bit bool;
> +    27:27   setlvl => as_bit bool;
> +    28:28   va => as_bit bool;
> +    29:29   miss => as_bit bool;
> +);
> +
> +register!(FalconDmemD@+0x000001c4;
> +    31:0    data => as u32;
> +);
> +
> +register!(FalconModSel@+0x00001180;
> +    7:0     algo => try_into FalconModSelAlgo;
> +);
> +register!(FalconBromCurrUcodeId@+0x00001198;
> +    31:0    ucode_id => as u32;
> +);
> +register!(FalconBromEngidmask@+0x0000119c;
> +    31:0    mask => as u32;
> +);
> +register!(FalconBromParaaddr0@+0x00001210;
> +    31:0    addr => as u32;
> +);
> +
> +register!(RiscvCpuctl@+0x00000388;
> +    0:0     startcpu => as_bit bool;
> +    4:4     halted => as_bit bool;
> +    5:5     stopped => as_bit bool;
> +    7:7     active_stat => as_bit bool;
> +);
> +
> +register!(FalconEngine@+0x000003c0;
> +    0:0     reset => as_bit bool;
> +);
> +
> +register!(RiscvIrqmask@+0x00000528;
> +    31:0    mask => as u32;
> +);
> +
> +register!(RiscvIrqdest@+0x0000052c;
> +    31:0    dest => as u32;
> +);
> +
> +/* TODO: this is an array of registers */
> +register!(FalconFbifTranscfg@+0x00000600;
> +    1:0     target => try_into FalconFbifTarget;
> +    2:2     mem_type => as_bit FalconFbifMemType;
> +);
> +
> +register!(FalconFbifCtl@+0x00000624;
> +    7:7     allow_phys_no_ctx => as_bit bool;
> +);
> +
> +register!(RiscvBcrCtrl@+0x00001668;
> +    0:0     valid => as_bit bool;
> +    4:4     core_select => as_bit RiscvCoreSelect;
> +    8:8     br_fetch => as_bit bool;
> +);
> diff --git a/drivers/gpu/nova-core/timer.rs b/drivers/gpu/nova-core/timer.rs
> index 8987352f4192bc9b4b2fc0fb5f2e8e62ff27be68..c03a5c36d1230dfbf2bd6e02a793264280c6d509 100644
> --- a/drivers/gpu/nova-core/timer.rs
> +++ b/drivers/gpu/nova-core/timer.rs
> @@ -2,9 +2,6 @@
>  
>  //! Nova Core Timer subdevice
>  
> -// To be removed when all code is used.
> -#![allow(dead_code)]
> -
>  use core::fmt::Display;
>  use core::ops::{Add, Sub};
>  use core::time::Duration;
> 
> -- 
> 2.49.0
>
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Alexandre Courbot 9 months, 2 weeks ago
Hi Danilo,

On Tue Apr 22, 2025 at 11:44 PM JST, Danilo Krummrich wrote:
> This patch could probably split up a bit, to make it more pleasant to review. :)

Probably yes. I thought since it is mostly new files, splitting up
wouldn't change much. Let me see what I can do.

>
> On Sun, Apr 20, 2025 at 09:19:43PM +0900, Alexandre Courbot wrote:
>> 
>> +#[repr(u8)]
>> +#[derive(Debug, Default, Copy, Clone)]
>> +pub(crate) enum FalconSecurityModel {
>> +    #[default]
>> +    None = 0,
>> +    Light = 2,
>> +    Heavy = 3,
>> +}
>
> Please add an explanation for the different security modules. Where are the
> differences?
>
> I think most of the structures, registers, abbreviations, etc. introduced in
> this patch need some documentation.

I've documented things a bit better for the next revision.

>
> Please see https://docs.kernel.org/gpu/nova/guidelines.html#documentation.
>
>> +
>> +impl TryFrom<u32> for FalconSecurityModel {
>> +    type Error = Error;
>> +
>> +    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
>> +        use FalconSecurityModel::*;
>> +
>> +        let sec_model = match value {
>> +            0 => None,
>> +            2 => Light,
>> +            3 => Heavy,
>> +            _ => return Err(EINVAL),
>> +        };
>> +
>> +        Ok(sec_model)
>> +    }
>> +}
>> +
>> +#[repr(u8)]
>> +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
>> +pub(crate) enum FalconCoreRevSubversion {
>> +    #[default]
>> +    Subversion0 = 0,
>> +    Subversion1 = 1,
>> +    Subversion2 = 2,
>> +    Subversion3 = 3,
>> +}
>> +
>> +impl From<u32> for FalconCoreRevSubversion {
>> +    fn from(value: u32) -> Self {
>> +        use FalconCoreRevSubversion::*;
>> +
>> +        match value & 0b11 {
>> +            0 => Subversion0,
>> +            1 => Subversion1,
>> +            2 => Subversion2,
>> +            3 => Subversion3,
>> +            // SAFETY: the `0b11` mask limits the possible values to `0..=3`.
>> +            4..=u32::MAX => unsafe { unreachable_unchecked() },
>> +        }
>
> FalconCoreRev uses TryFrom to avoid unsafe code, I think FalconCoreRevSubversion
> should do the same thing.

Since the field from which `FalconCoreRevSubversion` is built is only 2
bits, I thought we could avoid using `TryFrom` since we are effectively
covering all possible values (I wish Rust has n-bit integer types :)).
But yeah I have probably overthought that, and that unsafe block is
unsightly. Converted to `TryFrom`.

>
>> +/// Trait defining the parameters of a given Falcon instance.
>> +pub(crate) trait FalconEngine: Sync {
>> +    /// Base I/O address for the falcon, relative from which its registers are accessed.
>> +    const BASE: usize;
>> +}
>> +
>> +/// Represents a portion of the firmware to be loaded into a particular memory (e.g. IMEM or DMEM).
>> +#[derive(Debug)]
>> +pub(crate) struct FalconLoadTarget {
>> +    /// Offset from the start of the source object to copy from.
>> +    pub(crate) src_start: u32,
>> +    /// Offset from the start of the destination memory to copy into.
>> +    pub(crate) dst_start: u32,
>> +    /// Number of bytes to copy.
>> +    pub(crate) len: u32,
>> +}
>> +
>> +#[derive(Debug)]
>> +pub(crate) struct FalconBromParams {
>> +    pub(crate) pkc_data_offset: u32,
>> +    pub(crate) engine_id_mask: u16,
>> +    pub(crate) ucode_id: u8,
>> +}
>> +
>> +pub(crate) trait FalconFirmware {
>> +    type Target: FalconEngine;
>> +
>> +    /// Returns the DMA handle of the object containing the firmware.
>> +    fn dma_handle(&self) -> bindings::dma_addr_t;
>> +
>> +    /// Returns the load parameters for `IMEM`.
>> +    fn imem_load(&self) -> FalconLoadTarget;
>> +
>> +    /// Returns the load parameters for `DMEM`.
>> +    fn dmem_load(&self) -> FalconLoadTarget;
>> +
>> +    /// Returns the parameters to write into the BROM registers.
>> +    fn brom_params(&self) -> FalconBromParams;
>> +
>> +    /// Returns the start address of the firmware.
>> +    fn boot_addr(&self) -> u32;
>> +}
>> +
>> +/// Contains the base parameters common to all Falcon instances.
>> +pub(crate) struct Falcon<E: FalconEngine> {
>> +    pub hal: Arc<dyn FalconHal<E>>,
>
> This should probably be private and instead should be exposed via Deref.

Agreed - actually not all the HAL is supposed to be exposed, so I've
added a proxy method for the only method that needs to be called from
outside this module.

>
> Also, please see my comment at create_falcon_hal() regarding the dynamic
> dispatch.
>
>> +}
>> +
>> +impl<E: FalconEngine + 'static> Falcon<E> {
>> +    pub(crate) fn new(
>> +        pdev: &pci::Device,
>> +        chipset: Chipset,
>> +        bar: &Devres<Bar0>,
>> +        need_riscv: bool,
>> +    ) -> Result<Self> {
>> +        let hwcfg1 = with_bar!(bar, |b| regs::FalconHwcfg1::read(b, E::BASE))?;
>> +        // Ensure that the revision and security model contain valid values.
>> +        let _rev = hwcfg1.core_rev()?;
>> +        let _sec_model = hwcfg1.security_model()?;
>> +
>> +        if need_riscv {
>> +            let hwcfg2 = with_bar!(bar, |b| regs::FalconHwcfg2::read(b, E::BASE))?;
>> +            if !hwcfg2.riscv() {
>> +                dev_err!(
>> +                    pdev.as_ref(),
>> +                    "riscv support requested on falcon that does not support it\n"
>> +                );
>> +                return Err(EINVAL);
>> +            }
>> +        }
>> +
>> +        Ok(Self {
>> +            hal: hal::create_falcon_hal(chipset)?,
>
> I'd prefer to move the contents of create_falcon_hal() into this constructor.

I think it is actually beneficial to have this in a dedicated method:
that way the individual HAL constructors do not need to be visible to
the `falcon` module and can be contained in the `hal` sub-module, which
I think helps keeping things at their place. Is there a good reason to
prefer doing it here?

Ah, maybe you are thinking that we are returning a Boxed HAL because we
are going through this function? It's actually on purpose - see below.

>> +pub(crate) struct Gsp;
>> +impl FalconEngine for Gsp {
>> +    const BASE: usize = 0x00110000;
>> +}
>> +
>> +pub(crate) type GspFalcon = Falcon<Gsp>;
>
> Please drop this type alias, Falcon<Gsp> seems simple enough and is much more
> obvious IMHO.

Yeah, I wanted to avoid having to import two symbols into the gpu
module, but I've probably been overthinking it again.

>
>> +
>> +impl Falcon<Gsp> {
>> +    /// Clears the SWGEN0 bit in the Falcon's IRQ status clear register to
>> +    /// allow GSP to signal CPU for processing new messages in message queue.
>> +    pub(crate) fn clear_swgen0_intr(&self, bar: &Devres<Bar0>) -> Result<()> {
>> +        with_bar!(bar, |b| regs::FalconIrqsclr::default()
>> +            .set_swgen0(true)
>> +            .write(b, Gsp::BASE))
>> +    }
>> +}
>> diff --git a/drivers/gpu/nova-core/falcon/hal.rs b/drivers/gpu/nova-core/falcon/hal.rs
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..5ebf4e88f1f25a13cf47859a53507be53e795d34
>> --- /dev/null
>> +++ b/drivers/gpu/nova-core/falcon/hal.rs
>> @@ -0,0 +1,54 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +use kernel::devres::Devres;
>> +use kernel::prelude::*;
>> +use kernel::sync::Arc;
>> +
>> +use crate::driver::Bar0;
>> +use crate::falcon::{FalconBromParams, FalconEngine};
>> +use crate::gpu::Chipset;
>> +use crate::timer::Timer;
>> +
>> +mod ga102;
>> +
>> +/// Hardware Abstraction Layer for Falcon cores.
>> +///
>> +/// Implements chipset-specific low-level operations. The trait is generic against [`FalconEngine`]
>> +/// so its `BASE` parameter can be used in order to avoid runtime bound checks when accessing
>> +/// registers.
>> +pub(crate) trait FalconHal<E: FalconEngine>: Sync {
>> +    // Activates the Falcon core if the engine is a risvc/falcon dual engine.
>> +    fn select_core(&self, _bar: &Devres<Bar0>, _timer: &Timer) -> Result<()> {
>> +        Ok(())
>> +    }
>> +
>> +    fn get_signature_reg_fuse_version(
>> +        &self,
>> +        bar: &Devres<Bar0>,
>> +        engine_id_mask: u16,
>> +        ucode_id: u8,
>> +    ) -> Result<u32>;
>> +
>> +    // Program the BROM registers prior to starting a secure firmware.
>> +    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()>;
>> +}
>> +
>> +/// Returns a boxed falcon HAL adequate for the passed `chipset`.
>> +///
>> +/// We use this function and a heap-allocated trait object instead of statically defined trait
>> +/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
>> +/// requested HAL.
>
> Do we really need the dynamic dispatch? AFAICS, there's only E::BASE that is
> relevant to FalconHal impls?
>
> Can't we do something like I do in the following example [1]?
>
> ```
> use std::marker::PhantomData;
> use std::ops::Deref;
>
> trait Engine {
>     const BASE: u32;
> }
>
> trait Hal<E: Engine> {
>     fn access(&self);
> }
>
> struct Gsp;
>
> impl Engine for Gsp {
>     const BASE: u32 = 0x1;
> }
>
> struct Sec2;
>
> impl Engine for Sec2 {
>     const BASE: u32 = 0x2;
> }
>
> struct GA100<E: Engine>(PhantomData<E>);
>
> impl<E: Engine> Hal<E> for GA100<E> {
>     fn access(&self) {
>         println!("Base: {}", E::BASE);
>     }
> }
>
> impl<E: Engine> GA100<E> {
>     fn new() -> Self {
>         Self(PhantomData)
>     }
> }
>
> //struct Falcon<E: Engine>(GA100<E>);
>
> struct Falcon<H: Hal<E>, E: Engine>(H, PhantomData<E>);
>
> impl<H: Hal<E>, E: Engine> Falcon<H, E> {
>     fn new(hal: H) -> Self {
>         Self(hal, PhantomData)
>     }
> }
>
> impl<H: Hal<E>, E: Engine> Deref for Falcon<H, E> {
>     type Target = H;
>
>     fn deref(&self) -> &Self::Target {
>         &self.0
>     }
> }
>
> fn main() {
>     let gsp = Falcon::new(GA100::<Gsp>::new());
>     let sec2 = Falcon::new(GA100::<Sec2>::new());
>
>     gsp.access();
>     sec2.access();
> }
> ```
>
> [1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2

So are you have noticed there are two dimensions from which the falcons
can be instantiated:

- The engine, which determines its register BASE,
- The HAL, which is determined by the chipset.

For the engine, I want to keep things static for the main reason that if
BASE was dynamic, we would have to do all our IO using
try_read()/try_write() and check for an out-of-bounds error at each
register access. The cost of monomorphization is limited as there are
only a handful of engines.

But the HAL introduces a second dimension to this, and if we support N
engines then the amount of monomorphized code would then increase by N
for each new HAL we add. Chipsets are released at a good cadence, so
this is the dimension that risks growing the most.

It is also the one that makes use of methods to abstract things (vs.
fixed parameters), so it is a natural candidate for using virtual
methods. I am not a fan of having ever-growing boilerplate match
statements for each method that needs to be abstracted, especially since
this is that virtual methods do without requiring extra code, and for a
runtime penalty that is completely negligible in our context and IMHO
completely balanced by the smaller binary size that results from their
use.

Cheers,
Alex.
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Joel Fernandes 9 months, 2 weeks ago

On 4/30/2025 9:25 AM, Alexandre Courbot wrote:
> Hi Danilo,
> 
> On Tue Apr 22, 2025 at 11:44 PM JST, Danilo Krummrich wrote:
>> This patch could probably split up a bit, to make it more pleasant to review. :)
> 
> Probably yes. I thought since it is mostly new files, splitting up
> wouldn't change much. Let me see what I can do.
> 
>>
>> On Sun, Apr 20, 2025 at 09:19:43PM +0900, Alexandre Courbot wrote:
>>>
>>> +#[repr(u8)]
>>> +#[derive(Debug, Default, Copy, Clone)]
>>> +pub(crate) enum FalconSecurityModel {
>>> +    #[default]
>>> +    None = 0,
>>> +    Light = 2,
>>> +    Heavy = 3,
>>> +}
>>
>> Please add an explanation for the different security modules. Where are the
>> differences?
>>
>> I think most of the structures, registers, abbreviations, etc. introduced in
>> this patch need some documentation.
> 
> I've documented things a bit better for the next revision.
> 
>>
>> Please see https://docs.kernel.org/gpu/nova/guidelines.html#documentation.
>>
>>> +
>>> +impl TryFrom<u32> for FalconSecurityModel {
>>> +    type Error = Error;
>>> +
>>> +    fn try_from(value: u32) -> core::result::Result<Self, Self::Error> {
>>> +        use FalconSecurityModel::*;
>>> +
>>> +        let sec_model = match value {
>>> +            0 => None,
>>> +            2 => Light,
>>> +            3 => Heavy,
>>> +            _ => return Err(EINVAL),
>>> +        };
>>> +
>>> +        Ok(sec_model)
>>> +    }
>>> +}
>>> +
>>> +#[repr(u8)]
>>> +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
>>> +pub(crate) enum FalconCoreRevSubversion {
>>> +    #[default]
>>> +    Subversion0 = 0,
>>> +    Subversion1 = 1,
>>> +    Subversion2 = 2,
>>> +    Subversion3 = 3,
>>> +}
>>> +
>>> +impl From<u32> for FalconCoreRevSubversion {
>>> +    fn from(value: u32) -> Self {
>>> +        use FalconCoreRevSubversion::*;
>>> +
>>> +        match value & 0b11 {
>>> +            0 => Subversion0,
>>> +            1 => Subversion1,
>>> +            2 => Subversion2,
>>> +            3 => Subversion3,
>>> +            // SAFETY: the `0b11` mask limits the possible values to `0..=3`.
>>> +            4..=u32::MAX => unsafe { unreachable_unchecked() },
>>> +        }
>>
>> FalconCoreRev uses TryFrom to avoid unsafe code, I think FalconCoreRevSubversion
>> should do the same thing.
> 
> Since the field from which `FalconCoreRevSubversion` is built is only 2
> bits, I thought we could avoid using `TryFrom` since we are effectively
> covering all possible values (I wish Rust has n-bit integer types :)).
> But yeah I have probably overthought that, and that unsafe block is
> unsightly. Converted to `TryFrom`.
> 
>>
>>> +/// Trait defining the parameters of a given Falcon instance.
>>> +pub(crate) trait FalconEngine: Sync {
>>> +    /// Base I/O address for the falcon, relative from which its registers are accessed.
>>> +    const BASE: usize;
>>> +}
>>> +
>>> +/// Represents a portion of the firmware to be loaded into a particular memory (e.g. IMEM or DMEM).
>>> +#[derive(Debug)]
>>> +pub(crate) struct FalconLoadTarget {
>>> +    /// Offset from the start of the source object to copy from.
>>> +    pub(crate) src_start: u32,
>>> +    /// Offset from the start of the destination memory to copy into.
>>> +    pub(crate) dst_start: u32,
>>> +    /// Number of bytes to copy.
>>> +    pub(crate) len: u32,
>>> +}
>>> +
>>> +#[derive(Debug)]
>>> +pub(crate) struct FalconBromParams {
>>> +    pub(crate) pkc_data_offset: u32,
>>> +    pub(crate) engine_id_mask: u16,
>>> +    pub(crate) ucode_id: u8,
>>> +}
>>> +
>>> +pub(crate) trait FalconFirmware {
>>> +    type Target: FalconEngine;
>>> +
>>> +    /// Returns the DMA handle of the object containing the firmware.
>>> +    fn dma_handle(&self) -> bindings::dma_addr_t;
>>> +
>>> +    /// Returns the load parameters for `IMEM`.
>>> +    fn imem_load(&self) -> FalconLoadTarget;
>>> +
>>> +    /// Returns the load parameters for `DMEM`.
>>> +    fn dmem_load(&self) -> FalconLoadTarget;
>>> +
>>> +    /// Returns the parameters to write into the BROM registers.
>>> +    fn brom_params(&self) -> FalconBromParams;
>>> +
>>> +    /// Returns the start address of the firmware.
>>> +    fn boot_addr(&self) -> u32;
>>> +}
>>> +
>>> +/// Contains the base parameters common to all Falcon instances.
>>> +pub(crate) struct Falcon<E: FalconEngine> {
>>> +    pub hal: Arc<dyn FalconHal<E>>,
>>
>> This should probably be private and instead should be exposed via Deref.
> 
> Agreed - actually not all the HAL is supposed to be exposed, so I've
> added a proxy method for the only method that needs to be called from
> outside this module.
> 
>>
>> Also, please see my comment at create_falcon_hal() regarding the dynamic
>> dispatch.
>>
>>> +}
>>> +
>>> +impl<E: FalconEngine + 'static> Falcon<E> {
>>> +    pub(crate) fn new(
>>> +        pdev: &pci::Device,
>>> +        chipset: Chipset,
>>> +        bar: &Devres<Bar0>,
>>> +        need_riscv: bool,
>>> +    ) -> Result<Self> {
>>> +        let hwcfg1 = with_bar!(bar, |b| regs::FalconHwcfg1::read(b, E::BASE))?;
>>> +        // Ensure that the revision and security model contain valid values.
>>> +        let _rev = hwcfg1.core_rev()?;
>>> +        let _sec_model = hwcfg1.security_model()?;
>>> +
>>> +        if need_riscv {
>>> +            let hwcfg2 = with_bar!(bar, |b| regs::FalconHwcfg2::read(b, E::BASE))?;
>>> +            if !hwcfg2.riscv() {
>>> +                dev_err!(
>>> +                    pdev.as_ref(),
>>> +                    "riscv support requested on falcon that does not support it\n"
>>> +                );
>>> +                return Err(EINVAL);
>>> +            }
>>> +        }
>>> +
>>> +        Ok(Self {
>>> +            hal: hal::create_falcon_hal(chipset)?,
>>
>> I'd prefer to move the contents of create_falcon_hal() into this constructor.
> 
> I think it is actually beneficial to have this in a dedicated method:
> that way the individual HAL constructors do not need to be visible to
> the `falcon` module and can be contained in the `hal` sub-module, which
> I think helps keeping things at their place. Is there a good reason to
> prefer doing it here?
> 
> Ah, maybe you are thinking that we are returning a Boxed HAL because we
> are going through this function? It's actually on purpose - see below.
> 
>>> +pub(crate) struct Gsp;
>>> +impl FalconEngine for Gsp {
>>> +    const BASE: usize = 0x00110000;
>>> +}
>>> +
>>> +pub(crate) type GspFalcon = Falcon<Gsp>;
>>
>> Please drop this type alias, Falcon<Gsp> seems simple enough and is much more
>> obvious IMHO.
> 
> Yeah, I wanted to avoid having to import two symbols into the gpu
> module, but I've probably been overthinking it again.
> 
>>
>>> +
>>> +impl Falcon<Gsp> {
>>> +    /// Clears the SWGEN0 bit in the Falcon's IRQ status clear register to
>>> +    /// allow GSP to signal CPU for processing new messages in message queue.
>>> +    pub(crate) fn clear_swgen0_intr(&self, bar: &Devres<Bar0>) -> Result<()> {
>>> +        with_bar!(bar, |b| regs::FalconIrqsclr::default()
>>> +            .set_swgen0(true)
>>> +            .write(b, Gsp::BASE))
>>> +    }
>>> +}
>>> diff --git a/drivers/gpu/nova-core/falcon/hal.rs b/drivers/gpu/nova-core/falcon/hal.rs
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..5ebf4e88f1f25a13cf47859a53507be53e795d34
>>> --- /dev/null
>>> +++ b/drivers/gpu/nova-core/falcon/hal.rs
>>> @@ -0,0 +1,54 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +
>>> +use kernel::devres::Devres;
>>> +use kernel::prelude::*;
>>> +use kernel::sync::Arc;
>>> +
>>> +use crate::driver::Bar0;
>>> +use crate::falcon::{FalconBromParams, FalconEngine};
>>> +use crate::gpu::Chipset;
>>> +use crate::timer::Timer;
>>> +
>>> +mod ga102;
>>> +
>>> +/// Hardware Abstraction Layer for Falcon cores.
>>> +///
>>> +/// Implements chipset-specific low-level operations. The trait is generic against [`FalconEngine`]
>>> +/// so its `BASE` parameter can be used in order to avoid runtime bound checks when accessing
>>> +/// registers.
>>> +pub(crate) trait FalconHal<E: FalconEngine>: Sync {
>>> +    // Activates the Falcon core if the engine is a risvc/falcon dual engine.
>>> +    fn select_core(&self, _bar: &Devres<Bar0>, _timer: &Timer) -> Result<()> {
>>> +        Ok(())
>>> +    }
>>> +
>>> +    fn get_signature_reg_fuse_version(
>>> +        &self,
>>> +        bar: &Devres<Bar0>,
>>> +        engine_id_mask: u16,
>>> +        ucode_id: u8,
>>> +    ) -> Result<u32>;
>>> +
>>> +    // Program the BROM registers prior to starting a secure firmware.
>>> +    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()>;
>>> +}
>>> +
>>> +/// Returns a boxed falcon HAL adequate for the passed `chipset`.
>>> +///
>>> +/// We use this function and a heap-allocated trait object instead of statically defined trait
>>> +/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
>>> +/// requested HAL.
>>
>> Do we really need the dynamic dispatch? AFAICS, there's only E::BASE that is
>> relevant to FalconHal impls?
>>
>> Can't we do something like I do in the following example [1]?
>>
>> ```
>> use std::marker::PhantomData;
>> use std::ops::Deref;
>>
>> trait Engine {
>>     const BASE: u32;
>> }
>>
>> trait Hal<E: Engine> {
>>     fn access(&self);
>> }
>>
>> struct Gsp;
>>
>> impl Engine for Gsp {
>>     const BASE: u32 = 0x1;
>> }
>>
>> struct Sec2;
>>
>> impl Engine for Sec2 {
>>     const BASE: u32 = 0x2;
>> }
>>
>> struct GA100<E: Engine>(PhantomData<E>);
>>
>> impl<E: Engine> Hal<E> for GA100<E> {
>>     fn access(&self) {
>>         println!("Base: {}", E::BASE);
>>     }
>> }
>>
>> impl<E: Engine> GA100<E> {
>>     fn new() -> Self {
>>         Self(PhantomData)
>>     }
>> }
>>
>> //struct Falcon<E: Engine>(GA100<E>);
>>
>> struct Falcon<H: Hal<E>, E: Engine>(H, PhantomData<E>);
>>
>> impl<H: Hal<E>, E: Engine> Falcon<H, E> {
>>     fn new(hal: H) -> Self {
>>         Self(hal, PhantomData)
>>     }
>> }
>>
>> impl<H: Hal<E>, E: Engine> Deref for Falcon<H, E> {
>>     type Target = H;
>>
>>     fn deref(&self) -> &Self::Target {
>>         &self.0
>>     }
>> }
>>
>> fn main() {
>>     let gsp = Falcon::new(GA100::<Gsp>::new());
>>     let sec2 = Falcon::new(GA100::<Sec2>::new());
>>
>>     gsp.access();
>>     sec2.access();
>> }
>> ```
>>
>> [1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2
> 
> So are you have noticed there are two dimensions from which the falcons
> can be instantiated:
> 
> - The engine, which determines its register BASE,
> - The HAL, which is determined by the chipset.
> 
> For the engine, I want to keep things static for the main reason that if
> BASE was dynamic, we would have to do all our IO using
> try_read()/try_write() and check for an out-of-bounds error at each
> register access. The cost of monomorphization is limited as there are
> only a handful of engines.
> 
> But the HAL introduces a second dimension to this, and if we support N
> engines then the amount of monomorphized code would then increase by N
> for each new HAL we add. Chipsets are released at a good cadence, so
> this is the dimension that risks growing the most.
> 
> It is also the one that makes use of methods to abstract things (vs.
> fixed parameters), so it is a natural candidate for using virtual
> methods. I am not a fan of having ever-growing boilerplate match
> statements for each method that needs to be abstracted, especially since
> this is that virtual methods do without requiring extra code, and for a
> runtime penalty that is completely negligible in our context and IMHO
> completely balanced by the smaller binary size that results from their
> use.
Adding to what Alex said, note that the runtime cost is still there even without
using dyn. Because at runtime, the match conditionals need to route function
calls to the right place. I am just not seeing the benefits of not using dyn for
this use case and only drawbacks. IMHO, we should try to not be doing the
compiler's job.

Maybe the only benefit is you don't need an Arc or Kbox wrapper?

 - Joel
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Danilo Krummrich 9 months, 2 weeks ago
On Wed, Apr 30, 2025 at 10:38:11AM -0400, Joel Fernandes wrote:
> On 4/30/2025 9:25 AM, Alexandre Courbot wrote:
> > On Tue Apr 22, 2025 at 11:44 PM JST, Danilo Krummrich wrote:
> 
> >>> +/// Returns a boxed falcon HAL adequate for the passed `chipset`.
> >>> +///
> >>> +/// We use this function and a heap-allocated trait object instead of statically defined trait
> >>> +/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
> >>> +/// requested HAL.
> >>
> >> Do we really need the dynamic dispatch? AFAICS, there's only E::BASE that is
> >> relevant to FalconHal impls?
> >>
> >> Can't we do something like I do in the following example [1]?
> >>
> >> [1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2
> > 
> > So are you have noticed there are two dimensions from which the falcons
> > can be instantiated:
> > 
> > - The engine, which determines its register BASE,
> > - The HAL, which is determined by the chipset.
> > 
> > For the engine, I want to keep things static for the main reason that if
> > BASE was dynamic, we would have to do all our IO using
> > try_read()/try_write() and check for an out-of-bounds error at each
> > register access. The cost of monomorphization is limited as there are
> > only a handful of engines.
> > 
> > But the HAL introduces a second dimension to this, and if we support N
> > engines then the amount of monomorphized code would then increase by N
> > for each new HAL we add. Chipsets are released at a good cadence, so
> > this is the dimension that risks growing the most.

I agree, avoiding the dynamic dispatch is probably not worth in this case
considering the long term. However, I wanted to point out an alternative with
[2].

> > It is also the one that makes use of methods to abstract things (vs.
> > fixed parameters), so it is a natural candidate for using virtual
> > methods. I am not a fan of having ever-growing boilerplate match
> > statements for each method that needs to be abstracted, especially since
> > this is that virtual methods do without requiring extra code, and for a
> > runtime penalty that is completely negligible in our context and IMHO
> > completely balanced by the smaller binary size that results from their
> > use.
>
> Adding to what Alex said, note that the runtime cost is still there even without
> using dyn. Because at runtime, the match conditionals need to route function
> calls to the right place.

Honestly, I don't know how dynamic dispatch scales compared to static dispatch
with conditionals.

OOC, I briefly looked for a benchmark and found [3], which doesn't look
unreasonable at a first glance.

I modified it real quick to have more than 2 actions. [4]

2 Actions
---------
Dynamic Dispatch: time:   [2.0679 ns 2.0825 ns 2.0945 ns]
 Static Dispatch: time:   [850.29 ps 851.05 ps 852.36 ps]

20 Actions
----------
Dynamic Dispatch: time:   [21.368 ns 21.827 ns 22.284 ns]
 Static Dispatch: time:   [1.3623 ns 1.3703 ns 1.3793 ns]

100 Actions
-----------
Dynamic Dispatch: time:   [103.72 ns 104.33 ns 105.13 ns]
 Static Dispatch: time:   [4.5905 ns 4.6311 ns 4.6775 ns]

Absolutely take it with a grain of salt, I neither spend a lot of brain power
nor time on this, which usually is not a great combination with benchmarking
things. :)

However, I think it's probably not too important here. Hence, feel free to go
with dynamic dispatch for this.

> I am just not seeing the benefits of not using dyn for
> this use case and only drawbacks. IMHO, we should try to not be doing the
> compiler's job.
> 
> Maybe the only benefit is you don't need an Arc or Kbox wrapper?

That's not a huge concern for me, it's only one single allocation per Engine,
correct?

[2] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=99ce0f12542488f78e35356c99a1e23f
[3] https://github.com/tailcallhq/rust-benchmarks
[4] https://pastebin.com/k0PqtQnq
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Alexandre Courbot 9 months, 2 weeks ago
On Thu May 1, 2025 at 3:16 AM JST, Danilo Krummrich wrote:
> On Wed, Apr 30, 2025 at 10:38:11AM -0400, Joel Fernandes wrote:
>> On 4/30/2025 9:25 AM, Alexandre Courbot wrote:
>> > On Tue Apr 22, 2025 at 11:44 PM JST, Danilo Krummrich wrote:
>> 
>> >>> +/// Returns a boxed falcon HAL adequate for the passed `chipset`.
>> >>> +///
>> >>> +/// We use this function and a heap-allocated trait object instead of statically defined trait
>> >>> +/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
>> >>> +/// requested HAL.
>> >>
>> >> Do we really need the dynamic dispatch? AFAICS, there's only E::BASE that is
>> >> relevant to FalconHal impls?
>> >>
>> >> Can't we do something like I do in the following example [1]?
>> >>
>> >> [1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2
>> > 
>> > So are you have noticed there are two dimensions from which the falcons
>> > can be instantiated:
>> > 
>> > - The engine, which determines its register BASE,
>> > - The HAL, which is determined by the chipset.
>> > 
>> > For the engine, I want to keep things static for the main reason that if
>> > BASE was dynamic, we would have to do all our IO using
>> > try_read()/try_write() and check for an out-of-bounds error at each
>> > register access. The cost of monomorphization is limited as there are
>> > only a handful of engines.
>> > 
>> > But the HAL introduces a second dimension to this, and if we support N
>> > engines then the amount of monomorphized code would then increase by N
>> > for each new HAL we add. Chipsets are released at a good cadence, so
>> > this is the dimension that risks growing the most.
>
> I agree, avoiding the dynamic dispatch is probably not worth in this case
> considering the long term. However, I wanted to point out an alternative with
> [2].
>
>> > It is also the one that makes use of methods to abstract things (vs.
>> > fixed parameters), so it is a natural candidate for using virtual
>> > methods. I am not a fan of having ever-growing boilerplate match
>> > statements for each method that needs to be abstracted, especially since
>> > this is that virtual methods do without requiring extra code, and for a
>> > runtime penalty that is completely negligible in our context and IMHO
>> > completely balanced by the smaller binary size that results from their
>> > use.
>>
>> Adding to what Alex said, note that the runtime cost is still there even without
>> using dyn. Because at runtime, the match conditionals need to route function
>> calls to the right place.
>
> Honestly, I don't know how dynamic dispatch scales compared to static dispatch
> with conditionals.
>
> OOC, I briefly looked for a benchmark and found [3], which doesn't look
> unreasonable at a first glance.
>
> I modified it real quick to have more than 2 actions. [4]
>
> 2 Actions
> ---------
> Dynamic Dispatch: time:   [2.0679 ns 2.0825 ns 2.0945 ns]
>  Static Dispatch: time:   [850.29 ps 851.05 ps 852.36 ps]
>
> 20 Actions
> ----------
> Dynamic Dispatch: time:   [21.368 ns 21.827 ns 22.284 ns]
>  Static Dispatch: time:   [1.3623 ns 1.3703 ns 1.3793 ns]
>
> 100 Actions
> -----------
> Dynamic Dispatch: time:   [103.72 ns 104.33 ns 105.13 ns]
>  Static Dispatch: time:   [4.5905 ns 4.6311 ns 4.6775 ns]
>
> Absolutely take it with a grain of salt, I neither spend a lot of brain power
> nor time on this, which usually is not a great combination with benchmarking
> things. :)
>
> However, I think it's probably not too important here. Hence, feel free to go
> with dynamic dispatch for this.

Indeed, it looks like the cost of dispatch will be completely shadowed
by the IO behind it anyway. And these HAL calls are like a few here and
there anyway, it's not like they are on a critical path.

>
>> I am just not seeing the benefits of not using dyn for
>> this use case and only drawbacks. IMHO, we should try to not be doing the
>> compiler's job.
>> 
>> Maybe the only benefit is you don't need an Arc or Kbox wrapper?
>
> That's not a huge concern for me, it's only one single allocation per Engine,
> correct?

Correct. Note that for other engines we will be able to store the HALs as
static singletons instead of building them on the heap like I am
currently doing. The reason for doing this on falcon is that the
dual-dimension of the instances makes it more complex to build and look
them up.

... or maybe I could just use a macro? Let me try that and see whether
it works.
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Joel Fernandes 9 months, 2 weeks ago
Hi Alex,

On 4/30/2025 8:09 PM, Alexandre Courbot wrote:
>>> I am just not seeing the benefits of not using dyn for
>>> this use case and only drawbacks. IMHO, we should try to not be doing the
>>> compiler's job.
>>>
>>> Maybe the only benefit is you don't need an Arc or Kbox wrapper?
>> That's not a huge concern for me, it's only one single allocation per Engine,
>> correct?
> Correct. Note that for other engines we will be able to store the HALs as
> static singletons instead of building them on the heap like I am
> currently doing. The reason for doing this on falcon is that the
> dual-dimension of the instances makes it more complex to build and look
> them up.
> 
> ... or maybe I could just use a macro? Let me try that and see whether
> it works.

Do you mean a macro for create_falcon_hal which adds an entry to this?

 let hal = match chipset {
    Chipset::GA102 | Chipset::GA103 | Chipset::GA104 | Chipset::GA106
|Chipset::GA107 => { .. }


Actually it would be nice if a single macro defined both a chipset and created
the hal together in the above list, that way the definition of a "chipset" is in
a singe place. Kind of like what I did in the vbios patch for various BiosImage.
But not sure how easy it is to do for Falcon.

Or perhaps you meant a macro that statically allocates the Engine + HAL
combination, and avoids need for Arc/KBox and their corresponding allocations?

thanks,

 - Joel
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Alexandre Courbot 9 months, 1 week ago
On Thu May 1, 2025 at 9:22 AM JST, Joel Fernandes wrote:
> Hi Alex,
>
> On 4/30/2025 8:09 PM, Alexandre Courbot wrote:
>>>> I am just not seeing the benefits of not using dyn for
>>>> this use case and only drawbacks. IMHO, we should try to not be doing the
>>>> compiler's job.
>>>>
>>>> Maybe the only benefit is you don't need an Arc or Kbox wrapper?
>>> That's not a huge concern for me, it's only one single allocation per Engine,
>>> correct?
>> Correct. Note that for other engines we will be able to store the HALs as
>> static singletons instead of building them on the heap like I am
>> currently doing. The reason for doing this on falcon is that the
>> dual-dimension of the instances makes it more complex to build and look
>> them up.
>> 
>> ... or maybe I could just use a macro? Let me try that and see whether
>> it works.
>
> Do you mean a macro for create_falcon_hal which adds an entry to this?
>
>  let hal = match chipset {
>     Chipset::GA102 | Chipset::GA103 | Chipset::GA104 | Chipset::GA106
> |Chipset::GA107 => { .. }
>
>
> Actually it would be nice if a single macro defined both a chipset and created
> the hal together in the above list, that way the definition of a "chipset" is in
> a singe place. Kind of like what I did in the vbios patch for various BiosImage.
> But not sure how easy it is to do for Falcon.
>
> Or perhaps you meant a macro that statically allocates the Engine + HAL
> combination, and avoids need for Arc/KBox and their corresponding allocations?

I was thinking of a macro to create all the Chipset * Engine static
instances of HALs, and generate the body of a lookup function to return
the right one for a given Chipset at runtime.

But trying to write it, I realized it wasn't as easy as I thought since
generics cannot be used as macro parameters - i.e. if you have 
<E: Engine> as a generic and pass `E` to the macro, it will see... `E`
and not whatever was bound to `E` when the code is monomorphized (macros
are expanded before generics it seems).

A solution to that would involve new traits and a bunch of boilerplate,
which I have decided is not worth the trouble to save one 8-bytes object
on the heap per falcon instance. :) I'll keep things as they currently
are for now.
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Joel Fernandes 9 months, 2 weeks ago
On 4/30/2025 2:16 PM, Danilo Krummrich wrote:
[...]
>>> It is also the one that makes use of methods to abstract things (vs.
>>> fixed parameters), so it is a natural candidate for using virtual
>>> methods. I am not a fan of having ever-growing boilerplate match
>>> statements for each method that needs to be abstracted, especially since
>>> this is that virtual methods do without requiring extra code, and for a
>>> runtime penalty that is completely negligible in our context and IMHO
>>> completely balanced by the smaller binary size that results from their
>>> use.
>>
>> Adding to what Alex said, note that the runtime cost is still there even without
>> using dyn. Because at runtime, the match conditionals need to route function
>> calls to the right place.
> 
> Honestly, I don't know how dynamic dispatch scales compared to static dispatch
> with conditionals.
> 
> OOC, I briefly looked for a benchmark and found [3], which doesn't look
> unreasonable at a first glance.
> 
> I modified it real quick to have more than 2 actions. [4]
> 
> 2 Actions
> ---------
> Dynamic Dispatch: time:   [2.0679 ns 2.0825 ns 2.0945 ns]
>  Static Dispatch: time:   [850.29 ps 851.05 ps 852.36 ps]
> 
> 20 Actions
> ----------
> Dynamic Dispatch: time:   [21.368 ns 21.827 ns 22.284 ns]
>  Static Dispatch: time:   [1.3623 ns 1.3703 ns 1.3793 ns]
> 
> 100 Actions
> -----------
> Dynamic Dispatch: time:   [103.72 ns 104.33 ns 105.13 ns]
>  Static Dispatch: time:   [4.5905 ns 4.6311 ns 4.6775 ns]
> 
> Absolutely take it with a grain of salt, I neither spend a lot of brain power
> nor time on this, which usually is not a great combination with benchmarking
> things. :)

Interesting, thanks for running the benchmark. I think this could be because of
function inlining during the static dispatch, so maybe at runtime there is no
overhead after all, even with long match statements. Just speculating, I have
not looked at codegen for this or anything.

But as you noted, the overhead still is not that much an issue (unless say the
method in concern is in an extremely hot path).

> 
> However, I think it's probably not too important here. Hence, feel free to go
> with dynamic dispatch for this.

Ok thanks, sounds good to me. It does seem the code is a lot more readable IMHO
as well, with dyn.

>> I am just not seeing the benefits of not using dyn for
>> this use case and only drawbacks. IMHO, we should try to not be doing the
>> compiler's job.
>>
>> Maybe the only benefit is you don't need an Arc or Kbox wrapper?
> 
> That's not a huge concern for me, it's only one single allocation per Engine,
> correct?

Yes, that's right. I was more referring to the fact that static dispatch as in
your example does not need Arc/Box, however even with Arc/Box IMHO the
readability of the code using dyn is more due to the lack of long match
statements on the access methods.

thanks,

- Joel
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Joel Fernandes 9 months, 2 weeks ago

> On Apr 22, 2025, at 10:45 AM, Danilo Krummrich <dakr@kernel.org> wrote:
> […]
>> +
>> +    fn get_signature_reg_fuse_version(
>> +        &self,
>> +        bar: &Devres<Bar0>,
>> +        engine_id_mask: u16,
>> +        ucode_id: u8,
>> +    ) -> Result<u32>;
>> +
>> +    // Program the BROM registers prior to starting a secure firmware.
>> +    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()>;
>> +}
>> +
>> +/// Returns a boxed falcon HAL adequate for the passed `chipset`.
>> +///
>> +/// We use this function and a heap-allocated trait object instead of statically defined trait
>> +/// objects because of the two-dimensional (Chipset, Engine) lookup required to return the
>> +/// requested HAL.
> 
> Do we really need the dynamic dispatch? AFAICS, there's only E::BASE that is
> relevant to FalconHal impls?
> 
> Can't we do something like I do in the following example [1]?
> 
> ```
> use std::marker::PhantomData;
> use std::ops::Deref;
> 
> trait Engine {
>    const BASE: u32;
> }
> 
> trait Hal<E: Engine> {
>    fn access(&self);
> }
> 
> struct Gsp;
> 
> impl Engine for Gsp {
>    const BASE: u32 = 0x1;
> }
> 
> struct Sec2;
> 
> impl Engine for Sec2 {
>    const BASE: u32 = 0x2;
> }
> 
> struct GA100<E: Engine>(PhantomData<E>);
> 
> impl<E: Engine> Hal<E> for GA100<E> {
>    fn access(&self) {
>        println!("Base: {}", E::BASE);
>    }
> }
> 
> impl<E: Engine> GA100<E> {
>    fn new() -> Self {
>        Self(PhantomData)
>    }
> }
> 
> //struct Falcon<E: Engine>(GA100<E>);
> 
> struct Falcon<H: Hal<E>, E: Engine>(H, PhantomData<E>);
> 
> impl<H: Hal<E>, E: Engine> Falcon<H, E> {
>    fn new(hal: H) -> Self {
>        Self(hal, PhantomData)
>    }
> }
> 
> impl<H: Hal<E>, E: Engine> Deref for Falcon<H, E> {
>    type Target = H;
> 
>    fn deref(&self) -> &Self::Target {
>        &self.0
>    }
> }
> 
> fn main() {
>    let gsp = Falcon::new(GA100::<Gsp>::new());
>    let sec2 = Falcon::new(GA100::<Sec2>::new());
> 
>    gsp.access();
>    sec2.access();
> }
> ```
> 
> [1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2

I am still researching this idea from a rust point of view, but quick question - will this even work if the chip type (GAxxx) is determined at runtime? That does need runtime polymorphism.

Thanks,

 - Joel


> 
>> +///
>> +/// TODO: replace the return type with `KBox` once it gains the ability to host trait objects.
>> +pub(crate) fn create_falcon_hal<E: FalconEngine + 'static>(
>> +    chipset: Chipset,
>> +) -> Result<Arc<dyn FalconHal<E>>> {
>> +    let hal = match chipset {
>> +        Chipset::GA102 | Chipset::GA103 | Chipset::GA104 | Chipset::GA106 | Chipset::GA107 => {
>> +            Arc::new(ga102::Ga102::<E>::new(), GFP_KERNEL)? as Arc<dyn FalconHal<E>>
>> +        }
>> +        _ => return Err(ENOTSUPP),
>> +    };
>> +
>> +    Ok(hal)
>> +}
>> diff --git a/drivers/gpu/nova-core/falcon/hal/ga102.rs b/drivers/gpu/nova-core/falcon/hal/ga102.rs
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..747b02ca671f7d4a97142665a9ba64807c87391e
>> --- /dev/null
>> +++ b/drivers/gpu/nova-core/falcon/hal/ga102.rs
>> @@ -0,0 +1,111 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +use core::marker::PhantomData;
>> +use core::time::Duration;
>> +
>> +use kernel::devres::Devres;
>> +use kernel::prelude::*;
>> +
>> +use crate::driver::Bar0;
>> +use crate::falcon::{FalconBromParams, FalconEngine, FalconModSelAlgo, RiscvCoreSelect};
>> +use crate::regs;
>> +use crate::timer::Timer;
>> +
>> +use super::FalconHal;
>> +
>> +fn select_core_ga102<E: FalconEngine>(bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
>> +    let bcr_ctrl = with_bar!(bar, |b| regs::RiscvBcrCtrl::read(b, E::BASE))?;
>> +    if bcr_ctrl.core_select() != RiscvCoreSelect::Falcon {
>> +        with_bar!(bar, |b| regs::RiscvBcrCtrl::default()
>> +            .set_core_select(RiscvCoreSelect::Falcon)
>> +            .write(b, E::BASE))?;
>> +
>> +        timer.wait_on(bar, Duration::from_millis(10), || {
>> +            bar.try_access_with(|b| regs::RiscvBcrCtrl::read(b, E::BASE))
>> +                .and_then(|v| if v.valid() { Some(()) } else { None })
>> +        })?;
>> +    }
>> +
>> +    Ok(())
>> +}
>> +
>> +fn get_signature_reg_fuse_version_ga102(
>> +    bar: &Devres<Bar0>,
>> +    engine_id_mask: u16,
>> +    ucode_id: u8,
>> +) -> Result<u32> {
>> +    // The ucode fuse versions are contained in the FUSE_OPT_FPF_<ENGINE>_UCODE<X>_VERSION
>> +    // registers, which are an array. Our register definition macros do not allow us to manage them
>> +    // properly, so we need to hardcode their addresses for now.
>> +
>> +    // Each engine has 16 ucode version registers numbered from 1 to 16.
>> +    if ucode_id == 0 || ucode_id > 16 {
>> +        pr_warn!("invalid ucode id {:#x}", ucode_id);
>> +        return Err(EINVAL);
>> +    }
>> +    let reg_fuse = if engine_id_mask & 0x0001 != 0 {
>> +        // NV_FUSE_OPT_FPF_SEC2_UCODE1_VERSION
>> +        0x824140
>> +    } else if engine_id_mask & 0x0004 != 0 {
>> +        // NV_FUSE_OPT_FPF_NVDEC_UCODE1_VERSION
>> +        0x824100
>> +    } else if engine_id_mask & 0x0400 != 0 {
>> +        // NV_FUSE_OPT_FPF_GSP_UCODE1_VERSION
>> +        0x8241c0
>> +    } else {
>> +        pr_warn!("unexpected engine_id_mask {:#x}", engine_id_mask);
>> +        return Err(EINVAL);
>> +    } + ((ucode_id - 1) as usize * core::mem::size_of::<u32>());
>> +
>> +    let reg_fuse_version = with_bar!(bar, |b| { b.read32(reg_fuse) })?;
>> +
>> +    // Equivalent of Find Last Set bit.
>> +    Ok(u32::BITS - reg_fuse_version.leading_zeros())
>> +}
>> +
>> +fn program_brom_ga102<E: FalconEngine>(
>> +    bar: &Devres<Bar0>,
>> +    params: &FalconBromParams,
>> +) -> Result<()> {
>> +    with_bar!(bar, |b| {
>> +        regs::FalconBromParaaddr0::default()
>> +            .set_addr(params.pkc_data_offset)
>> +            .write(b, E::BASE);
>> +        regs::FalconBromEngidmask::default()
>> +            .set_mask(params.engine_id_mask as u32)
>> +            .write(b, E::BASE);
>> +        regs::FalconBromCurrUcodeId::default()
>> +            .set_ucode_id(params.ucode_id as u32)
>> +            .write(b, E::BASE);
>> +        regs::FalconModSel::default()
>> +            .set_algo(FalconModSelAlgo::Rsa3k)
>> +            .write(b, E::BASE)
>> +    })
>> +}
>> +
>> +pub(super) struct Ga102<E: FalconEngine>(PhantomData<E>);
>> +
>> +impl<E: FalconEngine> Ga102<E> {
>> +    pub(super) fn new() -> Self {
>> +        Self(PhantomData)
>> +    }
>> +}
>> +
>> +impl<E: FalconEngine> FalconHal<E> for Ga102<E> {
>> +    fn select_core(&self, bar: &Devres<Bar0>, timer: &Timer) -> Result<()> {
>> +        select_core_ga102::<E>(bar, timer)
>> +    }
>> +
>> +    fn get_signature_reg_fuse_version(
>> +        &self,
>> +        bar: &Devres<Bar0>,
>> +        engine_id_mask: u16,
>> +        ucode_id: u8,
>> +    ) -> Result<u32> {
>> +        get_signature_reg_fuse_version_ga102(bar, engine_id_mask, ucode_id)
>> +    }
>> +
>> +    fn program_brom(&self, bar: &Devres<Bar0>, params: &FalconBromParams) -> Result<()> {
>> +        program_brom_ga102::<E>(bar, params)
>> +    }
>> +}
>> diff --git a/drivers/gpu/nova-core/falcon/sec2.rs b/drivers/gpu/nova-core/falcon/sec2.rs
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..85dda3e8380a3d31d34c92c4236c6f81c63ce772
>> --- /dev/null
>> +++ b/drivers/gpu/nova-core/falcon/sec2.rs
>> @@ -0,0 +1,9 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +use crate::falcon::{Falcon, FalconEngine};
>> +
>> +pub(crate) struct Sec2;
>> +impl FalconEngine for Sec2 {
>> +    const BASE: usize = 0x00840000;
>> +}
>> +pub(crate) type Sec2Falcon = Falcon<Sec2>;
>> diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
>> index 1b3e43e0412e2a2ea178c7404ea647c9e38d4e04..ec4c648c6e8b4aa7d06c627ed59c0e66a08c679e 100644
>> --- a/drivers/gpu/nova-core/gpu.rs
>> +++ b/drivers/gpu/nova-core/gpu.rs
>> @@ -5,6 +5,8 @@
>> use crate::devinit;
>> use crate::dma::DmaObject;
>> use crate::driver::Bar0;
>> +use crate::falcon::gsp::GspFalcon;
>> +use crate::falcon::sec2::Sec2Falcon;
>> use crate::firmware::Firmware;
>> use crate::regs;
>> use crate::timer::Timer;
>> @@ -221,6 +223,20 @@ pub(crate) fn new(
>> 
>>         let timer = Timer::new();
>> 
>> +        let gsp_falcon = GspFalcon::new(
>> +            pdev,
>> +            spec.chipset,
>> +            &bar,
>> +            if spec.chipset > Chipset::GA100 {
>> +                true
>> +            } else {
>> +                false
>> +            },
>> +        )?;
>> +        gsp_falcon.clear_swgen0_intr(&bar)?;
>> +
>> +        let _sec2_falcon = Sec2Falcon::new(pdev, spec.chipset, &bar, true)?;
>> +
>>         Ok(pin_init!(Self {
>>             spec,
>>             bar,
>> diff --git a/drivers/gpu/nova-core/nova_core.rs b/drivers/gpu/nova-core/nova_core.rs
>> index df3468c92c6081b3e2db218d92fbe1c40a0a75c3..4dde8004d24882c60669b5acd6af9d6988c66a9c 100644
>> --- a/drivers/gpu/nova-core/nova_core.rs
>> +++ b/drivers/gpu/nova-core/nova_core.rs
>> @@ -23,6 +23,7 @@ macro_rules! with_bar {
>> mod devinit;
>> mod dma;
>> mod driver;
>> +mod falcon;
>> mod firmware;
>> mod gpu;
>> mod regs;
>> diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs
>> index f191cf4eb44c2b950e5cfcc6d04f95c122ce29d3..c76a16dc8e7267a4eb54cb71e1cca6fb9e00188f 100644
>> --- a/drivers/gpu/nova-core/regs.rs
>> +++ b/drivers/gpu/nova-core/regs.rs
>> @@ -6,6 +6,10 @@
>> #[macro_use]
>> mod macros;
>> 
>> +use crate::falcon::{
>> +    FalconCoreRev, FalconCoreRevSubversion, FalconFbifMemType, FalconFbifTarget, FalconModSelAlgo,
>> +    FalconSecurityModel, RiscvCoreSelect,
>> +};
>> use crate::gpu::Chipset;
>> 
>> register!(Boot0@0x00000000, "Basic revision information about the GPU";
>> @@ -44,3 +48,188 @@
>> register!(Pgc6AonSecureScratchGroup05@0x00118234;
>>     31:0    value => as u32
>> );
>> +
>> +/* PFALCON */
>> +
>> +register!(FalconIrqsclr@+0x00000004;
>> +    4:4     halt => as_bit bool;
>> +    6:6     swgen0 => as_bit bool;
>> +);
>> +
>> +register!(FalconIrqstat@+0x00000008;
>> +    4:4     halt => as_bit bool;
>> +    6:6     swgen0 => as_bit bool;
>> +);
>> +
>> +register!(FalconIrqmclr@+0x00000014;
>> +    31:0    val => as u32
>> +);
>> +
>> +register!(FalconIrqmask@+0x00000018;
>> +    31:0    val => as u32
>> +);
>> +
>> +register!(FalconRm@+0x00000084;
>> +    31:0    val => as u32
>> +);
>> +
>> +register!(FalconIrqdest@+0x0000001c;
>> +    31:0    val => as u32
>> +);
>> +
>> +register!(FalconMailbox0@+0x00000040;
>> +    31:0    mailbox0 => as u32
>> +);
>> +register!(FalconMailbox1@+0x00000044;
>> +    31:0    mailbox1 => as u32
>> +);
>> +
>> +register!(FalconHwcfg2@+0x000000f4;
>> +    10:10   riscv => as_bit bool;
>> +    12:12   mem_scrubbing => as_bit bool;
>> +    31:31   reset_ready => as_bit bool;
>> +);
>> +
>> +register!(FalconCpuCtl@+0x00000100;
>> +    1:1     start_cpu => as_bit bool;
>> +    4:4     halted => as_bit bool;
>> +    6:6     alias_en => as_bit bool;
>> +);
>> +
>> +register!(FalconBootVec@+0x00000104;
>> +    31:0    boot_vec => as u32
>> +);
>> +
>> +register!(FalconHwCfg@+0x00000108;
>> +    8:0     imem_size => as u32;
>> +    17:9    dmem_size => as u32;
>> +);
>> +
>> +register!(FalconDmaCtl@+0x0000010c;
>> +    0:0     require_ctx => as_bit bool;
>> +    1:1     dmem_scrubbing  => as_bit bool;
>> +    2:2     imem_scrubbing => as_bit bool;
>> +    6:3     dmaq_num => as_bit u8;
>> +    7:7     secure_stat => as_bit bool;
>> +);
>> +
>> +register!(FalconDmaTrfBase@+0x00000110;
>> +    31:0    base => as u32;
>> +);
>> +
>> +register!(FalconDmaTrfMOffs@+0x00000114;
>> +    23:0    offs => as u32;
>> +);
>> +
>> +register!(FalconDmaTrfCmd@+0x00000118;
>> +    0:0     full => as_bit bool;
>> +    1:1     idle => as_bit bool;
>> +    3:2     sec => as_bit u8;
>> +    4:4     imem => as_bit bool;
>> +    5:5     is_write => as_bit bool;
>> +    10:8    size => as u8;
>> +    14:12   ctxdma => as u8;
>> +    16:16   set_dmtag => as u8;
>> +);
>> +
>> +register!(FalconDmaTrfBOffs@+0x0000011c;
>> +    31:0    offs => as u32;
>> +);
>> +
>> +register!(FalconDmaTrfBase1@+0x00000128;
>> +    8:0     base => as u16;
>> +);
>> +
>> +register!(FalconHwcfg1@+0x0000012c;
>> +    3:0     core_rev => try_into FalconCoreRev, "core revision of the falcon";
>> +    5:4     security_model => try_into FalconSecurityModel, "security model of the falcon";
>> +    7:6     core_rev_subversion => into FalconCoreRevSubversion;
>> +    11:8    imem_ports => as u8;
>> +    15:12   dmem_ports => as u8;
>> +);
>> +
>> +register!(FalconCpuCtlAlias@+0x00000130;
>> +    1:1     start_cpu => as_bit bool;
>> +);
>> +
>> +/* TODO: this is an array of registers */
>> +register!(FalconImemC@+0x00000180;
>> +    7:2     offs => as u8;
>> +    23:8    blk => as u8;
>> +    24:24   aincw => as_bit bool;
>> +    25:25   aincr => as_bit bool;
>> +    28:28   secure => as_bit bool;
>> +    29:29   sec_atomic => as_bit bool;
>> +);
>> +
>> +register!(FalconImemD@+0x00000184;
>> +    31:0    data => as u32;
>> +);
>> +
>> +register!(FalconImemT@+0x00000188;
>> +    15:0    data => as u16;
>> +);
>> +
>> +register!(FalconDmemC@+0x000001c0;
>> +    7:2     offs => as u8;
>> +    23:0    addr => as u32;
>> +    23:8    blk => as u8;
>> +    24:24   aincw => as_bit bool;
>> +    25:25   aincr => as_bit bool;
>> +    26:26   settag => as_bit bool;
>> +    27:27   setlvl => as_bit bool;
>> +    28:28   va => as_bit bool;
>> +    29:29   miss => as_bit bool;
>> +);
>> +
>> +register!(FalconDmemD@+0x000001c4;
>> +    31:0    data => as u32;
>> +);
>> +
>> +register!(FalconModSel@+0x00001180;
>> +    7:0     algo => try_into FalconModSelAlgo;
>> +);
>> +register!(FalconBromCurrUcodeId@+0x00001198;
>> +    31:0    ucode_id => as u32;
>> +);
>> +register!(FalconBromEngidmask@+0x0000119c;
>> +    31:0    mask => as u32;
>> +);
>> +register!(FalconBromParaaddr0@+0x00001210;
>> +    31:0    addr => as u32;
>> +);
>> +
>> +register!(RiscvCpuctl@+0x00000388;
>> +    0:0     startcpu => as_bit bool;
>> +    4:4     halted => as_bit bool;
>> +    5:5     stopped => as_bit bool;
>> +    7:7     active_stat => as_bit bool;
>> +);
>> +
>> +register!(FalconEngine@+0x000003c0;
>> +    0:0     reset => as_bit bool;
>> +);
>> +
>> +register!(RiscvIrqmask@+0x00000528;
>> +    31:0    mask => as u32;
>> +);
>> +
>> +register!(RiscvIrqdest@+0x0000052c;
>> +    31:0    dest => as u32;
>> +);
>> +
>> +/* TODO: this is an array of registers */
>> +register!(FalconFbifTranscfg@+0x00000600;
>> +    1:0     target => try_into FalconFbifTarget;
>> +    2:2     mem_type => as_bit FalconFbifMemType;
>> +);
>> +
>> +register!(FalconFbifCtl@+0x00000624;
>> +    7:7     allow_phys_no_ctx => as_bit bool;
>> +);
>> +
>> +register!(RiscvBcrCtrl@+0x00001668;
>> +    0:0     valid => as_bit bool;
>> +    4:4     core_select => as_bit RiscvCoreSelect;
>> +    8:8     br_fetch => as_bit bool;
>> +);
>> diff --git a/drivers/gpu/nova-core/timer.rs b/drivers/gpu/nova-core/timer.rs
>> index 8987352f4192bc9b4b2fc0fb5f2e8e62ff27be68..c03a5c36d1230dfbf2bd6e02a793264280c6d509 100644
>> --- a/drivers/gpu/nova-core/timer.rs
>> +++ b/drivers/gpu/nova-core/timer.rs
>> @@ -2,9 +2,6 @@
>> 
>> //! Nova Core Timer subdevice
>> 
>> -// To be removed when all code is used.
>> -#![allow(dead_code)]
>> -
>> use core::fmt::Display;
>> use core::ops::{Add, Sub};
>> use core::time::Duration;
>> 
>> --
>> 2.49.0
>> 
Re: [PATCH 11/16] gpu: nova-core: add falcon register definitions and base code
Posted by Danilo Krummrich 9 months, 2 weeks ago
On Wed, Apr 30, 2025 at 06:58:44AM +0000, Joel Fernandes wrote:
> > On Apr 22, 2025, at 10:45 AM, Danilo Krummrich <dakr@kernel.org> wrote:
> 
> > [1] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=bf7035a07e79a4047fb6834eac03a9f2
> 
> I am still researching this idea from a rust point of view, but quick question - will this even work if the chip type (GAxxx) is determined at runtime? That does need runtime polymorphism.

I exetended the example in [2] to address this with `enum HalImpl<E: Engine>`
and a second architecture that is picked randomly. It needs match for every
access, but that's probably still better than the dynamic dispatch.

[2] https://play.rust-lang.org/?version=stable&mode=debug&edition=2024&gist=99ce0f12542488f78e35356c99a1e23f