From: Tim Kovalenko <tim.kovalenko@proton.me>
The `Cmdq::new` function was allocating a `PteArray` struct on the stack,
causing a stack overflow with a frame size of 8216 bytes.
Modify the `PteArray` to calculate and write the Page Table Entries
directly into the coherent DMA buffer one-by-one. This reduces the stack
usage quite a lot.
Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
---
drivers/gpu/nova-core/gsp.rs | 34 +++++++++++++++++++---------------
drivers/gpu/nova-core/gsp/cmdq.rs | 15 ++++++++++++++-
2 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/nova-core/gsp.rs b/drivers/gpu/nova-core/gsp.rs
index 25cd48514c777cb405a2af0acf57196b2e2e7837..20170e483e04c476efce8997b3916b0ad829ed38 100644
--- a/drivers/gpu/nova-core/gsp.rs
+++ b/drivers/gpu/nova-core/gsp.rs
@@ -47,16 +47,11 @@
unsafe impl<const NUM_ENTRIES: usize> AsBytes for PteArray<NUM_ENTRIES> {}
impl<const NUM_PAGES: usize> PteArray<NUM_PAGES> {
- /// Creates a new page table array mapping `NUM_PAGES` GSP pages starting at address `start`.
- fn new(start: DmaAddress) -> Result<Self> {
- let mut ptes = [0u64; NUM_PAGES];
- for (i, pte) in ptes.iter_mut().enumerate() {
- *pte = start
- .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
- .ok_or(EOVERFLOW)?;
- }
-
- Ok(Self(ptes))
+ /// Returns the page table entry for `index`, for a mapping starting at `start` DmaAddress.
+ fn entry(start: DmaAddress, index: usize) -> Result<u64> {
+ start
+ .checked_add(num::usize_as_u64(index) << GSP_PAGE_SHIFT)
+ .ok_or(EOVERFLOW)
}
}
@@ -86,16 +81,25 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
NUM_PAGES * GSP_PAGE_SIZE,
GFP_KERNEL | __GFP_ZERO,
)?);
- let ptes = PteArray::<NUM_PAGES>::new(obj.0.dma_handle())?;
+
+ let start_addr = obj.0.dma_handle();
// SAFETY: `obj` has just been created and we are its sole user.
- unsafe {
- // Copy the self-mapping PTE at the expected location.
+ let pte_region = unsafe {
obj.0
- .as_slice_mut(size_of::<u64>(), size_of_val(&ptes))?
- .copy_from_slice(ptes.as_bytes())
+ .as_slice_mut(size_of::<u64>(), NUM_PAGES * size_of::<u64>())?
};
+ // This is a one by one GSP Page write to the memory
+ // to avoid stack overflow when allocating the whole array at once.
+ for (i, chunk) in pte_region.chunks_exact_mut(size_of::<u64>()).enumerate() {
+ let pte_value = start_addr
+ .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
+ .ok_or(EOVERFLOW)?;
+
+ chunk.copy_from_slice(&pte_value.to_ne_bytes());
+ }
+
Ok(obj)
}
}
diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
index 0056bfbf0a44cfbc5a0ca08d069f881b877e1edc..c8327d3098f73f9b880eee99038ad10a16e1e32d 100644
--- a/drivers/gpu/nova-core/gsp/cmdq.rs
+++ b/drivers/gpu/nova-core/gsp/cmdq.rs
@@ -202,7 +202,20 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
let gsp_mem =
CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
- dma_write!(gsp_mem, [0]?.ptes, PteArray::new(gsp_mem.dma_handle())?);
+
+ const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
+
+ let start = gsp_mem.dma_handle();
+ // One by one GSP Page write to the memory to avoid stack overflow when allocating
+ // the whole array at once.
+ for i in 0..NUM_PTES {
+ dma_write!(
+ gsp_mem,
+ [0]?.ptes.0[i],
+ PteArray::<NUM_PTES>::entry(start, i)?
+ );
+ }
+
dma_write!(
gsp_mem,
[0]?.cpuq.tx,
--
2.53.0
On Mon Mar 9, 2026 at 5:34 PM CET, Tim Kovalenko via B4 Relay wrote:
> From: Tim Kovalenko <tim.kovalenko@proton.me>
>
> The `Cmdq::new` function was allocating a `PteArray` struct on the stack
> and was causing a stack overflow with 8216 bytes.
>
> Modify the `PteArray` to calculate and write the Page Table Entries
> directly into the coherent DMA buffer one-by-one. This reduces the stack
> usage quite a lot.
>
> Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
Applied to drm-rust-fixes, thanks!
--- commit ---
commit c7940c8bf215b9dc6211781c77ce80e76982a723
Author: Tim Kovalenko <tim.kovalenko@proton.me>
Date: Mon Mar 9 12:34:21 2026 -0400
gpu: nova-core: fix stack overflow in GSP memory allocation
The `Cmdq::new` function was allocating a `PteArray` struct on the stack
and was causing a stack overflow with 8216 bytes.
Modify the `PteArray` to calculate and write the Page Table Entries
directly into the coherent DMA buffer one-by-one. This reduces the stack
usage quite a lot.
Reported-by: Gary Guo <gary@garyguo.net>
Closes: https://rust-for-linux.zulipchat.com/#narrow/channel/509436-Nova/topic/.60Cmdq.3A.3Anew.60.20uses.20excessive.20stack.20size/near/570375549
Link: https://lore.kernel.org/rust-for-linux/CANiq72mAQxbRJZDnik3Qmd4phvFwPA01O2jwaaXRh_T+2=L-qA@mail.gmail.com/
Fixes: f38b4f105cfc ("gpu: nova-core: Create initial Gsp")
Acked-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
Link: https://patch.msgid.link/20260309-drm-rust-next-v4-4-4ef485b19a4c@proton.me
[ * Use PteArray::entry() in LogBuffer::new(),
* Add TODO comment to use IoView projections once available,
* Add PTE_ARRAY_SIZE constant to avoid duplication.
- Danilo ]
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
On Tue Mar 10, 2026 at 1:34 AM JST, Tim Kovalenko via B4 Relay wrote:
> From: Tim Kovalenko <tim.kovalenko@proton.me>
>
> The `Cmdq::new` function was allocating a `PteArray` struct on the stack
> and was causing a stack overflow with 8216 bytes.
>
> Modify the `PteArray` to calculate and write the Page Table Entries
> directly into the coherent DMA buffer one-by-one. This reduces the stack
> usage quite a lot.
>
> Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
> ---
> drivers/gpu/nova-core/gsp.rs | 34 +++++++++++++++++++---------------
> drivers/gpu/nova-core/gsp/cmdq.rs | 15 ++++++++++++++-
> 2 files changed, 33 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/nova-core/gsp.rs b/drivers/gpu/nova-core/gsp.rs
> index 25cd48514c777cb405a2af0acf57196b2e2e7837..20170e483e04c476efce8997b3916b0ad829ed38 100644
> --- a/drivers/gpu/nova-core/gsp.rs
> +++ b/drivers/gpu/nova-core/gsp.rs
> @@ -47,16 +47,11 @@
> unsafe impl<const NUM_ENTRIES: usize> AsBytes for PteArray<NUM_ENTRIES> {}
>
> impl<const NUM_PAGES: usize> PteArray<NUM_PAGES> {
> - /// Creates a new page table array mapping `NUM_PAGES` GSP pages starting at address `start`.
> - fn new(start: DmaAddress) -> Result<Self> {
> - let mut ptes = [0u64; NUM_PAGES];
> - for (i, pte) in ptes.iter_mut().enumerate() {
> - *pte = start
> - .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
> - .ok_or(EOVERFLOW)?;
> - }
> -
> - Ok(Self(ptes))
> + /// Returns the page table entry for `index`, for a mapping starting at `start` DmaAddress.
> + fn entry(start: DmaAddress, index: usize) -> Result<u64> {
> + start
> + .checked_add(num::usize_as_u64(index) << GSP_PAGE_SHIFT)
> + .ok_or(EOVERFLOW)
> }
> }
>
> @@ -86,16 +81,25 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
> NUM_PAGES * GSP_PAGE_SIZE,
> GFP_KERNEL | __GFP_ZERO,
> )?);
> - let ptes = PteArray::<NUM_PAGES>::new(obj.0.dma_handle())?;
> +
> + let start_addr = obj.0.dma_handle();
>
> // SAFETY: `obj` has just been created and we are its sole user.
> - unsafe {
> - // Copy the self-mapping PTE at the expected location.
> + let pte_region = unsafe {
> obj.0
> - .as_slice_mut(size_of::<u64>(), size_of_val(&ptes))?
> - .copy_from_slice(ptes.as_bytes())
> + .as_slice_mut(size_of::<u64>(), NUM_PAGES * size_of::<u64>())?
> };
>
> + // This is a one by one GSP Page write to the memory
> + // to avoid stack overflow when allocating the whole array at once.
> + for (i, chunk) in pte_region.chunks_exact_mut(size_of::<u64>()).enumerate() {
> + let pte_value = start_addr
> + .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
> + .ok_or(EOVERFLOW)?;
> +
> + chunk.copy_from_slice(&pte_value.to_ne_bytes());
> + }
> +
> Ok(obj)
> }
> }
> diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
> index 0056bfbf0a44cfbc5a0ca08d069f881b877e1edc..c8327d3098f73f9b880eee99038ad10a16e1e32d 100644
> --- a/drivers/gpu/nova-core/gsp/cmdq.rs
> +++ b/drivers/gpu/nova-core/gsp/cmdq.rs
> @@ -202,7 +202,20 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>
> let gsp_mem =
> CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
> - dma_write!(gsp_mem, [0]?.ptes, PteArray::new(gsp_mem.dma_handle())?);
> +
> + const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
> +
> + let start = gsp_mem.dma_handle();
> + // One by one GSP Page write to the memory to avoid stack overflow when allocating
> + // the whole array at once.
> + for i in 0..NUM_PTES {
> + dma_write!(
> + gsp_mem,
> + [0]?.ptes.0[i],
> + PteArray::<NUM_PTES>::entry(start, i)?
Does `::<NUM_PTES>` need to be mentioned here, or is the compiler able
to infer it?
In any case, the updated patch
Acked-by: Alexandre Courbot <acourbot@nvidia.com>
Thanks!
On Tue Mar 10, 2026 at 1:40 AM GMT, Alexandre Courbot wrote:
> On Tue Mar 10, 2026 at 1:34 AM JST, Tim Kovalenko via B4 Relay wrote:
>> From: Tim Kovalenko <tim.kovalenko@proton.me>
>>
>> The `Cmdq::new` function was allocating a `PteArray` struct on the stack
>> and was causing a stack overflow with 8216 bytes.
>>
>> Modify the `PteArray` to calculate and write the Page Table Entries
>> directly into the coherent DMA buffer one-by-one. This reduces the stack
>> usage quite a lot.
>>
>> Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
>> ---
>> drivers/gpu/nova-core/gsp.rs | 34 +++++++++++++++++++---------------
>> drivers/gpu/nova-core/gsp/cmdq.rs | 15 ++++++++++++++-
>> 2 files changed, 33 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/gpu/nova-core/gsp.rs b/drivers/gpu/nova-core/gsp.rs
>> index 25cd48514c777cb405a2af0acf57196b2e2e7837..20170e483e04c476efce8997b3916b0ad829ed38 100644
>> --- a/drivers/gpu/nova-core/gsp.rs
>> +++ b/drivers/gpu/nova-core/gsp.rs
>> @@ -47,16 +47,11 @@
>> unsafe impl<const NUM_ENTRIES: usize> AsBytes for PteArray<NUM_ENTRIES> {}
>>
>> impl<const NUM_PAGES: usize> PteArray<NUM_PAGES> {
>> - /// Creates a new page table array mapping `NUM_PAGES` GSP pages starting at address `start`.
>> - fn new(start: DmaAddress) -> Result<Self> {
>> - let mut ptes = [0u64; NUM_PAGES];
>> - for (i, pte) in ptes.iter_mut().enumerate() {
>> - *pte = start
>> - .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
>> - .ok_or(EOVERFLOW)?;
>> - }
>> -
>> - Ok(Self(ptes))
>> + /// Returns the page table entry for `index`, for a mapping starting at `start` DmaAddress.
>> + fn entry(start: DmaAddress, index: usize) -> Result<u64> {
>> + start
>> + .checked_add(num::usize_as_u64(index) << GSP_PAGE_SHIFT)
>> + .ok_or(EOVERFLOW)
>> }
>> }
>>
>> @@ -86,16 +81,25 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>> NUM_PAGES * GSP_PAGE_SIZE,
>> GFP_KERNEL | __GFP_ZERO,
>> )?);
>> - let ptes = PteArray::<NUM_PAGES>::new(obj.0.dma_handle())?;
>> +
>> + let start_addr = obj.0.dma_handle();
>>
>> // SAFETY: `obj` has just been created and we are its sole user.
>> - unsafe {
>> - // Copy the self-mapping PTE at the expected location.
>> + let pte_region = unsafe {
>> obj.0
>> - .as_slice_mut(size_of::<u64>(), size_of_val(&ptes))?
>> - .copy_from_slice(ptes.as_bytes())
>> + .as_slice_mut(size_of::<u64>(), NUM_PAGES * size_of::<u64>())?
>> };
>>
>> + // This is a one by one GSP Page write to the memory
>> + // to avoid stack overflow when allocating the whole array at once.
>> + for (i, chunk) in pte_region.chunks_exact_mut(size_of::<u64>()).enumerate() {
>> + let pte_value = start_addr
>> + .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
>> + .ok_or(EOVERFLOW)?;
>> +
>> + chunk.copy_from_slice(&pte_value.to_ne_bytes());
>> + }
>> +
>> Ok(obj)
>> }
>> }
>> diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
>> index 0056bfbf0a44cfbc5a0ca08d069f881b877e1edc..c8327d3098f73f9b880eee99038ad10a16e1e32d 100644
>> --- a/drivers/gpu/nova-core/gsp/cmdq.rs
>> +++ b/drivers/gpu/nova-core/gsp/cmdq.rs
>> @@ -202,7 +202,20 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>>
>> let gsp_mem =
>> CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
>> - dma_write!(gsp_mem, [0]?.ptes, PteArray::new(gsp_mem.dma_handle())?);
>> +
>> + const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
>> +
>> + let start = gsp_mem.dma_handle();
>> + // One by one GSP Page write to the memory to avoid stack overflow when allocating
>> + // the whole array at once.
>> + for i in 0..NUM_PTES {
>> + dma_write!(
>> + gsp_mem,
>> + [0]?.ptes.0[i],
>> + PteArray::<NUM_PTES>::entry(start, i)?
>
> Does `::<NUM_PTES>` need to be mentioned here, or is the compiler able
> to infer it?
The function signature doesn't mention NUM_PTES at all, so no. In fact, perhaps
`entry` shouldn't be an associated method at all. (Even if it is, it probably
should be on `PteArray::<0>` or something.)
Best,
Gary
>
> In any case, the updated patch
>
> Acked-by: Alexandre Courbot <acourbot@nvidia.com>
>
> Thanks!
On Tue Mar 10, 2026 at 10:51 AM JST, Gary Guo wrote:
> On Tue Mar 10, 2026 at 1:40 AM GMT, Alexandre Courbot wrote:
>> On Tue Mar 10, 2026 at 1:34 AM JST, Tim Kovalenko via B4 Relay wrote:
>>> From: Tim Kovalenko <tim.kovalenko@proton.me>
>>>
>>> The `Cmdq::new` function was allocating a `PteArray` struct on the stack
>>> and was causing a stack overflow with 8216 bytes.
>>>
>>> Modify the `PteArray` to calculate and write the Page Table Entries
>>> directly into the coherent DMA buffer one-by-one. This reduces the stack
>>> usage quite a lot.
>>>
>>> Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
>>> ---
>>> drivers/gpu/nova-core/gsp.rs | 34 +++++++++++++++++++---------------
>>> drivers/gpu/nova-core/gsp/cmdq.rs | 15 ++++++++++++++-
>>> 2 files changed, 33 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/gpu/nova-core/gsp.rs b/drivers/gpu/nova-core/gsp.rs
>>> index 25cd48514c777cb405a2af0acf57196b2e2e7837..20170e483e04c476efce8997b3916b0ad829ed38 100644
>>> --- a/drivers/gpu/nova-core/gsp.rs
>>> +++ b/drivers/gpu/nova-core/gsp.rs
>>> @@ -47,16 +47,11 @@
>>> unsafe impl<const NUM_ENTRIES: usize> AsBytes for PteArray<NUM_ENTRIES> {}
>>>
>>> impl<const NUM_PAGES: usize> PteArray<NUM_PAGES> {
>>> - /// Creates a new page table array mapping `NUM_PAGES` GSP pages starting at address `start`.
>>> - fn new(start: DmaAddress) -> Result<Self> {
>>> - let mut ptes = [0u64; NUM_PAGES];
>>> - for (i, pte) in ptes.iter_mut().enumerate() {
>>> - *pte = start
>>> - .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
>>> - .ok_or(EOVERFLOW)?;
>>> - }
>>> -
>>> - Ok(Self(ptes))
>>> + /// Returns the page table entry for `index`, for a mapping starting at `start` DmaAddress.
>>> + fn entry(start: DmaAddress, index: usize) -> Result<u64> {
>>> + start
>>> + .checked_add(num::usize_as_u64(index) << GSP_PAGE_SHIFT)
>>> + .ok_or(EOVERFLOW)
>>> }
>>> }
>>>
>>> @@ -86,16 +81,25 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>>> NUM_PAGES * GSP_PAGE_SIZE,
>>> GFP_KERNEL | __GFP_ZERO,
>>> )?);
>>> - let ptes = PteArray::<NUM_PAGES>::new(obj.0.dma_handle())?;
>>> +
>>> + let start_addr = obj.0.dma_handle();
>>>
>>> // SAFETY: `obj` has just been created and we are its sole user.
>>> - unsafe {
>>> - // Copy the self-mapping PTE at the expected location.
>>> + let pte_region = unsafe {
>>> obj.0
>>> - .as_slice_mut(size_of::<u64>(), size_of_val(&ptes))?
>>> - .copy_from_slice(ptes.as_bytes())
>>> + .as_slice_mut(size_of::<u64>(), NUM_PAGES * size_of::<u64>())?
>>> };
>>>
>>> + // This is a one by one GSP Page write to the memory
>>> + // to avoid stack overflow when allocating the whole array at once.
>>> + for (i, chunk) in pte_region.chunks_exact_mut(size_of::<u64>()).enumerate() {
>>> + let pte_value = start_addr
>>> + .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
>>> + .ok_or(EOVERFLOW)?;
>>> +
>>> + chunk.copy_from_slice(&pte_value.to_ne_bytes());
>>> + }
>>> +
>>> Ok(obj)
>>> }
>>> }
>>> diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
>>> index 0056bfbf0a44cfbc5a0ca08d069f881b877e1edc..c8327d3098f73f9b880eee99038ad10a16e1e32d 100644
>>> --- a/drivers/gpu/nova-core/gsp/cmdq.rs
>>> +++ b/drivers/gpu/nova-core/gsp/cmdq.rs
>>> @@ -202,7 +202,20 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>>>
>>> let gsp_mem =
>>> CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
>>> - dma_write!(gsp_mem, [0]?.ptes, PteArray::new(gsp_mem.dma_handle())?);
>>> +
>>> + const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
>>> +
>>> + let start = gsp_mem.dma_handle();
>>> + // One by one GSP Page write to the memory to avoid stack overflow when allocating
>>> + // the whole array at once.
>>> + for i in 0..NUM_PTES {
>>> + dma_write!(
>>> + gsp_mem,
>>> + [0]?.ptes.0[i],
>>> + PteArray::<NUM_PTES>::entry(start, i)?
>>
>> Does `::<NUM_PTES>` need to be mentioned here, or is the compiler able
>> to infer it?
>
> The function signature doesn't mention NUM_PTES at all, so no. In fact, perhaps
> the `entry` shouldn't be an associated method at all (even if is, it probably
> should be of `PteArray::<0>` or something.
I had that thought as well - this calls for a redesign of the `PteArray`
business - but I also didn't want to interfere too much, as this fix is
very much (and urgently) needed. We will probably rewrite this once we
have access to the new I/O code anyway.
On Tue Mar 10, 2026 at 3:28 AM CET, Alexandre Courbot wrote:
> On Tue Mar 10, 2026 at 10:51 AM JST, Gary Guo wrote:
>> On Tue Mar 10, 2026 at 1:40 AM GMT, Alexandre Courbot wrote:
>>> On Tue Mar 10, 2026 at 1:34 AM JST, Tim Kovalenko via B4 Relay wrote:
>>>> diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
>>>> index 0056bfbf0a44cfbc5a0ca08d069f881b877e1edc..c8327d3098f73f9b880eee99038ad10a16e1e32d 100644
>>>> --- a/drivers/gpu/nova-core/gsp/cmdq.rs
>>>> +++ b/drivers/gpu/nova-core/gsp/cmdq.rs
>>>> @@ -202,7 +202,20 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>>>>
>>>> let gsp_mem =
>>>> CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
>>>> - dma_write!(gsp_mem, [0]?.ptes, PteArray::new(gsp_mem.dma_handle())?);
>>>> +
>>>> + const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
>>>> +
>>>> + let start = gsp_mem.dma_handle();
>>>> + // One by one GSP Page write to the memory to avoid stack overflow when allocating
>>>> + // the whole array at once.
>>>> + for i in 0..NUM_PTES {
>>>> + dma_write!(
>>>> + gsp_mem,
>>>> + [0]?.ptes.0[i],
>>>> + PteArray::<NUM_PTES>::entry(start, i)?
>>>
>>> Does `::<NUM_PTES>` need to be mentioned here, or is the compiler able
>>> to infer it?
>>
>> The function signature doesn't mention NUM_PTES at all, so no. In fact, perhaps
>> the `entry` shouldn't be an associated method at all (even if is, it probably
>> should be of `PteArray::<0>` or something.
I think <0> is probably the best choice for this fix for now.
> I had that thought as well - this calls for a redesign of the `PteArray`
> business - but also didn't want to interfere too much as this fix is
> very much (and quickly) needed. We will probably re-write this once we
> have access to the new I/O code anyway.
Not sure it actually needs a redesign, as I think this just goes away with I/O
projections. That's also why I would add the following TODO comment on entry().
// TODO: Replace with `IoView` projections once available.
I.e. it is just a workaround for now.
On Mon Mar 9, 2026 at 5:34 PM CET, Tim Kovalenko via B4 Relay wrote:
> From: Tim Kovalenko <tim.kovalenko@proton.me>
>
> The `Cmdq::new` function was allocating a `PteArray` struct on the stack
> and was causing a stack overflow with 8216 bytes.
>
> Modify the `PteArray` to calculate and write the Page Table Entries
> directly into the coherent DMA buffer one-by-one. This reduces the stack
> usage quite a lot.
>
Reported-by: Gary Guo <gary@garyguo.net>
Closes: https://rust-for-linux.zulipchat.com/#narrow/channel/509436-Nova/topic/.60Cmdq.3A.3Anew.60.20uses.20excessive.20stack.20size/near/570375549
Fixes: f38b4f105cfc ("gpu: nova-core: Create initial Gsp")
> Signed-off-by: Tim Kovalenko <tim.kovalenko@proton.me>
A few nits below, but I can fix them up [1] on apply.
> + for (i, chunk) in pte_region.chunks_exact_mut(size_of::<u64>()).enumerate() {
> + let pte_value = start_addr
> + .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
> + .ok_or(EOVERFLOW)?;
This should use PteArray::entry().
It would also be nice to get rid of the unsafe {} and use dma_write!() instead,
but this can be a follow-up patch.
> +
> + chunk.copy_from_slice(&pte_value.to_ne_bytes());
> + }
> +
> Ok(obj)
> }
> }
> diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
> index 0056bfbf0a44cfbc5a0ca08d069f881b877e1edc..c8327d3098f73f9b880eee99038ad10a16e1e32d 100644
> --- a/drivers/gpu/nova-core/gsp/cmdq.rs
> +++ b/drivers/gpu/nova-core/gsp/cmdq.rs
> @@ -202,7 +202,20 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
>
> let gsp_mem =
> CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
> - dma_write!(gsp_mem, [0]?.ptes, PteArray::new(gsp_mem.dma_handle())?);
> +
> + const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
We can avoid duplicating this by making it a constant of GspMem.
> + let start = gsp_mem.dma_handle();
> + // One by one GSP Page write to the memory to avoid stack overflow when allocating
> + // the whole array at once.
> + for i in 0..NUM_PTES {
> + dma_write!(
> + gsp_mem,
> + [0]?.ptes.0[i],
> + PteArray::<NUM_PTES>::entry(start, i)?
> + );
> + }
-- [1] --
diff --git a/drivers/gpu/nova-core/gsp.rs b/drivers/gpu/nova-core/gsp.rs
index 20170e483e04..b5ea14b7dad7 100644
--- a/drivers/gpu/nova-core/gsp.rs
+++ b/drivers/gpu/nova-core/gsp.rs
@@ -48,6 +48,7 @@ unsafe impl<const NUM_ENTRIES: usize> AsBytes for PteArray<NUM_ENTRIES> {}
impl<const NUM_PAGES: usize> PteArray<NUM_PAGES> {
/// Returns the page table entry for `index`, for a mapping starting at `start` DmaAddress.
+ // TODO: Replace with `IoView` projection once available.
fn entry(start: DmaAddress, index: usize) -> Result<u64> {
start
.checked_add(num::usize_as_u64(index) << GSP_PAGE_SHIFT)
@@ -90,12 +91,9 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
.as_slice_mut(size_of::<u64>(), NUM_PAGES * size_of::<u64>())?
};
- // This is a one by one GSP Page write to the memory
- // to avoid stack overflow when allocating the whole array at once.
+ // Write values one by one to avoid an on-stack instance of `PteArray`.
for (i, chunk) in pte_region.chunks_exact_mut(size_of::<u64>()).enumerate() {
- let pte_value = start_addr
- .checked_add(num::usize_as_u64(i) << GSP_PAGE_SHIFT)
- .ok_or(EOVERFLOW)?;
+ let pte_value = PteArray::<NUM_PAGES>::entry(start_addr, i)?;
chunk.copy_from_slice(&pte_value.to_ne_bytes());
}
diff --git a/drivers/gpu/nova-core/gsp/cmdq.rs b/drivers/gpu/nova-core/gsp/cmdq.rs
index 9107a1473797..aa42d180f0d5 100644
--- a/drivers/gpu/nova-core/gsp/cmdq.rs
+++ b/drivers/gpu/nova-core/gsp/cmdq.rs
@@ -159,7 +159,7 @@ struct Msgq {
#[repr(C)]
struct GspMem {
/// Self-mapping page table entries.
- ptes: PteArray<{ GSP_PAGE_SIZE / size_of::<u64>() }>,
+ ptes: PteArray<{ Self::PTE_ARRAY_SIZE }>,
/// CPU queue: the driver writes commands here, and the GSP reads them. It also contains the
/// write and read pointers that the CPU updates.
///
@@ -172,6 +172,10 @@ struct GspMem {
gspq: Msgq,
}
+impl GspMem {
+ const PTE_ARRAY_SIZE: usize = GSP_PAGE_SIZE / size_of::<u64>();
+}
+
// SAFETY: These structs don't meet the no-padding requirements of AsBytes but
// that is not a problem because they are not used outside the kernel.
unsafe impl AsBytes for GspMem {}
@@ -202,16 +206,14 @@ fn new(dev: &device::Device<device::Bound>) -> Result<Self> {
let gsp_mem =
CoherentAllocation::<GspMem>::alloc_coherent(dev, 1, GFP_KERNEL | __GFP_ZERO)?;
- const NUM_PTES: usize = GSP_PAGE_SIZE / size_of::<u64>();
-
let start = gsp_mem.dma_handle();
// One by one GSP Page write to the memory to avoid stack overflow when allocating
// the whole array at once.
- for i in 0..NUM_PTES {
+ for i in 0..GspMem::PTE_ARRAY_SIZE {
dma_write!(
gsp_mem,
[0]?.ptes.0[i],
- PteArray::<NUM_PTES>::entry(start, i)?
+ PteArray::<{ GspMem::PTE_ARRAY_SIZE }>::entry(start, i)?
);
}
On Mon, Mar 9, 2026 at 8:40 PM Danilo Krummrich <dakr@kernel.org> wrote:
>
> Reported-by: Gary Guo <gary@garyguo.net>
> Closes: https://rust-for-linux.zulipchat.com/#narrow/channel/509436-Nova/topic/.60Cmdq.3A.3Anew.60.20uses.20excessive.20stack.20size/near/570375549
> Fixes: f38b4f105cfc ("gpu: nova-core: Create initial Gsp")
`objtool` also complains about it because it doesn't handle certain
instructions, so probably:
Link: https://lore.kernel.org/rust-for-linux/CANiq72mAQxbRJZDnik3Qmd4phvFwPA01O2jwaaXRh_T+2=L-qA@mail.gmail.com/
Cheers,
Miguel
© 2016 - 2026 Red Hat, Inc.