sound/core/memalloc.c | 87 +++++++++---------------------------------- 1 file changed, 18 insertions(+), 69 deletions(-)
This patch attempted to work around a DMA issue involving Xen, but
causes subtle kernel memory corruption.
When I brought up this patch in the XenDevel matrix channel, I was
told that it had been requested by the Qubes OS developers because
they were trying to fix an issue where the sound stack would fail
after a few hours of uptime. They wound up disabling SG buffering
entirely instead as a workaround.
Accordingly, I propose that we should revert this workaround patch,
since it causes kernel memory corruption and that the ALSA and Xen
communities should collaborate on fixing the underlying problem in
such a way that SG buffering works correctly under Xen.
This reverts commit 53466ebdec614f915c691809b0861acecb941e30.
Signed-off-by: Ariadne Conill <ariadne@ariadne.space>
Cc: stable@vger.kernel.org
Cc: xen-devel@lists.xenproject.org
Cc: alsa-devel@alsa-project.org
Cc: Takashi Iwai <tiwai@suse.de>
---
sound/core/memalloc.c | 87 +++++++++----------------------------------
1 file changed, 18 insertions(+), 69 deletions(-)
diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c
index f901504b5afc..81025f50a542 100644
--- a/sound/core/memalloc.c
+++ b/sound/core/memalloc.c
@@ -541,15 +541,16 @@ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size)
struct sg_table *sgt;
void *p;
-#ifdef CONFIG_SND_DMA_SGBUF
- if (cpu_feature_enabled(X86_FEATURE_XENPV))
- return snd_dma_sg_fallback_alloc(dmab, size);
-#endif
sgt = dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir,
DEFAULT_GFP, 0);
#ifdef CONFIG_SND_DMA_SGBUF
- if (!sgt && !get_dma_ops(dmab->dev.dev))
+ if (!sgt && !get_dma_ops(dmab->dev.dev)) {
+ if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG)
+ dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK;
+ else
+ dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK;
return snd_dma_sg_fallback_alloc(dmab, size);
+ }
#endif
if (!sgt)
return NULL;
@@ -716,38 +717,19 @@ static const struct snd_malloc_ops snd_dma_sg_wc_ops = {
/* Fallback SG-buffer allocations for x86 */
struct snd_dma_sg_fallback {
- bool use_dma_alloc_coherent;
size_t count;
struct page **pages;
- /* DMA address array; the first page contains #pages in ~PAGE_MASK */
- dma_addr_t *addrs;
};
static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab,
struct snd_dma_sg_fallback *sgbuf)
{
- size_t i, size;
-
- if (sgbuf->pages && sgbuf->addrs) {
- i = 0;
- while (i < sgbuf->count) {
- if (!sgbuf->pages[i] || !sgbuf->addrs[i])
- break;
- size = sgbuf->addrs[i] & ~PAGE_MASK;
- if (WARN_ON(!size))
- break;
- if (sgbuf->use_dma_alloc_coherent)
- dma_free_coherent(dmab->dev.dev, size << PAGE_SHIFT,
- page_address(sgbuf->pages[i]),
- sgbuf->addrs[i] & PAGE_MASK);
- else
- do_free_pages(page_address(sgbuf->pages[i]),
- size << PAGE_SHIFT, false);
- i += size;
- }
- }
+ bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK;
+ size_t i;
+
+ for (i = 0; i < sgbuf->count && sgbuf->pages[i]; i++)
+ do_free_pages(page_address(sgbuf->pages[i]), PAGE_SIZE, wc);
kvfree(sgbuf->pages);
- kvfree(sgbuf->addrs);
kfree(sgbuf);
}
@@ -756,36 +738,24 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size)
struct snd_dma_sg_fallback *sgbuf;
struct page **pagep, *curp;
size_t chunk, npages;
- dma_addr_t *addrp;
dma_addr_t addr;
void *p;
-
- /* correct the type */
- if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_SG)
- dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK;
- else if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG)
- dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK;
+ bool wc = dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK;
sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL);
if (!sgbuf)
return NULL;
- sgbuf->use_dma_alloc_coherent = cpu_feature_enabled(X86_FEATURE_XENPV);
size = PAGE_ALIGN(size);
sgbuf->count = size >> PAGE_SHIFT;
sgbuf->pages = kvcalloc(sgbuf->count, sizeof(*sgbuf->pages), GFP_KERNEL);
- sgbuf->addrs = kvcalloc(sgbuf->count, sizeof(*sgbuf->addrs), GFP_KERNEL);
- if (!sgbuf->pages || !sgbuf->addrs)
+ if (!sgbuf->pages)
goto error;
pagep = sgbuf->pages;
- addrp = sgbuf->addrs;
- chunk = (PAGE_SIZE - 1) << PAGE_SHIFT; /* to fit in low bits in addrs */
+ chunk = size;
while (size > 0) {
chunk = min(size, chunk);
- if (sgbuf->use_dma_alloc_coherent)
- p = dma_alloc_coherent(dmab->dev.dev, chunk, &addr, DEFAULT_GFP);
- else
- p = do_alloc_pages(dmab->dev.dev, chunk, &addr, false);
+ p = do_alloc_pages(dmab->dev.dev, chunk, &addr, wc);
if (!p) {
if (chunk <= PAGE_SIZE)
goto error;
@@ -797,25 +767,17 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size)
size -= chunk;
/* fill pages */
npages = chunk >> PAGE_SHIFT;
- *addrp = npages; /* store in lower bits */
curp = virt_to_page(p);
- while (npages--) {
+ while (npages--)
*pagep++ = curp++;
- *addrp++ |= addr;
- addr += PAGE_SIZE;
- }
}
p = vmap(sgbuf->pages, sgbuf->count, VM_MAP, PAGE_KERNEL);
if (!p)
goto error;
-
- if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK)
- set_pages_array_wc(sgbuf->pages, sgbuf->count);
-
dmab->private_data = sgbuf;
/* store the first page address for convenience */
- dmab->addr = sgbuf->addrs[0] & PAGE_MASK;
+ dmab->addr = snd_sgbuf_get_addr(dmab, 0);
return p;
error:
@@ -825,23 +787,10 @@ static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size)
static void snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab)
{
- struct snd_dma_sg_fallback *sgbuf = dmab->private_data;
-
- if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK)
- set_pages_array_wb(sgbuf->pages, sgbuf->count);
vunmap(dmab->area);
__snd_dma_sg_fallback_free(dmab, dmab->private_data);
}
-static dma_addr_t snd_dma_sg_fallback_get_addr(struct snd_dma_buffer *dmab,
- size_t offset)
-{
- struct snd_dma_sg_fallback *sgbuf = dmab->private_data;
- size_t index = offset >> PAGE_SHIFT;
-
- return (sgbuf->addrs[index] & PAGE_MASK) | (offset & ~PAGE_MASK);
-}
-
static int snd_dma_sg_fallback_mmap(struct snd_dma_buffer *dmab,
struct vm_area_struct *area)
{
@@ -856,8 +805,8 @@ static const struct snd_malloc_ops snd_dma_sg_fallback_ops = {
.alloc = snd_dma_sg_fallback_alloc,
.free = snd_dma_sg_fallback_free,
.mmap = snd_dma_sg_fallback_mmap,
- .get_addr = snd_dma_sg_fallback_get_addr,
/* reuse vmalloc helpers */
+ .get_addr = snd_dma_vmalloc_get_addr,
.get_page = snd_dma_vmalloc_get_page,
.get_chunk_size = snd_dma_vmalloc_get_chunk_size,
};
--
2.39.2
On Fri, 06 Sep 2024 20:42:09 +0200, Ariadne Conill wrote: > > This patch attempted to work around a DMA issue involving Xen, but > causes subtle kernel memory corruption. > > When I brought up this patch in the XenDevel matrix channel, I was > told that it had been requested by the Qubes OS developers because > they were trying to fix an issue where the sound stack would fail > after a few hours of uptime. They wound up disabling SG buffering > entirely instead as a workaround. > > Accordingly, I propose that we should revert this workaround patch, > since it causes kernel memory corruption and that the ALSA and Xen > communities should collaborate on fixing the underlying problem in > such a way that SG buffering works correctly under Xen. > > This reverts commit 53466ebdec614f915c691809b0861acecb941e30. > > Signed-off-by: Ariadne Conill <ariadne@ariadne.space> > Cc: stable@vger.kernel.org > Cc: xen-devel@lists.xenproject.org > Cc: alsa-devel@alsa-project.org > Cc: Takashi Iwai <tiwai@suse.de> The relevant code has been largely rewritten for 6.12, so please check the behavior with sound.git tree for-next branch. I guess the same issue should happen as the Xen workaround was kept and applied there, too, but it has to be checked at first. If the issue is persistent with there, the fix for 6.12 code would be rather much simpler like the blow. thanks, Takashi --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -793,9 +793,6 @@ static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) int type = dmab->dev.type; void *p; - if (cpu_feature_enabled(X86_FEATURE_XENPV)) - return snd_dma_sg_fallback_alloc(dmab, size); - /* try the standard DMA API allocation at first */ if (type == SNDRV_DMA_TYPE_DEV_WC_SG) dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC;
On 07/09/2024 8:46 am, Takashi Iwai wrote: > On Fri, 06 Sep 2024 20:42:09 +0200, > Ariadne Conill wrote: >> This patch attempted to work around a DMA issue involving Xen, but >> causes subtle kernel memory corruption. >> >> When I brought up this patch in the XenDevel matrix channel, I was >> told that it had been requested by the Qubes OS developers because >> they were trying to fix an issue where the sound stack would fail >> after a few hours of uptime. They wound up disabling SG buffering >> entirely instead as a workaround. >> >> Accordingly, I propose that we should revert this workaround patch, >> since it causes kernel memory corruption and that the ALSA and Xen >> communities should collaborate on fixing the underlying problem in >> such a way that SG buffering works correctly under Xen. >> >> This reverts commit 53466ebdec614f915c691809b0861acecb941e30. >> >> Signed-off-by: Ariadne Conill <ariadne@ariadne.space> >> Cc: stable@vger.kernel.org >> Cc: xen-devel@lists.xenproject.org >> Cc: alsa-devel@alsa-project.org >> Cc: Takashi Iwai <tiwai@suse.de> > The relevant code has been largely rewritten for 6.12, so please check > the behavior with sound.git tree for-next branch. I guess the same > issue should happen as the Xen workaround was kept and applied there, > too, but it has to be checked at first. > > If the issue is persistent with there, the fix for 6.12 code would be > rather much simpler like the blow. > > > thanks, > > Takashi > > --- a/sound/core/memalloc.c > +++ b/sound/core/memalloc.c > @@ -793,9 +793,6 @@ static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) > int type = dmab->dev.type; > void *p; > > - if (cpu_feature_enabled(X86_FEATURE_XENPV)) > - return snd_dma_sg_fallback_alloc(dmab, size); > - > /* try the standard DMA API allocation at first */ > if (type == SNDRV_DMA_TYPE_DEV_WC_SG) > dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC; > > Individual subsystems ought not to know or care about XENPV; it's a layering violation. If the main APIs don't behave properly, then it probably means we've got a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) which is probably affecting other subsystems too. I think we need to re-analyse the original bug. Right now, the behaviour resulting from 53466ebde is worse than what it was trying to fix. ~Andrew
On Sat, Sep 07, 2024 at 11:38:50AM +0100, Andrew Cooper wrote: > Individual subsystems ought not to know or care about XENPV; it's a > layering violation. Agreed. > If the main APIs don't behave properly, then it probably means we've got > a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) > which is probably affecting other subsystems too. > > I think we need to re-analyse the original bug. Right now, the > behaviour resulting from 53466ebde is worse than what it was trying to fix. 53466ebde looks bogus to me, and the commit message doesn't even try to explain what bad behavior it works around. I'd also like to state once again that if you think something is broken about dma allocation or mapping helpers please Cc me and the iommu list. Most of the time it's actually the drivers doing something invalid, but sometimes it is a core dma layer bug or something that needs a proper API. Also while looking at the above commit I noticed the broken fallback code in snd_dma_noncontig_alloc - get_dma_ops is not for driver use, and starting with the code queued up for 6.12 will also return NULL when using dma-iommu for example.
On Thu, 12 Sep 2024 11:56:21 +0200, Christoph Hellwig wrote: > > On Sat, Sep 07, 2024 at 11:38:50AM +0100, Andrew Cooper wrote: > > Individual subsystems ought not to know or care about XENPV; it's a > > layering violation. > > Agreed. > > > If the main APIs don't behave properly, then it probably means we've got > > a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) > > which is probably affecting other subsystems too. > > > > I think we need to re-analyse the original bug. Right now, the > > behaviour resulting from 53466ebde is worse than what it was trying to fix. > > 53466ebde looks bogus to me, and the commit message doesn't even > try to explain what bad behavior it works around. I'd also like to > state once again that if you think something is broken about dma > allocation or mapping helpers please Cc me and the iommu list. > > Most of the time it's actually the drivers doing something invalid, but > sometimes it is a core dma layer bug or something that needs a proper > API. > > Also while looking at the above commit I noticed the broken fallback > code in snd_dma_noncontig_alloc - get_dma_ops is not for driver use, > and starting with the code queued up for 6.12 will also return NULL > when using dma-iommu for example. Yes, all those are really ugly hacks and have been already removed for 6.12. Let's hope everything works as expected with it. thanks, Takashi
On Mon, Sep 16, 2024 at 09:16:58AM +0200, Takashi Iwai wrote: > Yes, all those are really ugly hacks and have been already removed for > 6.12. Let's hope everything works as expected with it. The code currently in linux-next will not work as explained in my previous mail, because it tries to side step the DMA API and abuses get_dma_ops in an unsupported way.
On Mon, 16 Sep 2024 09:24:42 +0200, Christoph Hellwig wrote: > > On Mon, Sep 16, 2024 at 09:16:58AM +0200, Takashi Iwai wrote: > > Yes, all those are really ugly hacks and have been already removed for > > 6.12. Let's hope everything works as expected with it. > > The code currently in linux-next will not work as explained in my > previous mail, because it tries to side step the DMA API and abuses > get_dma_ops in an unsupported way. Those should have been removed since the last week. Could you check the today's linux-next tree? thanks, Takashi
On Mon, Sep 16, 2024 at 09:30:11AM +0200, Takashi Iwai wrote: > On Mon, 16 Sep 2024 09:24:42 +0200, > Christoph Hellwig wrote: > > > > On Mon, Sep 16, 2024 at 09:16:58AM +0200, Takashi Iwai wrote: > > > Yes, all those are really ugly hacks and have been already removed for > > > 6.12. Let's hope everything works as expected with it. > > > > The code currently in linux-next will not work as explained in my > > previous mail, because it tries to side step the DMA API and abuses > > get_dma_ops in an unsupported way. > > Those should have been removed since the last week. > Could you check the today's linux-next tree? Ok, looks like the Thursday updates fix the dma_get_ops abuse. They introduce new bugs at least for architectures with virtuall indexed caches by combining vmap and dma mappings without mainintaining the cache coherency using the proper helpers. What confuses my about this is the need to set the DMAable memory to write combinable. How does that improve things over the default writeback cached memory on x86? We could trivially add support for WC mappings for cache coherent DMA, but someone needs to explain how that actually makes sense first.
On Mon, 16 Sep 2024 09:37:07 +0200, Christoph Hellwig wrote: > > On Mon, Sep 16, 2024 at 09:30:11AM +0200, Takashi Iwai wrote: > > On Mon, 16 Sep 2024 09:24:42 +0200, > > Christoph Hellwig wrote: > > > > > > On Mon, Sep 16, 2024 at 09:16:58AM +0200, Takashi Iwai wrote: > > > > Yes, all those are really ugly hacks and have been already removed for > > > > 6.12. Let's hope everything works as expected with it. > > > > > > The code currently in linux-next will not work as explained in my > > > previous mail, because it tries to side step the DMA API and abuses > > > get_dma_ops in an unsupported way. > > > > Those should have been removed since the last week. > > Could you check the today's linux-next tree? > > Ok, looks like the Thursday updates fix the dma_get_ops abuse. > > They introduce new bugs at least for architectures with virtuall > indexed caches by combining vmap and dma mappings without > mainintaining the cache coherency using the proper helpers. Yes, but it should be OK, as those functions are applied only for x86. Others should use noncontig DMA instead, if any. > What confuses my about this is the need to set the DMAable memory > to write combinable. How does that improve things over the default > writeback cached memory on x86? We could trivially add support for > WC mappings for cache coherent DMA, but someone needs to explain > how that actually makes sense first. It's required for a few sound hardware including some HD-audio controllers like old AMD graphics chips, at least. Most of other hardware work in PCI snooping with the default wb pages but those chips lead to either stuttering or silence. It seems that Windows uses wc or uc pages, hence they aren't designed to work in other way. thanks, Takashi
On Sat, Sep 07, 2024 at 11:38:50AM +0100, Andrew Cooper wrote: > On 07/09/2024 8:46 am, Takashi Iwai wrote: > > On Fri, 06 Sep 2024 20:42:09 +0200, > > Ariadne Conill wrote: > >> This patch attempted to work around a DMA issue involving Xen, but > >> causes subtle kernel memory corruption. > >> > >> When I brought up this patch in the XenDevel matrix channel, I was > >> told that it had been requested by the Qubes OS developers because > >> they were trying to fix an issue where the sound stack would fail > >> after a few hours of uptime. They wound up disabling SG buffering > >> entirely instead as a workaround. > >> > >> Accordingly, I propose that we should revert this workaround patch, > >> since it causes kernel memory corruption and that the ALSA and Xen > >> communities should collaborate on fixing the underlying problem in > >> such a way that SG buffering works correctly under Xen. > >> > >> This reverts commit 53466ebdec614f915c691809b0861acecb941e30. > >> > >> Signed-off-by: Ariadne Conill <ariadne@ariadne.space> > >> Cc: stable@vger.kernel.org > >> Cc: xen-devel@lists.xenproject.org > >> Cc: alsa-devel@alsa-project.org > >> Cc: Takashi Iwai <tiwai@suse.de> > > The relevant code has been largely rewritten for 6.12, so please check > > the behavior with sound.git tree for-next branch. I guess the same > > issue should happen as the Xen workaround was kept and applied there, > > too, but it has to be checked at first. > > > > If the issue is persistent with there, the fix for 6.12 code would be > > rather much simpler like the blow. > > > > > > thanks, > > > > Takashi > > > > --- a/sound/core/memalloc.c > > +++ b/sound/core/memalloc.c > > @@ -793,9 +793,6 @@ static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) > > int type = dmab->dev.type; > > void *p; > > > > - if (cpu_feature_enabled(X86_FEATURE_XENPV)) > > - return snd_dma_sg_fallback_alloc(dmab, size); > > - > > /* try the standard DMA API allocation at first */ > > if (type == SNDRV_DMA_TYPE_DEV_WC_SG) > > dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC; > > > > > > Individual subsystems ought not to know or care about XENPV; it's a > layering violation. > > If the main APIs don't behave properly, then it probably means we've got > a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) > which is probably affecting other subsystems too. This is a big problem. Debian bug #988477 (https://bugs.debian.org/988477) showed up in May 2021. While some characteristics are quite different, the time when it was first reported is similar to the above and it is also likely a DMA bug with Xen. -- (\___(\___(\______ --=> 8-) EHM <=-- ______/)___/)___/) \BS ( | ehem+sigmsg@m5p.com PGP 87145445 | ) / \_CS\ | _____ -O #include <stddisclaimer.h> O- _____ | / _/ 8A19\___\_|_/58D2 7E3D DDF4 7BA6 <-PGP-> 41D1 B375 37D0 8714\_|_/___/5445
On Mon, 09 Sep 2024 22:02:08 +0200, Elliott Mitchell wrote: > > On Sat, Sep 07, 2024 at 11:38:50AM +0100, Andrew Cooper wrote: > > On 07/09/2024 8:46 am, Takashi Iwai wrote: > > > On Fri, 06 Sep 2024 20:42:09 +0200, > > > Ariadne Conill wrote: > > >> This patch attempted to work around a DMA issue involving Xen, but > > >> causes subtle kernel memory corruption. > > >> > > >> When I brought up this patch in the XenDevel matrix channel, I was > > >> told that it had been requested by the Qubes OS developers because > > >> they were trying to fix an issue where the sound stack would fail > > >> after a few hours of uptime. They wound up disabling SG buffering > > >> entirely instead as a workaround. > > >> > > >> Accordingly, I propose that we should revert this workaround patch, > > >> since it causes kernel memory corruption and that the ALSA and Xen > > >> communities should collaborate on fixing the underlying problem in > > >> such a way that SG buffering works correctly under Xen. > > >> > > >> This reverts commit 53466ebdec614f915c691809b0861acecb941e30. > > >> > > >> Signed-off-by: Ariadne Conill <ariadne@ariadne.space> > > >> Cc: stable@vger.kernel.org > > >> Cc: xen-devel@lists.xenproject.org > > >> Cc: alsa-devel@alsa-project.org > > >> Cc: Takashi Iwai <tiwai@suse.de> > > > The relevant code has been largely rewritten for 6.12, so please check > > > the behavior with sound.git tree for-next branch. I guess the same > > > issue should happen as the Xen workaround was kept and applied there, > > > too, but it has to be checked at first. > > > > > > If the issue is persistent with there, the fix for 6.12 code would be > > > rather much simpler like the blow. > > > > > > > > > thanks, > > > > > > Takashi > > > > > > --- a/sound/core/memalloc.c > > > +++ b/sound/core/memalloc.c > > > @@ -793,9 +793,6 @@ static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) > > > int type = dmab->dev.type; > > > void *p; > > > > > > - if (cpu_feature_enabled(X86_FEATURE_XENPV)) > > > - return snd_dma_sg_fallback_alloc(dmab, size); > > > - > > > /* try the standard DMA API allocation at first */ > > > if (type == SNDRV_DMA_TYPE_DEV_WC_SG) > > > dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC; > > > > > > > > > > Individual subsystems ought not to know or care about XENPV; it's a > > layering violation. > > > > If the main APIs don't behave properly, then it probably means we've got > > a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) > > which is probably affecting other subsystems too. > > This is a big problem. Debian bug #988477 (https://bugs.debian.org/988477) > showed up in May 2021. While some characteristics are quite different, > the time when it was first reported is similar to the above and it is > also likely a DMA bug with Xen. Yes, some incompatible behavior has been seen on Xen wrt DMA buffer handling, as it seems. But note that, in the case of above, it was triggered by the change in the sound driver side, hence we needed a quick workaround there. The result was to move back to the old method for Xen in the end. As already mentioned in another mail, the whole code was changed for 6.12, and the revert isn't applicable in anyway. So I'm going to submit another patch to drop this Xen PV-specific workaround for 6.12. The new code should work without the workaround (famous last words). If the problem happens there, I'd rather leave it to Xen people ;) thanks, Takashi
On Tue, Sep 10, 2024 at 01:17:03PM +0200, Takashi Iwai wrote: > On Mon, 09 Sep 2024 22:02:08 +0200, > Elliott Mitchell wrote: > > > > On Sat, Sep 07, 2024 at 11:38:50AM +0100, Andrew Cooper wrote: > > > > > > Individual subsystems ought not to know or care about XENPV; it's a > > > layering violation. > > > > > > If the main APIs don't behave properly, then it probably means we've got > > > a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) > > > which is probably affecting other subsystems too. > > > > This is a big problem. Debian bug #988477 (https://bugs.debian.org/988477) > > showed up in May 2021. While some characteristics are quite different, > > the time when it was first reported is similar to the above and it is > > also likely a DMA bug with Xen. > > Yes, some incompatible behavior has been seen on Xen wrt DMA buffer > handling, as it seems. But note that, in the case of above, it was > triggered by the change in the sound driver side, hence we needed a > quick workaround there. The result was to move back to the old method > for Xen in the end. > > As already mentioned in another mail, the whole code was changed for > 6.12, and the revert isn't applicable in anyway. > > So I'm going to submit another patch to drop this Xen PV-specific > workaround for 6.12. The new code should work without the workaround > (famous last words). If the problem happens there, I'd rather leave > it to Xen people ;) I've seen that patch, but haven't seen any other activity related to this sound problem. I'm wondering whether the problem got fixed by something else, there is activity on different lists I don't see, versus no activity until Qubes OS discovers it is again broken. An overview of the other bug which may or may not be the same as this sound card bug: Both reproductions of the RAID1 bug have been on systems with AMD processors. This may indicate this is distinct, but could also mean only people who get AMD processors are wary enough of flash to bother with RAID1 on flash devices. Presently I suspect it is the latter, but not very many people are bothering with RAID1 with flash. Only systems with IOMMUv2 (full IOMMU, not merely GART) are effected. Samsung SATA devices are severely effected. Crucial/Micron NVMe devices are mildly effected. Crucial/Micron SATA devices are uneffected. Specifications for Samsung SATA and Crucial/Micron SATA devices are fairly similar. Similar IOps, similar bandwith capabilities. Crucial/Micron NVMe devices have massively superior specifications to the Samsung SATA devices. Yet the Crucial/Micron NVMe devices are less severely effected than the Samsung SATA devices. This seems likely to be a latency issue. Could be when commands are sent to the Samsung SATA devices, they are fast enough to start executing them before the IOMMU is ready. This could match with the sound driver issue. Since the sound hardware is able to execute its first command with minimal latency, that is when the problem occurs. If the first command gets through, the second command is likely executed with some delay and the IOMMU is reliably ready. -- (\___(\___(\______ --=> 8-) EHM <=-- ______/)___/)___/) \BS ( | ehem+sigmsg@m5p.com PGP 87145445 | ) / \_CS\ | _____ -O #include <stddisclaimer.h> O- _____ | / _/ 8A19\___\_|_/58D2 7E3D DDF4 7BA6 <-PGP-> 41D1 B375 37D0 8714\_|_/___/5445
On Sat, 07 Sep 2024 12:38:50 +0200, Andrew Cooper wrote: > > On 07/09/2024 8:46 am, Takashi Iwai wrote: > > On Fri, 06 Sep 2024 20:42:09 +0200, > > Ariadne Conill wrote: > >> This patch attempted to work around a DMA issue involving Xen, but > >> causes subtle kernel memory corruption. > >> > >> When I brought up this patch in the XenDevel matrix channel, I was > >> told that it had been requested by the Qubes OS developers because > >> they were trying to fix an issue where the sound stack would fail > >> after a few hours of uptime. They wound up disabling SG buffering > >> entirely instead as a workaround. > >> > >> Accordingly, I propose that we should revert this workaround patch, > >> since it causes kernel memory corruption and that the ALSA and Xen > >> communities should collaborate on fixing the underlying problem in > >> such a way that SG buffering works correctly under Xen. > >> > >> This reverts commit 53466ebdec614f915c691809b0861acecb941e30. > >> > >> Signed-off-by: Ariadne Conill <ariadne@ariadne.space> > >> Cc: stable@vger.kernel.org > >> Cc: xen-devel@lists.xenproject.org > >> Cc: alsa-devel@alsa-project.org > >> Cc: Takashi Iwai <tiwai@suse.de> > > The relevant code has been largely rewritten for 6.12, so please check > > the behavior with sound.git tree for-next branch. I guess the same > > issue should happen as the Xen workaround was kept and applied there, > > too, but it has to be checked at first. > > > > If the issue is persistent with there, the fix for 6.12 code would be > > rather much simpler like the blow. > > > > > > thanks, > > > > Takashi > > > > --- a/sound/core/memalloc.c > > +++ b/sound/core/memalloc.c > > @@ -793,9 +793,6 @@ static void *snd_dma_sg_alloc(struct snd_dma_buffer *dmab, size_t size) > > int type = dmab->dev.type; > > void *p; > > > > - if (cpu_feature_enabled(X86_FEATURE_XENPV)) > > - return snd_dma_sg_fallback_alloc(dmab, size); > > - > > /* try the standard DMA API allocation at first */ > > if (type == SNDRV_DMA_TYPE_DEV_WC_SG) > > dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC; > > > > > > Individual subsystems ought not to know or care about XENPV; it's a > layering violation. > > If the main APIs don't behave properly, then it probably means we've got > a bug at a lower level (e.g. Xen SWIOTLB is a constant source of fun) > which is probably affecting other subsystems too. > > I think we need to re-analyse the original bug. Right now, the > behaviour resulting from 53466ebde is worse than what it was trying to fix. I agree with its hackishness and ugliness there; it was to fix the regression at that time. And, it definitely needs a cleaner solution. But the fact is that this change was applied quite some time ago (over a year). It implies that it's no urgent regression that has to be addressed in the next few days for 6.11-final. Meanwhile we have largish code changes for 6.12, hence the revert can't be applied. So I'm asking testing the latest code for 6.12, and work on there. thanks, Takashi
© 2016 - 2024 Red Hat, Inc.