[v4] ati-vga: Implement HOST_DATA transfers to enable X.org text rendering

[PATCH v4 9/9] ati-vga: Implement HOST_DATA flush to VRAM

Posted by Chad Jablonski 1 month ago

Implement flushing the 128-bit HOST_DATA accumulator to VRAM to enable
text rendering in X. Currently supports only the monochrome
foreground/background datatype with the SRCCOPY ROP.

The flush is broken up into two steps for clarity. First, expansion of the
monochrome bits to the destination color depth. Then the expanded pixels
are clipped and copied into VRAM.

Signed-off-by: Chad Jablonski <chad@jablonski.xyz>
---
 hw/display/ati.c     |   4 +-
 hw/display/ati_2d.c  | 124 +++++++++++++++++++++++++++++++++++++++++++
 hw/display/ati_int.h |   3 ++
 3 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/hw/display/ati.c b/hw/display/ati.c
index 88d30bf532..cc5899981b 100644
--- a/hw/display/ati.c
+++ b/hw/display/ati.c
@@ -1064,13 +1064,13 @@ static void ati_mm_write(void *opaque, hwaddr addr,
     case HOST_DATA7:
         s->host_data.acc[s->host_data.next++] = data;
         if (s->host_data.next >= 4) {
-            qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
+            ati_flush_host_data(s);
             s->host_data.next = 0;
         }
         break;
     case HOST_DATA_LAST:
         s->host_data.acc[s->host_data.next] = data;
-        qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
+        ati_flush_host_data(s);
         ati_host_data_reset(&s->host_data);
         break;
     default:
diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c
index 6c36e55412..19130ed291 100644
--- a/hw/display/ati_2d.c
+++ b/hw/display/ati_2d.c
@@ -117,6 +117,11 @@ static ATIBlitDest setup_2d_blt_dst(ATIVGAState *s)
 
 void ati_2d_blt(ATIVGAState *s)
 {
+    if (s->regs.dp_src_source == GMC_SRC_SOURCE_HOST_DATA) {
+        /* HOST_DATA blits are handled separately by ati_flush_host_data() */
+        return;
+    }
+
     /* FIXME it is probably more complex than this and may need to be */
     /* rewritten but for now as a start just to get some output: */
     DisplaySurface *ds = qemu_console_surface(s->vga.con);
@@ -293,3 +298,122 @@ void ati_2d_blt(ATIVGAState *s)
                       s->regs.dp_rop3);
     }
 }
+
+void ati_flush_host_data(ATIVGAState *s)
+{
+    DisplaySurface *ds;
+    ATIBlitDest dst;
+    uint32_t fg, bg;
+    unsigned bypp, row, col, idx;
+    uint8_t pix_buf[ATI_HOST_DATA_ACC_BITS * sizeof(uint32_t)];
+
+    if (s->regs.dp_src_source != GMC_SRC_SOURCE_HOST_DATA) {
+        qemu_log_mask(LOG_UNIMP,
+                      "host_data_blt: only GMC_SRC_SOURCE_HOST_DATA "
+                      "supported\n");
+        return;
+    }
+
+    if (s->regs.dp_src_datatype != GMC_SRC_DATATYPE_MONO_FRGD_BKGD) {
+        qemu_log_mask(LOG_UNIMP,
+                      "host_data_blt: only GMC_SRC_DATATYPE_MONO_FRGD_BKGD "
+                      "supported\n");
+        return;
+    }
+
+    if (s->regs.dp_rop3 != ROP3_SRCCOPY) {
+        qemu_log_mask(LOG_UNIMP,
+                      "host_data_blt: only ROP3_SRCCOPY supported. rop: %x\n",
+                      s->regs.dp_rop3);
+        return;
+    }
+
+    dst = setup_2d_blt_dst(s);
+    if (!dst.valid) {
+        return;
+    }
+
+    if (!dst.left_to_right || !dst.top_to_bottom) {
+        qemu_log_mask(LOG_UNIMP, "host_data_blt: only L->R, T->B supported\n");
+        return;
+    }
+
+    fg = s->regs.dp_src_frgd_clr;
+    bg = s->regs.dp_src_bkgd_clr;
+    bypp = dst.bpp / 8;
+
+    /* Expand monochrome bits to color pixels */
+    idx = 0;
+    for (int word = 0; word < 4; word++) {
+        for (int byte = 0; byte < 4; byte++) {
+            uint8_t byte_val = s->host_data.acc[word] >> (byte * 8);
+            for (int i = 0; i < 8; i++) {
+                int bit = s->regs.byte_pix_order ? i : (7 - i);
+                bool is_fg = extract8(byte_val, bit, 1);
+                uint32_t color = is_fg ? fg : bg;
+                stn_he_p(&pix_buf[idx * bypp], bypp, color);
+                idx += 1;
+            }
+        }
+    }
+
+    /* Copy to VRAM one scanline at a time */
+    row = s->host_data.row;
+    col = s->host_data.col;
+    idx = 0;
+    while (idx < ATI_HOST_DATA_ACC_BITS && row < dst.rect.height) {
+        uint8_t *vram_dst;
+        unsigned start_col, end_col, vis_row, num_pix, pix_idx;
+        unsigned pix_in_scanline = MIN(ATI_HOST_DATA_ACC_BITS -
+                                       idx, dst.rect.width - col);
+
+        /* Row-based clipping */
+        if (row < dst.src_top_offset ||
+            row >= dst.src_top_offset + dst.visible.height) {
+            goto skip_pix;
+        }
+
+        /* Column-based clipping */
+        start_col = MAX(col, dst.src_left_offset);
+        end_col = MIN(col + pix_in_scanline,
+                      dst.src_left_offset + dst.visible.width);
+        if (end_col <= start_col) {
+            goto skip_pix;
+        }
+
+        /* Copy expanded bits/pixels to VRAM */
+        vis_row = row - dst.src_top_offset;
+        num_pix = end_col - start_col;
+        vram_dst = dst.bits +
+                   (dst.visible.y + vis_row) * dst.stride +
+                   (dst.visible.x + (start_col - dst.src_left_offset)) * bypp;
+
+        pix_idx = (idx + (start_col - col)) * bypp;
+        memcpy(vram_dst, &pix_buf[pix_idx], num_pix * bypp);
+
+    skip_pix:
+        idx += pix_in_scanline;
+        col += pix_in_scanline;
+        if (col >= dst.rect.width) {
+            col = 0;
+            row += 1;
+        }
+    }
+    /* Track state of the overall blit for use by the next flush */
+    s->host_data.row = row;
+    s->host_data.col = col;
+
+    /*
+     * TODO: This is setting the entire blit region to dirty.
+     *       We maybe just need this tiny section?
+     */
+    ds = qemu_console_surface(s->vga.con);
+    if (dst.bits >= s->vga.vram_ptr + s->vga.vbe_start_addr &&
+        dst.bits < s->vga.vram_ptr + s->vga.vbe_start_addr +
+        s->vga.vbe_regs[VBE_DISPI_INDEX_YRES] * s->vga.vbe_line_offset) {
+        memory_region_set_dirty(&s->vga.vram, s->vga.vbe_start_addr +
+                                s->regs.dst_offset +
+                                dst.visible.y * surface_stride(ds),
+                                dst.visible.height * surface_stride(ds));
+    }
+}
diff --git a/hw/display/ati_int.h b/hw/display/ati_int.h
index 3029dc7e3c..448daf44a9 100644
--- a/hw/display/ati_int.h
+++ b/hw/display/ati_int.h
@@ -30,6 +30,8 @@
 /* Radeon RV100 (VE) */
 #define PCI_DEVICE_ID_ATI_RADEON_QY 0x5159
 
+#define ATI_HOST_DATA_ACC_BITS 128
+
 #define TYPE_ATI_VGA "ati-vga"
 OBJECT_DECLARE_SIMPLE_TYPE(ATIVGAState, ATI_VGA)
 
@@ -127,5 +129,6 @@ struct ATIVGAState {
 const char *ati_reg_name(int num);
 
 void ati_2d_blt(ATIVGAState *s);
+void ati_flush_host_data(ATIVGAState *s);
 
 #endif /* ATI_INT_H */
-- 
2.51.2

Re: [PATCH v4 9/9] ati-vga: Implement HOST_DATA flush to VRAM

Posted by BALATON Zoltan 1 month ago

On Tue, 6 Jan 2026, Chad Jablonski wrote:
> Implement flushing the 128-bit HOST_DATA accumulator to VRAM to enable
> text rendering in X. Currently supports only the monochrome
> foreground/background datatype with the SRCCOPY ROP.
>
> The flush is broken up into two steps for clarity. First, expansion of the
> monochrome bits to the destination color depth. Then the expanded pixels
> are clipped and copied into VRAM.
>
> Signed-off-by: Chad Jablonski <chad@jablonski.xyz>
> ---
> hw/display/ati.c     |   4 +-
> hw/display/ati_2d.c  | 124 +++++++++++++++++++++++++++++++++++++++++++
> hw/display/ati_int.h |   3 ++
> 3 files changed, 129 insertions(+), 2 deletions(-)
>
> diff --git a/hw/display/ati.c b/hw/display/ati.c
> index 88d30bf532..cc5899981b 100644
> --- a/hw/display/ati.c
> +++ b/hw/display/ati.c
> @@ -1064,13 +1064,13 @@ static void ati_mm_write(void *opaque, hwaddr addr,
>     case HOST_DATA7:
>         s->host_data.acc[s->host_data.next++] = data;
>         if (s->host_data.next >= 4) {
> -            qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
> +            ati_flush_host_data(s);
>             s->host_data.next = 0;
>         }
>         break;
>     case HOST_DATA_LAST:
>         s->host_data.acc[s->host_data.next] = data;
> -        qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
> +        ati_flush_host_data(s);
>         ati_host_data_reset(&s->host_data);
>         break;
>     default:
> diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c
> index 6c36e55412..19130ed291 100644
> --- a/hw/display/ati_2d.c
> +++ b/hw/display/ati_2d.c
> @@ -117,6 +117,11 @@ static ATIBlitDest setup_2d_blt_dst(ATIVGAState *s)
>
> void ati_2d_blt(ATIVGAState *s)
> {
> +    if (s->regs.dp_src_source == GMC_SRC_SOURCE_HOST_DATA) {
> +        /* HOST_DATA blits are handled separately by ati_flush_host_data() */
> +        return;
> +    }
> +

I wonder if we could reuse the existing blt function here and keep pixman 
optimisations and avoid duplicating code by filling a src buffer with the 
expanded data after we accumulated the 128 bits and we need to flush and 
then call ati_2d_blt with that as src and dst modified to match where it 
should go instead of the full blit area that the registers describe. That 
needs to split ati_2d_blt or it should take a blit state instead of 
ATIVGAState that can either be filled from regs for a regular blit or for 
a smaller part for a host data blit so that way host data is just split 
into smaller 128 bit blits which I think is how the actual GPU works as 
well. Does that make sense? Do you think that could work?

Regards,
BALATON Zoltan

Re: [PATCH v4 9/9] ati-vga: Implement HOST_DATA flush to VRAM

Posted by Chad Jablonski 1 month ago

>
> I wonder if we could reuse the existing blt function here and keep pixman 
> optimisations and avoid duplicating code by filling a src buffer with the 
> expanded data after we accumulated the 128 bits and we need to flush and 
> then call ati_2d_blt with that as src and dst modified to match where it 
> should go instead of the full blit area that the registers describe. That 
> needs to split ati_2d_blt or it should take a blit state instead of 
> ATIVGAState that can either be filled from regs for a regular blit or for 
> a smaller part for a host data blit so that way host data is just split 
> into smaller 128 bit blits which I think is how the actual GPU works as 
> well. Does that make sense? Do you think that could work?
>

I considered this. I initially rolled this back into the standard ati_2d_blt
function but when I discovered the accumulator behavior it changed things a
bit. The thing that seemed off to me was that you can end up with "jagged"
blits from the accumulator. Meaning if you have a host_data blit with a width
of 100px and you flush the accumulator you don't end up with a rectangular
region to blit with pixman. You have a 100px line and then a 28px line that
you need to initiate two separate blits via pixman for. My gut told me that
the overhead of breaking things up and potentially making multiple calls to
pixman per flush would make it not worth it. But I admit I never tested it!

Another thing that isn't a problem in this patch but will complicate things
in the future is transparency (bg pixel leave alone). I get the impression
there may be some way to handle this with pixman using masks to avoid breaking
blits up even more but I haven't fully investigated that. Actually as I type
this I wonder if that same sort of masking could be used to avoid multiple
calls in the current implementation.

I may be totally off on how we could do this with pixman though too. I'm new to
it so I may not be aware of everything it can do.

Re: [PATCH v4 9/9] ati-vga: Implement HOST_DATA flush to VRAM

Posted by BALATON Zoltan 1 month ago

On Tue, 6 Jan 2026, Chad Jablonski wrote:
>> I wonder if we could reuse the existing blt function here and keep pixman
>> optimisations and avoid duplicating code by filling a src buffer with the
>> expanded data after we accumulated the 128 bits and we need to flush and
>> then call ati_2d_blt with that as src and dst modified to match where it
>> should go instead of the full blit area that the registers describe. That
>> needs to split ati_2d_blt or it should take a blit state instead of
>> ATIVGAState that can either be filled from regs for a regular blit or for
>> a smaller part for a host data blit so that way host data is just split
>> into smaller 128 bit blits which I think is how the actual GPU works as
>> well. Does that make sense? Do you think that could work?
>>
>
> I considered this. I initially rolled this back into the standard ati_2d_blt
> function but when I discovered the accumulator behavior it changed things a
> bit. The thing that seemed off to me was that you can end up with "jagged"
> blits from the accumulator. Meaning if you have a host_data blit with a width
> of 100px and you flush the accumulator you don't end up with a rectangular
> region to blit with pixman. You have a 100px line and then a 28px line that
> you need to initiate two separate blits via pixman for. My gut told me that
> the overhead of breaking things up and potentially making multiple calls to
> pixman per flush would make it not worth it. But I admit I never tested it!

We have fallbacks in the ati_2d_blt function so if pixman does not help 
it's easy to call the function with parameters that make it use the fall 
back. I just think it would be better to have the part that writes to the 
frame buffer at one place so the checks for writing outside the valid 
memory and doing all the pixel combining is not duplicated. So even if we 
don't use pixman it may be better to do the actual blit with the same 
function that does normal blits. I did not consider combining flushes just 
calling the same function for each line when flushing and if pixman proves 
to be an overhead we can use the fallback instead.

> Another thing that isn't a problem in this patch but will complicate things
> in the future is transparency (bg pixel leave alone). I get the impression

Isn't that some ROP which we need to implement for blits using 
transparency anyway? I think the host data gets either straight pixels 
that we need to copy or some bitmask that need to be extended either with 
fg bg or fg over existing image. The latter could be implemented by not 
starting with empty row buffer but getting the row from the frame buffer 
then masking and ORing or whatever combining is needed. But if we have a 
ROP for that even for normal blits we'll need that anyway in the blit 
function.

> there may be some way to handle this with pixman using masks to avoid breaking
> blits up even more but I haven't fully investigated that. Actually as I type
> this I wonder if that same sort of masking could be used to avoid multiple
> calls in the current implementation.

This could be investigated later as possible optimisation but maybe not 
needed at first.

> I may be totally off on how we could do this with pixman though too. I'm new to
> it so I may not be aware of everything it can do.

Pixman has no documentation so the only info is looking at the source. But 
maybe we don't need to change anything if the host data flush prepares the 
source buffer by either using the data for datatype 3, adding bg/fg for 
mono extract and either copying the dest row first into the buffer then do 
the fg with transparency from host data bits then finally in all 3 cases 
just call ati_2d_blt with the prepared source buffer. That way no changes 
to ati_2d_blt is needed other than allowing it to take a state with the 
source and dest data rather than getting it from registers. The part that 
gets data from regs can be split out then what's left is what we need for 
host data. I haven't thought through it in detail but I think it could 
work and avoid duplicating the part that writes pixels to frame buffer.

Regards,
BALATON Zoltan

Re: [PATCH v4 9/9] ati-vga: Implement HOST_DATA flush to VRAM

Posted by Chad Jablonski 1 month ago

>
> Isn't that some ROP which we need to implement for blits using 
> transparency anyway? I think the host data gets either straight pixels 
> that we need to copy or some bitmask that need to be extended either with 
> fg bg or fg over existing image. The latter could be implemented by not 
> starting with empty row buffer but getting the row from the frame buffer 
> then masking and ORing or whatever combining is needed. But if we have a 
> ROP for that even for normal blits we'll need that anyway in the blit 
> function.
>

Ah, yep, very good point. I wasn't even thinking about all of the other
ROPs that are going to need this.

> Pixman has no documentation so the only info is looking at the source. But 
> maybe we don't need to change anything if the host data flush prepares the 
> source buffer by either using the data for datatype 3, adding bg/fg for 
> mono extract and either copying the dest row first into the buffer then do 
> the fg with transparency from host data bits then finally in all 3 cases 
> just call ati_2d_blt with the prepared source buffer. That way no changes 
> to ati_2d_blt is needed other than allowing it to take a state with the 
> source and dest data rather than getting it from registers. The part that 
> gets data from regs can be split out then what's left is what we need for 
> host data. I haven't thought through it in detail but I think it could 
> work and avoid duplicating the part that writes pixels to frame buffer.
>

It took me a bit but I think I see what you mean now! Sending an RFC
patch with a refactor that I think will set us up well to do this.

Re: [PATCH v4 9/9] ati-vga: Implement HOST_DATA flush to VRAM

Posted by BALATON Zoltan 1 month ago

On Thu, 8 Jan 2026, Chad Jablonski wrote:
>> Isn't that some ROP which we need to implement for blits using
>> transparency anyway? I think the host data gets either straight pixels
>> that we need to copy or some bitmask that need to be extended either with
>> fg bg or fg over existing image. The latter could be implemented by not
>> starting with empty row buffer but getting the row from the frame buffer
>> then masking and ORing or whatever combining is needed. But if we have a
>> ROP for that even for normal blits we'll need that anyway in the blit
>> function.
>>
>
> Ah, yep, very good point. I wasn't even thinking about all of the other
> ROPs that are going to need this.
>
>> Pixman has no documentation so the only info is looking at the source. But
>> maybe we don't need to change anything if the host data flush prepares the
>> source buffer by either using the data for datatype 3, adding bg/fg for
>> mono extract and either copying the dest row first into the buffer then do
>> the fg with transparency from host data bits then finally in all 3 cases
>> just call ati_2d_blt with the prepared source buffer. That way no changes
>> to ati_2d_blt is needed other than allowing it to take a state with the
>> source and dest data rather than getting it from registers. The part that
>> gets data from regs can be split out then what's left is what we need for
>> host data. I haven't thought through it in detail but I think it could
>> work and avoid duplicating the part that writes pixels to frame buffer.
>
> It took me a bit but I think I see what you mean now! Sending an RFC
> patch with a refactor that I think will set us up well to do this.

If something is not clear feel free to ask. It might be difficult to 
clearly describe vague ideas in English but as this was not clearly 
thought through yet it might not be a language issue.

Regards,
BALATON Zoltan