Implement flushing the 128-bit HOST_DATA accumulator to VRAM to enable
text rendering in X. Currently supports only the monochrome
foreground/background datatype with the SRCCOPY ROP.
The flush is broken up into two steps for clarity. First, expansion of the
monochrome bits to the destination color depth. Then the expanded pixels
are clipped and copied into VRAM.
Signed-off-by: Chad Jablonski <chad@jablonski.xyz>
---
hw/display/ati.c | 4 +-
hw/display/ati_2d.c | 124 +++++++++++++++++++++++++++++++++++++++++++
hw/display/ati_int.h | 3 ++
3 files changed, 129 insertions(+), 2 deletions(-)
diff --git a/hw/display/ati.c b/hw/display/ati.c
index 88d30bf532..cc5899981b 100644
--- a/hw/display/ati.c
+++ b/hw/display/ati.c
@@ -1064,13 +1064,13 @@ static void ati_mm_write(void *opaque, hwaddr addr,
case HOST_DATA7:
s->host_data.acc[s->host_data.next++] = data;
if (s->host_data.next >= 4) {
- qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
+ ati_flush_host_data(s);
s->host_data.next = 0;
}
break;
case HOST_DATA_LAST:
s->host_data.acc[s->host_data.next] = data;
- qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
+ ati_flush_host_data(s);
ati_host_data_reset(&s->host_data);
break;
default:
diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c
index 6c36e55412..19130ed291 100644
--- a/hw/display/ati_2d.c
+++ b/hw/display/ati_2d.c
@@ -117,6 +117,11 @@ static ATIBlitDest setup_2d_blt_dst(ATIVGAState *s)
void ati_2d_blt(ATIVGAState *s)
{
+ if (s->regs.dp_src_source == GMC_SRC_SOURCE_HOST_DATA) {
+ /* HOST_DATA blits are handled separately by ati_flush_host_data() */
+ return;
+ }
+
/* FIXME it is probably more complex than this and may need to be */
/* rewritten but for now as a start just to get some output: */
DisplaySurface *ds = qemu_console_surface(s->vga.con);
@@ -293,3 +298,122 @@ void ati_2d_blt(ATIVGAState *s)
s->regs.dp_rop3);
}
}
+
+void ati_flush_host_data(ATIVGAState *s)
+{
+ DisplaySurface *ds;
+ ATIBlitDest dst;
+ uint32_t fg, bg;
+ unsigned bypp, row, col, idx;
+ uint8_t pix_buf[ATI_HOST_DATA_ACC_BITS * sizeof(uint32_t)];
+
+ if (s->regs.dp_src_source != GMC_SRC_SOURCE_HOST_DATA) {
+ qemu_log_mask(LOG_UNIMP,
+ "host_data_blt: only GMC_SRC_SOURCE_HOST_DATA "
+ "supported\n");
+ return;
+ }
+
+ if (s->regs.dp_src_datatype != GMC_SRC_DATATYPE_MONO_FRGD_BKGD) {
+ qemu_log_mask(LOG_UNIMP,
+ "host_data_blt: only GMC_SRC_DATATYPE_MONO_FRGD_BKGD "
+ "supported\n");
+ return;
+ }
+
+ if (s->regs.dp_rop3 != ROP3_SRCCOPY) {
+ qemu_log_mask(LOG_UNIMP,
+ "host_data_blt: only ROP3_SRCCOPY supported. rop: %x\n",
+ s->regs.dp_rop3);
+ return;
+ }
+
+ dst = setup_2d_blt_dst(s);
+ if (!dst.valid) {
+ return;
+ }
+
+ if (!dst.left_to_right || !dst.top_to_bottom) {
+ qemu_log_mask(LOG_UNIMP, "host_data_blt: only L->R, T->B supported\n");
+ return;
+ }
+
+ fg = s->regs.dp_src_frgd_clr;
+ bg = s->regs.dp_src_bkgd_clr;
+ bypp = dst.bpp / 8;
+
+ /* Expand monochrome bits to color pixels */
+ idx = 0;
+ for (int word = 0; word < 4; word++) {
+ for (int byte = 0; byte < 4; byte++) {
+ uint8_t byte_val = s->host_data.acc[word] >> (byte * 8);
+ for (int i = 0; i < 8; i++) {
+ int bit = s->regs.byte_pix_order ? i : (7 - i);
+ bool is_fg = extract8(byte_val, bit, 1);
+ uint32_t color = is_fg ? fg : bg;
+ stn_he_p(&pix_buf[idx * bypp], bypp, color);
+ idx += 1;
+ }
+ }
+ }
+
+ /* Copy to VRAM one scanline at a time */
+ row = s->host_data.row;
+ col = s->host_data.col;
+ idx = 0;
+ while (idx < ATI_HOST_DATA_ACC_BITS && row < dst.rect.height) {
+ uint8_t *vram_dst;
+ unsigned start_col, end_col, vis_row, num_pix, pix_idx;
+ unsigned pix_in_scanline = MIN(ATI_HOST_DATA_ACC_BITS -
+ idx, dst.rect.width - col);
+
+ /* Row-based clipping */
+ if (row < dst.src_top_offset ||
+ row >= dst.src_top_offset + dst.visible.height) {
+ goto skip_pix;
+ }
+
+ /* Column-based clipping */
+ start_col = MAX(col, dst.src_left_offset);
+ end_col = MIN(col + pix_in_scanline,
+ dst.src_left_offset + dst.visible.width);
+ if (end_col <= start_col) {
+ goto skip_pix;
+ }
+
+ /* Copy expanded bits/pixels to VRAM */
+ vis_row = row - dst.src_top_offset;
+ num_pix = end_col - start_col;
+ vram_dst = dst.bits +
+ (dst.visible.y + vis_row) * dst.stride +
+ (dst.visible.x + (start_col - dst.src_left_offset)) * bypp;
+
+ pix_idx = (idx + (start_col - col)) * bypp;
+ memcpy(vram_dst, &pix_buf[pix_idx], num_pix * bypp);
+
+ skip_pix:
+ idx += pix_in_scanline;
+ col += pix_in_scanline;
+ if (col >= dst.rect.width) {
+ col = 0;
+ row += 1;
+ }
+ }
+ /* Track state of the overall blit for use by the next flush */
+ s->host_data.row = row;
+ s->host_data.col = col;
+
+ /*
+ * TODO: This is setting the entire blit region to dirty.
+ * We maybe just need this tiny section?
+ */
+ ds = qemu_console_surface(s->vga.con);
+ if (dst.bits >= s->vga.vram_ptr + s->vga.vbe_start_addr &&
+ dst.bits < s->vga.vram_ptr + s->vga.vbe_start_addr +
+ s->vga.vbe_regs[VBE_DISPI_INDEX_YRES] * s->vga.vbe_line_offset) {
+ memory_region_set_dirty(&s->vga.vram, s->vga.vbe_start_addr +
+ s->regs.dst_offset +
+ dst.visible.y * surface_stride(ds),
+ dst.visible.height * surface_stride(ds));
+ }
+}
diff --git a/hw/display/ati_int.h b/hw/display/ati_int.h
index 3029dc7e3c..448daf44a9 100644
--- a/hw/display/ati_int.h
+++ b/hw/display/ati_int.h
@@ -30,6 +30,8 @@
/* Radeon RV100 (VE) */
#define PCI_DEVICE_ID_ATI_RADEON_QY 0x5159
+#define ATI_HOST_DATA_ACC_BITS 128
+
#define TYPE_ATI_VGA "ati-vga"
OBJECT_DECLARE_SIMPLE_TYPE(ATIVGAState, ATI_VGA)
@@ -127,5 +129,6 @@ struct ATIVGAState {
const char *ati_reg_name(int num);
void ati_2d_blt(ATIVGAState *s);
+void ati_flush_host_data(ATIVGAState *s);
#endif /* ATI_INT_H */
--
2.51.2
On Tue, 6 Jan 2026, Chad Jablonski wrote:
> Implement flushing the 128-bit HOST_DATA accumulator to VRAM to enable
> text rendering in X. Currently supports only the monochrome
> foreground/background datatype with the SRCCOPY ROP.
>
> The flush is broken up into two steps for clarity. First, expansion of the
> monochrome bits to the destination color depth. Then the expanded pixels
> are clipped and copied into VRAM.
>
> Signed-off-by: Chad Jablonski <chad@jablonski.xyz>
> ---
> hw/display/ati.c | 4 +-
> hw/display/ati_2d.c | 124 +++++++++++++++++++++++++++++++++++++++++++
> hw/display/ati_int.h | 3 ++
> 3 files changed, 129 insertions(+), 2 deletions(-)
>
> diff --git a/hw/display/ati.c b/hw/display/ati.c
> index 88d30bf532..cc5899981b 100644
> --- a/hw/display/ati.c
> +++ b/hw/display/ati.c
> @@ -1064,13 +1064,13 @@ static void ati_mm_write(void *opaque, hwaddr addr,
> case HOST_DATA7:
> s->host_data.acc[s->host_data.next++] = data;
> if (s->host_data.next >= 4) {
> - qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
> + ati_flush_host_data(s);
> s->host_data.next = 0;
> }
> break;
> case HOST_DATA_LAST:
> s->host_data.acc[s->host_data.next] = data;
> - qemu_log_mask(LOG_UNIMP, "HOST_DATA flush not yet implemented\n");
> + ati_flush_host_data(s);
> ati_host_data_reset(&s->host_data);
> break;
> default:
> diff --git a/hw/display/ati_2d.c b/hw/display/ati_2d.c
> index 6c36e55412..19130ed291 100644
> --- a/hw/display/ati_2d.c
> +++ b/hw/display/ati_2d.c
> @@ -117,6 +117,11 @@ static ATIBlitDest setup_2d_blt_dst(ATIVGAState *s)
>
> void ati_2d_blt(ATIVGAState *s)
> {
> + if (s->regs.dp_src_source == GMC_SRC_SOURCE_HOST_DATA) {
> + /* HOST_DATA blits are handled separately by ati_flush_host_data() */
> + return;
> + }
> +
I wonder if we could reuse the existing blt function here and keep pixman
optimisations and avoid duplicating code by filling a src buffer with the
expanded data after we accumulated the 128 bits and we need to flush and
then call ati_2d_blt with that as src and dst modified to match where it
should go instead of the full blit area that the registers describe. That
needs to split ati_2d_blt or it should take a blit state instead of
ATIVGAState that can either be filled from regs for a regular blit or for
a smaller part for a host data blit so that way host data is just split
into smaller 128 bit blits which I think is how the actual GPU works as
well. Does that make sense? Do you think that could work?
Regards,
BALATON Zoltan
> > I wonder if we could reuse the existing blt function here and keep pixman > optimisations and avoid duplicating code by filling a src buffer with the > expanded data after we accumulated the 128 bits and we need to flush and > then call ati_2d_blt with that as src and dst modified to match where it > should go instead of the full blit area that the registers describe. That > needs to split ati_2d_blt or it should take a blit state instead of > ATIVGAState that can either be filled from regs for a regular blit or for > a smaller part for a host data blit so that way host data is just split > into smaller 128 bit blits which I think is how the actual GPU works as > well. Does that make sense? Do you think that could work? > I considered this. I initially rolled this back into the standard ati_2d_blt function but when I discovered the accumulator behavior it changed things a bit. The thing that seemed off to me was that you can end up with "jagged" blits from the accumulator. Meaning if you have a host_data blit with a width of 100px and you flush the accumulator you don't end up with a rectangular region to blit with pixman. You have a 100px line and then a 28px line that you need to initiate two separate blits via pixman for. My gut told me that the overhead of breaking things up and potentially making multiple calls to pixman per flush would make it not worth it. But I admit I never tested it! Another thing that isn't a problem in this patch but will complicate things in the future is transparency (bg pixel leave alone). I get the impression there may be some way to handle this with pixman using masks to avoid breaking blits up even more but I haven't fully investigated that. Actually as I type this I wonder if that same sort of masking could be used to avoid multiple calls in the current implementation. I may be totally off on how we could do this with pixman though too. I'm new to it so I may not be aware of everything it can do.
On Tue, 6 Jan 2026, Chad Jablonski wrote: >> I wonder if we could reuse the existing blt function here and keep pixman >> optimisations and avoid duplicating code by filling a src buffer with the >> expanded data after we accumulated the 128 bits and we need to flush and >> then call ati_2d_blt with that as src and dst modified to match where it >> should go instead of the full blit area that the registers describe. That >> needs to split ati_2d_blt or it should take a blit state instead of >> ATIVGAState that can either be filled from regs for a regular blit or for >> a smaller part for a host data blit so that way host data is just split >> into smaller 128 bit blits which I think is how the actual GPU works as >> well. Does that make sense? Do you think that could work? >> > > I considered this. I initially rolled this back into the standard ati_2d_blt > function but when I discovered the accumulator behavior it changed things a > bit. The thing that seemed off to me was that you can end up with "jagged" > blits from the accumulator. Meaning if you have a host_data blit with a width > of 100px and you flush the accumulator you don't end up with a rectangular > region to blit with pixman. You have a 100px line and then a 28px line that > you need to initiate two separate blits via pixman for. My gut told me that > the overhead of breaking things up and potentially making multiple calls to > pixman per flush would make it not worth it. But I admit I never tested it! We have fallbacks in the ati_2d_blt function so if pixman does not help it's easy to call the function with parameters that make it use the fall back. I just think it would be better to have the part that writes to the frame buffer at one place so the checks for writing outside the valid memory and doing all the pixel combining is not duplicated. So even if we don't use pixman it may be better to do the actual blit with the same function that does normal blits. I did not consider combining flushes just calling the same function for each line when flushing and if pixman proves to be an overhead we can use the fallback instead. > Another thing that isn't a problem in this patch but will complicate things > in the future is transparency (bg pixel leave alone). I get the impression Isn't that some ROP which we need to implement for blits using transparency anyway? I think the host data gets either straight pixels that we need to copy or some bitmask that need to be extended either with fg bg or fg over existing image. The latter could be implemented by not starting with empty row buffer but getting the row from the frame buffer then masking and ORing or whatever combining is needed. But if we have a ROP for that even for normal blits we'll need that anyway in the blit function. > there may be some way to handle this with pixman using masks to avoid breaking > blits up even more but I haven't fully investigated that. Actually as I type > this I wonder if that same sort of masking could be used to avoid multiple > calls in the current implementation. This could be investigated later as possible optimisation but maybe not needed at first. > I may be totally off on how we could do this with pixman though too. I'm new to > it so I may not be aware of everything it can do. Pixman has no documentation so the only info is looking at the source. But maybe we don't need to change anything if the host data flush prepares the source buffer by either using the data for datatype 3, adding bg/fg for mono extract and either copying the dest row first into the buffer then do the fg with transparency from host data bits then finally in all 3 cases just call ati_2d_blt with the prepared source buffer. That way no changes to ati_2d_blt is needed other than allowing it to take a state with the source and dest data rather than getting it from registers. The part that gets data from regs can be split out then what's left is what we need for host data. I haven't thought through it in detail but I think it could work and avoid duplicating the part that writes pixels to frame buffer. Regards, BALATON Zoltan
> > Isn't that some ROP which we need to implement for blits using > transparency anyway? I think the host data gets either straight pixels > that we need to copy or some bitmask that need to be extended either with > fg bg or fg over existing image. The latter could be implemented by not > starting with empty row buffer but getting the row from the frame buffer > then masking and ORing or whatever combining is needed. But if we have a > ROP for that even for normal blits we'll need that anyway in the blit > function. > Ah, yep, very good point. I wasn't even thinking about all of the other ROPs that are going to need this. > Pixman has no documentation so the only info is looking at the source. But > maybe we don't need to change anything if the host data flush prepares the > source buffer by either using the data for datatype 3, adding bg/fg for > mono extract and either copying the dest row first into the buffer then do > the fg with transparency from host data bits then finally in all 3 cases > just call ati_2d_blt with the prepared source buffer. That way no changes > to ati_2d_blt is needed other than allowing it to take a state with the > source and dest data rather than getting it from registers. The part that > gets data from regs can be split out then what's left is what we need for > host data. I haven't thought through it in detail but I think it could > work and avoid duplicating the part that writes pixels to frame buffer. > It took me a bit but I think I see what you mean now! Sending an RFC patch with a refactor that I think will set us up well to do this.
On Thu, 8 Jan 2026, Chad Jablonski wrote: >> Isn't that some ROP which we need to implement for blits using >> transparency anyway? I think the host data gets either straight pixels >> that we need to copy or some bitmask that need to be extended either with >> fg bg or fg over existing image. The latter could be implemented by not >> starting with empty row buffer but getting the row from the frame buffer >> then masking and ORing or whatever combining is needed. But if we have a >> ROP for that even for normal blits we'll need that anyway in the blit >> function. >> > > Ah, yep, very good point. I wasn't even thinking about all of the other > ROPs that are going to need this. > >> Pixman has no documentation so the only info is looking at the source. But >> maybe we don't need to change anything if the host data flush prepares the >> source buffer by either using the data for datatype 3, adding bg/fg for >> mono extract and either copying the dest row first into the buffer then do >> the fg with transparency from host data bits then finally in all 3 cases >> just call ati_2d_blt with the prepared source buffer. That way no changes >> to ati_2d_blt is needed other than allowing it to take a state with the >> source and dest data rather than getting it from registers. The part that >> gets data from regs can be split out then what's left is what we need for >> host data. I haven't thought through it in detail but I think it could >> work and avoid duplicating the part that writes pixels to frame buffer. > > It took me a bit but I think I see what you mean now! Sending an RFC > patch with a refactor that I think will set us up well to do this. If something is not clear feel free to ask. It might be difficult to clearly describe vague ideas in English but as this was not clearly thought through yet it might not be a language issue. Regards, BALATON Zoltan
© 2016 - 2026 Red Hat, Inc.