The text console receives bytes that may be UTF-8 encoded (e.g. from
a guest running a modern distro), but currently treats each byte as a
raw character index into the VGA/CP437 font, producing garbled output
for any multi-byte sequence.
Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA.
The DFA inherently rejects overlong encodings, surrogates, and
codepoints above U+10FFFF. Completed codepoints are then mapped to
CP437, unmappable characters are displayed as '?'.
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
ui/cp437.h | 13 ++++
ui/console-vc.c | 62 +++++++++++++++++
ui/cp437.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ui/meson.build | 2 +-
4 files changed, 281 insertions(+), 1 deletion(-)
diff --git a/ui/cp437.h b/ui/cp437.h
new file mode 100644
index 00000000000..81ace8317c7
--- /dev/null
+++ b/ui/cp437.h
@@ -0,0 +1,13 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (c) QEMU contributors
+ */
+#ifndef QEMU_CP437_H
+#define QEMU_CP437_H
+
+#include <stdint.h>
+
+int unicode_to_cp437(uint32_t codepoint); /* CP437 byte for codepoint, or '?' if it has no mapping */
+
+#endif /* QEMU_CP437_H */
diff --git a/ui/console-vc.c b/ui/console-vc.c
index 8dee1f9bd01..7bbd65dea27 100644
--- a/ui/console-vc.c
+++ b/ui/console-vc.c
@@ -9,6 +9,7 @@
#include "qemu/fifo8.h"
#include "qemu/option.h"
#include "ui/console.h"
+#include "ui/cp437.h"
#include "trace.h"
#include "console-priv.h"
@@ -89,6 +90,8 @@ struct VCChardev {
enum TTYState state;
int esc_params[MAX_ESC_PARAMS];
int nb_esc_params;
+ uint32_t utf8_state; /* UTF-8 DFA decoder state */
+ uint32_t utf8_codepoint; /* accumulated UTF-8 code point */
TextAttributes t_attrib; /* currently active text attributes */
TextAttributes t_attrib_saved;
int x_saved, y_saved;
@@ -598,6 +601,47 @@ static void vc_clear_xy(VCChardev *vc, int x, int y)
vc_update_xy(vc, x, y);
}
+/*
+ * UTF-8 DFA decoder by Bjoern Hoehrmann.
+ * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ * See https://github.com/polijan/utf8_decode for details.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+#define UTF8_ACCEPT 0 /* state the caller resets to; returned when a code point is complete */
+#define UTF8_REJECT 12 /* invalid input; every transition out of this state is 12 again */
+
+static const uint8_t utf8d[] = {
+ /* character class lookup: utf8d[byte] for byte 0x00-0xff, classes 0-11 */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+ /* state transition lookup: utf8d[256 + state + class]; states are multiples of 12 */
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+ 12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static uint32_t utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
+{
+ uint32_t type = utf8d[byte]; /* character class of this byte; byte must be <= 0xff */
+ /* Feeds one byte; returns the new *state. *codep is final only on UTF8_ACCEPT. */
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) : /* mid-sequence: append the low 6 bits */
+ (0xffu >> type) & (byte); /* first byte: mask off high bits according to its class */
+
+ *state = utf8d[256 + *state + type]; /* transition rows start after the 256 class entries */
+ return *state;
+}
+
static void vc_put_one(VCChardev *vc, int ch)
{
QemuTextConsole *s = vc->console;
@@ -761,6 +805,24 @@ static void vc_putchar(VCChardev *vc, int ch)
switch(vc->state) {
case TTY_STATE_NORM:
+ /* Feed byte through the UTF-8 DFA decoder */
+ if (ch >= 0x80) {
+ switch (utf8_decode(&vc->utf8_state, &vc->utf8_codepoint, ch)) {
+ case UTF8_ACCEPT:
+ vc_put_one(vc, unicode_to_cp437(vc->utf8_codepoint));
+ break;
+ case UTF8_REJECT:
+ /* Reset state so the decoder can resync */
+ vc->utf8_state = UTF8_ACCEPT;
+ break;
+ default:
+ /* Need more bytes */
+ break;
+ }
+ break;
+ }
+ /* ASCII byte: abort any pending UTF-8 sequence */
+ vc->utf8_state = UTF8_ACCEPT;
switch(ch) {
case '\r': /* carriage return */
s->x = 0;
diff --git a/ui/cp437.c b/ui/cp437.c
new file mode 100644
index 00000000000..8ec38b73419
--- /dev/null
+++ b/ui/cp437.c
@@ -0,0 +1,205 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (c) QEMU contributors
+ */
+#include "qemu/osdep.h"
+#include "cp437.h"
+
+/*
+ * Unicode to CP437 page tables.
+ *
+ * Borrowed from the Linux kernel (fs/nls/nls_cp437.c, "Dual BSD/GPL"),
+ * generated from the Unicode Organization tables (www.unicode.org).
+ */
+static const unsigned char uni2cp437_page00[256] = { /* U+0000-U+00FF, indexed by low byte; 0x00 = no mapping */
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+ 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */
+ 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+ 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
+ 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
+ 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
+ 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
+ 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
+ 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
+ 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
+ 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
+ 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
+ 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */
+};
+
+static const unsigned char uni2cp437_page01[256] = { /* U+0100-U+01FF; only U+0192 is mapped */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+ 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+}; /* entries 0x98-0xff have no initializer and are therefore zero (no mapping) */
+
+static const unsigned char uni2cp437_page03[256] = { /* U+0300-U+03FF; mapped entries lie in U+0393-U+03C6 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+ 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+ 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+ 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
+ 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
+ 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
+ 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
+}; /* entries 0xc8-0xff have no initializer and are therefore zero (no mapping) */
+
+static const unsigned char uni2cp437_page20[256] = { /* U+2000-U+20FF; only U+207F and U+20A7 are mapped */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
+
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
+}; /* entries 0xa8-0xff have no initializer and are therefore zero (no mapping) */
+
+static const unsigned char uni2cp437_page22[256] = { /* U+2200-U+22FF; mapped entries lie in U+2219-U+2265 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+ 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+ 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+ 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
+ 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
+}; /* entries 0x68-0xff have no initializer and are therefore zero (no mapping) */
+
+static const unsigned char uni2cp437_page23[256] = { /* U+2300-U+23FF; only U+2310, U+2320 and U+2321 are mapped */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+ 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+ 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+}; /* entries 0x28-0xff have no initializer and are therefore zero (no mapping) */
+
+static const unsigned char uni2cp437_page25[256] = { /* U+2500-U+25FF; mapped entries lie in U+2500-U+25A0 */
+ 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
+ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
+ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
+ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
+ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
+ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
+ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
+ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
+ 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
+ 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
+ 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
+ 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
+
+ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
+ 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
+ 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
+ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
+}; /* entries 0xa8-0xff have no initializer and are therefore zero (no mapping) */
+
+static const unsigned char *const uni2cp437_page[256] = { /* indexed by (code point >> 8) */
+ [0x00] = uni2cp437_page00, [0x01] = uni2cp437_page01,
+ [0x03] = uni2cp437_page03, [0x20] = uni2cp437_page20,
+ [0x22] = uni2cp437_page22, [0x23] = uni2cp437_page23,
+ [0x25] = uni2cp437_page25,
+}; /* pages not listed above are NULL, i.e. nothing on them is mappable */
+
+/*
+ * Translate a Unicode code point into the CP437 byte used to index
+ * the VGA font.
+ * Code points above U+FFFF, on a page without a table, or whose table
+ * slot is zero all yield '?'.
+ */
+int unicode_to_cp437(uint32_t codepoint)
+{
+    int glyph = '?';
+
+    if (codepoint <= 0xffff) {
+        /* High byte picks the page table, low byte the slot in it. */
+        const unsigned char *table = uni2cp437_page[codepoint >> 8];
+        unsigned char slot = codepoint & 0xff;
+
+        if (table && table[slot]) {
+            glyph = table[slot];
+        }
+    }
+
+    return glyph;
+}
diff --git a/ui/meson.build b/ui/meson.build
index 69404bca71a..d4d9312b98c 100644
--- a/ui/meson.build
+++ b/ui/meson.build
@@ -16,7 +16,7 @@ system_ss.add(files(
'ui-qmp-cmds.c',
'util.c',
))
-system_ss.add(when: pixman, if_true: files('console-vc.c'), if_false: files('console-vc-stubs.c'))
+system_ss.add(when: pixman, if_true: files('console-vc.c', 'cp437.c'), if_false: files('console-vc-stubs.c'))
if dbus_display
system_ss.add(files('dbus-module.c'))
endif
--
2.53.0
Marc-André Lureau <marcandre.lureau@redhat.com> writes: > The text console receives bytes that may be UTF-8 encoded (e.g. from > a guest running a modern distro), but currently treats each byte as a > raw character index into the VGA/CP437 font, producing garbled output > for any multi-byte sequence. > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA. > The DFA inherently rejects overlong encodings, surrogates, and > codepoints above U+10FFFF. Completed codepoints are then mapped to > CP437, unmappable characters are displayed as '?'. > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com> Have you considered the decoder in util/unicode.c? Do we need two decoders, or could we replace one by the other? There's a mad UTF-8 test suite buried in tests/unit/check-qjson.c derived from Markus Kuhn's UTF-8 decoder capability and stress test at <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>. How does this decoder do on these tests?
Hi On Wed, Mar 25, 2026 at 9:35 AM Markus Armbruster <armbru@redhat.com> wrote: > > Marc-André Lureau <marcandre.lureau@redhat.com> writes: > > > The text console receives bytes that may be UTF-8 encoded (e.g. from > > a guest running a modern distro), but currently treats each byte as a > > raw character index into the VGA/CP437 font, producing garbled output > > for any multi-byte sequence. > > > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA. > > The DFA inherently rejects overlong encodings, surrogates, and > > codepoints above U+10FFFF. Completed codepoints are then mapped to > > CP437, unmappable characters are displayed as '?'. > > > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com> > > Have you considered the decoder in util/unicode.c? Do we need two > decoders, or could we replace one by the other? > Oh! I missed it, I should definitely try to use it. > There's a mad UTF-8 test suite buried in tests/unit/check-qjson.c > derived from Markus Kuhn's UTF-8 decoder capability and stress test at > <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>. How does > this decoder do on these tests? > According to the author's original article, it should pass: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
Hi Markus On Wed, Mar 25, 2026 at 10:48 AM Marc-André Lureau <marcandre.lureau@redhat.com> wrote: > > HI > > On Wed, Mar 25, 2026 at 9:35 AM Markus Armbruster <armbru@redhat.com> wrote: > > > > Marc-André Lureau <marcandre.lureau@redhat.com> writes: > > > > > The text console receives bytes that may be UTF-8 encoded (e.g. from > > > a guest running a modern distro), but currently treats each byte as a > > > raw character index into the VGA/CP437 font, producing garbled output > > > for any multi-byte sequence. > > > > > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA. > > > The DFA inherently rejects overlong encodings, surrogates, and > > > codepoints above U+10FFFF. Completed codepoints are then mapped to > > > CP437, unmappable characters are displayed as '?'. > > > > > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com> > > > > Have you considered the decoder in util/unicode.c? Do we need two > > decoders, or could we replace one by the other? > > > > Oh! I missed it, I should definitely try to use it. The vt100 receives characters one at a time. The DFA is suited for this. mod_utf8_codepoint() works with whole buffers. It's possible to write one with the other, but we lose in performance, and we need extra quirks. I don't think it's worth it. > > > There's a mad UTF-8 test suite buried in tests/unit/check-qjson.c > > derived from Markus Kuhn's UTF-8 decoder capability and stress test at > > <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>. How does > > this decoder do on these tests? > > > > According to the author original article, it should pass > https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
Marc-André Lureau <marcandre.lureau@redhat.com> writes: > Hi Markus > > On Wed, Mar 25, 2026 at 10:48 AM Marc-André Lureau > <marcandre.lureau@redhat.com> wrote: >> >> HI >> >> On Wed, Mar 25, 2026 at 9:35 AM Markus Armbruster <armbru@redhat.com> wrote: >> > >> > Marc-André Lureau <marcandre.lureau@redhat.com> writes: >> > >> > > The text console receives bytes that may be UTF-8 encoded (e.g. from >> > > a guest running a modern distro), but currently treats each byte as a >> > > raw character index into the VGA/CP437 font, producing garbled output >> > > for any multi-byte sequence. >> > > >> > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA. >> > > The DFA inherently rejects overlong encodings, surrogates, and >> > > codepoints above U+10FFFF. Completed codepoints are then mapped to >> > > CP437, unmappable characters are displayed as '?'. >> > > >> > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com> >> > >> > Have you considered the decoder in util/unicode.c? Do we need two >> > decoders, or could we replace one by the other? >> > >> >> Oh! I missed it, I should definitely try to use it. > > The vt100 receives characters one at a time. The DFA is suited for > this. mod_utf8_codepoint() works with whole buffers. It's possible to > write one with the other, but we lose in performance, and we need > extra quirks. I don't think it's worth it. Implementing a buffer interface on top of a character at a time interface should be easier than the other way round. I doubt rewriting mod_utf8_codepoint() to wrap around the DFA would slow things down materially. I'm not demanding you try that! If we decide to go with two decoders, do we want a unit test to ensure they behave the same? [...]
Hi On Thu, Apr 2, 2026 at 6:40 PM Markus Armbruster <armbru@redhat.com> wrote: > > Marc-André Lureau <marcandre.lureau@redhat.com> writes: > > > Hi Markus > > > > On Wed, Mar 25, 2026 at 10:48 AM Marc-André Lureau > > <marcandre.lureau@redhat.com> wrote: > >> > >> HI > >> > >> On Wed, Mar 25, 2026 at 9:35 AM Markus Armbruster <armbru@redhat.com> wrote: > >> > > >> > Marc-André Lureau <marcandre.lureau@redhat.com> writes: > >> > > >> > > The text console receives bytes that may be UTF-8 encoded (e.g. from > >> > > a guest running a modern distro), but currently treats each byte as a > >> > > raw character index into the VGA/CP437 font, producing garbled output > >> > > for any multi-byte sequence. > >> > > > >> > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA. > >> > > The DFA inherently rejects overlong encodings, surrogates, and > >> > > codepoints above U+10FFFF. Completed codepoints are then mapped to > >> > > CP437, unmappable characters are displayed as '?'. > >> > > > >> > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com> > >> > > >> > Have you considered the decoder in util/unicode.c? Do we need two > >> > decoders, or could we replace one by the other? > >> > > >> > >> Oh! I missed it, I should definitely try to use it. > > > > The vt100 receives characters one at a time. The DFA is suited for > > this. mod_utf8_codepoint() works with whole buffers. It's possible to > > write one with the other, but we lose in performance, and we need > > extra quirks. I don't think it's worth it. > > Implementing a buffer interface on top of a character at a time > interface should be easier than the other way round. I tried that, it's not hard, except we need to handle the case where the DFA rejects early but mod_utf8_codepoint() should still consume the continuation bytes. > > I doubt rewriting mod_utf8_codepoint() to wrap around the DFA would slow > things down materially. Quick benchmarks showed -50% in performance. 
> I'm not demanding you try that! > > If we decide to go with two decoders, do we want a unit test to ensure > they behave the same? They don't behave exactly the same, as explained above, and also the U+0000 case (btw, if you could explain what we need it for, it's not obvious to me) -- Marc-André Lureau
On Tue, Mar 17, 2026 at 12:50:25PM +0400, Marc-André Lureau wrote:
> The text console receives bytes that may be UTF-8 encoded (e.g. from
> a guest running a modern distro), but currently treats each byte as a
> raw character index into the VGA/CP437 font, producing garbled output
> for any multi-byte sequence.
>
> Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA.
> The DFA inherently rejects overlong encodings, surrogates, and
> codepoints above U+10FFFF. Completed codepoints are then mapped to
> CP437, unmappable characters are displayed as '?'.
I'm surprised we can't do a charset conversion using GLib APIs?
Do the g_convert family of APIs (which IIUC wrap the distro iconv)
not do what we would want? If not, would direct use of iconv not
be an alternative?
It feels pretty wrong to need to embed UTF-8 decoding code in
QEMU.
> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> ---
> ui/cp437.h | 13 ++++
> ui/console-vc.c | 62 +++++++++++++++++
> ui/cp437.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> ui/meson.build | 2 +-
> 4 files changed, 281 insertions(+), 1 deletion(-)
>
> diff --git a/ui/cp437.h b/ui/cp437.h
> new file mode 100644
> index 00000000000..81ace8317c7
> --- /dev/null
> +++ b/ui/cp437.h
> @@ -0,0 +1,13 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright (c) QEMU contributors
> + */
> +#ifndef QEMU_CP437_H
> +#define QEMU_CP437_H
> +
> +#include <stdint.h>
> +
> +int unicode_to_cp437(uint32_t codepoint);
> +
> +#endif /* QEMU_CP437_H */
> diff --git a/ui/console-vc.c b/ui/console-vc.c
> index 8dee1f9bd01..7bbd65dea27 100644
> --- a/ui/console-vc.c
> +++ b/ui/console-vc.c
> @@ -9,6 +9,7 @@
> #include "qemu/fifo8.h"
> #include "qemu/option.h"
> #include "ui/console.h"
> +#include "ui/cp437.h"
>
> #include "trace.h"
> #include "console-priv.h"
> @@ -89,6 +90,8 @@ struct VCChardev {
> enum TTYState state;
> int esc_params[MAX_ESC_PARAMS];
> int nb_esc_params;
> + uint32_t utf8_state; /* UTF-8 DFA decoder state */
> + uint32_t utf8_codepoint; /* accumulated UTF-8 code point */
> TextAttributes t_attrib; /* currently active text attributes */
> TextAttributes t_attrib_saved;
> int x_saved, y_saved;
> @@ -598,6 +601,47 @@ static void vc_clear_xy(VCChardev *vc, int x, int y)
> vc_update_xy(vc, x, y);
> }
>
> +/*
> + * UTF-8 DFA decoder by Bjoern Hoehrmann.
> + * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
> + * See https://github.com/polijan/utf8_decode for details.
> + *
> + * SPDX-License-Identifier: MIT
> + */
> +#define UTF8_ACCEPT 0
> +#define UTF8_REJECT 12
> +
> +static const uint8_t utf8d[] = {
> + /* character class lookup */
> + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
> + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
> + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
> +
> + /* state transition lookup */
> + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
> + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
> + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
> + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
> + 12,36,12,12,12,12,12,12,12,12,12,12,
> +};
> +
> +static uint32_t utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
> +{
> + uint32_t type = utf8d[byte];
> +
> + *codep = (*state != UTF8_ACCEPT) ?
> + (byte & 0x3fu) | (*codep << 6) :
> + (0xffu >> type) & (byte);
> +
> + *state = utf8d[256 + *state + type];
> + return *state;
> +}
> +
> static void vc_put_one(VCChardev *vc, int ch)
> {
> QemuTextConsole *s = vc->console;
> @@ -761,6 +805,24 @@ static void vc_putchar(VCChardev *vc, int ch)
>
> switch(vc->state) {
> case TTY_STATE_NORM:
> + /* Feed byte through the UTF-8 DFA decoder */
> + if (ch >= 0x80) {
> + switch (utf8_decode(&vc->utf8_state, &vc->utf8_codepoint, ch)) {
> + case UTF8_ACCEPT:
> + vc_put_one(vc, unicode_to_cp437(vc->utf8_codepoint));
> + break;
> + case UTF8_REJECT:
> + /* Reset state so the decoder can resync */
> + vc->utf8_state = UTF8_ACCEPT;
> + break;
> + default:
> + /* Need more bytes */
> + break;
> + }
> + break;
> + }
> + /* ASCII byte: abort any pending UTF-8 sequence */
> + vc->utf8_state = UTF8_ACCEPT;
> switch(ch) {
> case '\r': /* carriage return */
> s->x = 0;
> diff --git a/ui/cp437.c b/ui/cp437.c
> new file mode 100644
> index 00000000000..8ec38b73419
> --- /dev/null
> +++ b/ui/cp437.c
> @@ -0,0 +1,205 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright (c) QEMU contributors
> + */
> +#include "qemu/osdep.h"
> +#include "cp437.h"
> +
> +/*
> + * Unicode to CP437 page tables.
> + *
> + * Borrowed from the Linux kernel (fs/nls/nls_cp437.c, "Dual BSD/GPL"),
> + * generated from the Unicode Organization tables (www.unicode.org).
> + */
> +static const unsigned char uni2cp437_page00[256] = {
> + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
> + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
> + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
> + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
> + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
> + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
> + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
> + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
> + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
> + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
> + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
> + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
> + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
> + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
> + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
> + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
> +
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */
> + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
> + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
> + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
> + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
> + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
> + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
> + 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
> + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
> + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
> + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
> + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */
> +};
> +
> +static const unsigned char uni2cp437_page01[256] = {
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
> +
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> +};
> +
> +static const unsigned char uni2cp437_page03[256] = {
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
> +
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
> + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
> + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
> + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
> +};
> +
> +static const unsigned char uni2cp437_page20[256] = {
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
> +
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
> +};
> +
> +static const unsigned char uni2cp437_page22[256] = {
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
> +};
> +
> +static const unsigned char uni2cp437_page23[256] = {
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> +};
> +
> +static const unsigned char uni2cp437_page25[256] = {
> + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
> + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
> + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
> + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
> +
> + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
> +};
> +
> +static const unsigned char *const uni2cp437_page[256] = {
> + [0x00] = uni2cp437_page00, [0x01] = uni2cp437_page01,
> + [0x03] = uni2cp437_page03, [0x20] = uni2cp437_page20,
> + [0x22] = uni2cp437_page22, [0x23] = uni2cp437_page23,
> + [0x25] = uni2cp437_page25,
> +};
> +
> +/*
> + * Convert a Unicode code point to its CP437 equivalent for
> + * rendering with the VGA font.
> + * Returns '?' for characters that cannot be mapped.
> + */
> +int unicode_to_cp437(uint32_t codepoint)
> +{
> + const unsigned char *page;
> + unsigned char hi = (codepoint >> 8) & 0xff;
> + unsigned char lo = codepoint & 0xff;
> +
> + if (codepoint > 0xffff) {
> + return '?';
> + }
> +
> + page = uni2cp437_page[hi];
> + if (page && page[lo]) {
> + return page[lo];
> + }
> +
> + return '?';
> +}
> diff --git a/ui/meson.build b/ui/meson.build
> index 69404bca71a..d4d9312b98c 100644
> --- a/ui/meson.build
> +++ b/ui/meson.build
> @@ -16,7 +16,7 @@ system_ss.add(files(
> 'ui-qmp-cmds.c',
> 'util.c',
> ))
> -system_ss.add(when: pixman, if_true: files('console-vc.c'), if_false: files('console-vc-stubs.c'))
> +system_ss.add(when: pixman, if_true: files('console-vc.c', 'cp437.c'), if_false: files('console-vc-stubs.c'))
> if dbus_display
> system_ss.add(files('dbus-module.c'))
> endif
>
> --
> 2.53.0
>
>
With regards,
Daniel
--
|: https://berrange.com ~~ https://hachyderm.io/@berrange :|
|: https://libvirt.org ~~ https://entangle-photo.org :|
|: https://pixelfed.art/berrange ~~ https://fstop138.berrange.com :|
Hi
On Tue, Mar 24, 2026 at 6:08 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Tue, Mar 17, 2026 at 12:50:25PM +0400, Marc-André Lureau wrote:
> > The text console receives bytes that may be UTF-8 encoded (e.g. from
> > a guest running a modern distro), but currently treats each byte as a
> > raw character index into the VGA/CP437 font, producing garbled output
> > for any multi-byte sequence.
> >
> > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA.
> > The DFA inherently rejects overlong encodings, surrogates, and
> > codepoints above U+10FFFF. Completed codepoints are then mapped to
> > CP437, unmappable characters are displayed as '?'.
>
> I'm surprised we can't do a charset conversion using GLib APIs ?
>
> Do the g_convert family of APIs (which IIUC wrap the distro iconv)
> not do what we would want ? If not, would direct use of iconv not
> be an alternative ?
>
I tried to use GIconv but ran into a number of issues: it operates on
whole strings rather than on individual characters, and it allocates
memory, etc. I didn't manage to make it work with iconv either.
> It feels pretty wrong to need to embed UTF8 decoding code in
> QEMU
Yes, but is it more acceptable on a standalone qemu-vnc server?
>
> > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > ---
> > ui/cp437.h | 13 ++++
> > ui/console-vc.c | 62 +++++++++++++++++
> > ui/cp437.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > ui/meson.build | 2 +-
> > 4 files changed, 281 insertions(+), 1 deletion(-)
> >
> > diff --git a/ui/cp437.h b/ui/cp437.h
> > new file mode 100644
> > index 00000000000..81ace8317c7
> > --- /dev/null
> > +++ b/ui/cp437.h
> > @@ -0,0 +1,13 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * Copyright (c) QEMU contributors
> > + */
> > +#ifndef QEMU_CP437_H
> > +#define QEMU_CP437_H
> > +
> > +#include <stdint.h>
> > +
> > +int unicode_to_cp437(uint32_t codepoint);
> > +
> > +#endif /* QEMU_CP437_H */
> > diff --git a/ui/console-vc.c b/ui/console-vc.c
> > index 8dee1f9bd01..7bbd65dea27 100644
> > --- a/ui/console-vc.c
> > +++ b/ui/console-vc.c
> > @@ -9,6 +9,7 @@
> > #include "qemu/fifo8.h"
> > #include "qemu/option.h"
> > #include "ui/console.h"
> > +#include "ui/cp437.h"
> >
> > #include "trace.h"
> > #include "console-priv.h"
> > @@ -89,6 +90,8 @@ struct VCChardev {
> > enum TTYState state;
> > int esc_params[MAX_ESC_PARAMS];
> > int nb_esc_params;
> > + uint32_t utf8_state; /* UTF-8 DFA decoder state */
> > + uint32_t utf8_codepoint; /* accumulated UTF-8 code point */
> > TextAttributes t_attrib; /* currently active text attributes */
> > TextAttributes t_attrib_saved;
> > int x_saved, y_saved;
> > @@ -598,6 +601,47 @@ static void vc_clear_xy(VCChardev *vc, int x, int y)
> > vc_update_xy(vc, x, y);
> > }
> >
> > +/*
> > + * UTF-8 DFA decoder by Bjoern Hoehrmann.
> > + * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
> > + * See https://github.com/polijan/utf8_decode for details.
> > + *
> > + * SPDX-License-Identifier: MIT
> > + */
> > +#define UTF8_ACCEPT 0
> > +#define UTF8_REJECT 12
> > +
> > +static const uint8_t utf8d[] = {
> > + /* character class lookup */
> > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
> > + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
> > + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> > + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
> > +
> > + /* state transition lookup */
> > + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
> > + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
> > + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
> > + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
> > + 12,36,12,12,12,12,12,12,12,12,12,12,
> > +};
> > +
> > +static uint32_t utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
> > +{
> > + uint32_t type = utf8d[byte];
> > +
> > + *codep = (*state != UTF8_ACCEPT) ?
> > + (byte & 0x3fu) | (*codep << 6) :
> > + (0xffu >> type) & (byte);
> > +
> > + *state = utf8d[256 + *state + type];
> > + return *state;
> > +}
> > +
> > static void vc_put_one(VCChardev *vc, int ch)
> > {
> > QemuTextConsole *s = vc->console;
> > @@ -761,6 +805,24 @@ static void vc_putchar(VCChardev *vc, int ch)
> >
> > switch(vc->state) {
> > case TTY_STATE_NORM:
> > + /* Feed byte through the UTF-8 DFA decoder */
> > + if (ch >= 0x80) {
> > + switch (utf8_decode(&vc->utf8_state, &vc->utf8_codepoint, ch)) {
> > + case UTF8_ACCEPT:
> > + vc_put_one(vc, unicode_to_cp437(vc->utf8_codepoint));
> > + break;
> > + case UTF8_REJECT:
> > + /* Reset state so the decoder can resync */
> > + vc->utf8_state = UTF8_ACCEPT;
> > + break;
> > + default:
> > + /* Need more bytes */
> > + break;
> > + }
> > + break;
> > + }
> > + /* ASCII byte: abort any pending UTF-8 sequence */
> > + vc->utf8_state = UTF8_ACCEPT;
> > switch(ch) {
> > case '\r': /* carriage return */
> > s->x = 0;
> > diff --git a/ui/cp437.c b/ui/cp437.c
> > new file mode 100644
> > index 00000000000..8ec38b73419
> > --- /dev/null
> > +++ b/ui/cp437.c
> > @@ -0,0 +1,205 @@
> > +/*
> > + * SPDX-License-Identifier: GPL-2.0-or-later
> > + *
> > + * Copyright (c) QEMU contributors
> > + */
> > +#include "qemu/osdep.h"
> > +#include "cp437.h"
> > +
> > +/*
> > + * Unicode to CP437 page tables.
> > + *
> > + * Borrowed from the Linux kernel (fs/nls/nls_cp437.c, "Dual BSD/GPL"),
> > + * generated from the Unicode Organization tables (www.unicode.org).
> > + */
> > +static const unsigned char uni2cp437_page00[256] = {
> > + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */
> > + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */
> > + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */
> > + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */
> > + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */
> > + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */
> > + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */
> > + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */
> > + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */
> > + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */
> > + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */
> > + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */
> > + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */
> > + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */
> > + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */
> > + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */
> > +
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> > + 0xff, 0xad, 0x9b, 0x9c, 0x00, 0x9d, 0x00, 0x00, /* 0xa0-0xa7 */
> > + 0x00, 0x00, 0xa6, 0xae, 0xaa, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
> > + 0xf8, 0xf1, 0xfd, 0x00, 0x00, 0xe6, 0x00, 0xfa, /* 0xb0-0xb7 */
> > + 0x00, 0x00, 0xa7, 0xaf, 0xac, 0xab, 0x00, 0xa8, /* 0xb8-0xbf */
> > + 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x80, /* 0xc0-0xc7 */
> > + 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
> > + 0x00, 0xa5, 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, /* 0xd0-0xd7 */
> > + 0x00, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */
> > + 0x85, 0xa0, 0x83, 0x00, 0x84, 0x86, 0x91, 0x87, /* 0xe0-0xe7 */
> > + 0x8a, 0x82, 0x88, 0x89, 0x8d, 0xa1, 0x8c, 0x8b, /* 0xe8-0xef */
> > + 0x00, 0xa4, 0x95, 0xa2, 0x93, 0x00, 0x94, 0xf6, /* 0xf0-0xf7 */
> > + 0x00, 0x97, 0xa3, 0x96, 0x81, 0x00, 0x00, 0x98, /* 0xf8-0xff */
> > +};
> > +
> > +static const unsigned char uni2cp437_page01[256] = {
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
> > +
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> > + 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> > +};
> > +
> > +static const unsigned char uni2cp437_page03[256] = {
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
> > +
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> > + 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> > + 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> > + 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0xe8, 0x00, /* 0xa0-0xa7 */
> > + 0x00, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
> > + 0x00, 0xe0, 0x00, 0x00, 0xeb, 0xee, 0x00, 0x00, /* 0xb0-0xb7 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
> > + 0xe3, 0x00, 0x00, 0xe5, 0xe7, 0x00, 0xed, 0x00, /* 0xc0-0xc7 */
> > +};
> > +
> > +static const unsigned char uni2cp437_page20[256] = {
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, /* 0x78-0x7f */
> > +
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x9e, /* 0xa0-0xa7 */
> > +};
> > +
> > +static const unsigned char uni2cp437_page22[256] = {
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> > + 0x00, 0xf9, 0xfb, 0x00, 0x00, 0x00, 0xec, 0x00, /* 0x18-0x1f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> > + 0x00, 0xef, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> > + 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
> > + 0x00, 0xf0, 0x00, 0x00, 0xf3, 0xf2, 0x00, 0x00, /* 0x60-0x67 */
> > +};
> > +
> > +static const unsigned char uni2cp437_page23[256] = {
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> > + 0xa9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> > + 0xf4, 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> > +};
> > +
> > +static const unsigned char uni2cp437_page25[256] = {
> > + 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
> > + 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */
> > + 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */
> > + 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */
> > + 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */
> > + 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */
> > + 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */
> > + 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
> > + 0xcd, 0xba, 0xd5, 0xd6, 0xc9, 0xb8, 0xb7, 0xbb, /* 0x50-0x57 */
> > + 0xd4, 0xd3, 0xc8, 0xbe, 0xbd, 0xbc, 0xc6, 0xc7, /* 0x58-0x5f */
> > + 0xcc, 0xb5, 0xb6, 0xb9, 0xd1, 0xd2, 0xcb, 0xcf, /* 0x60-0x67 */
> > + 0xd0, 0xca, 0xd8, 0xd7, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
> > +
> > + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */
> > + 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */
> > + 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
> > + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
> > + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
> > +};
> > +
> > +static const unsigned char *const uni2cp437_page[256] = {
> > + [0x00] = uni2cp437_page00, [0x01] = uni2cp437_page01,
> > + [0x03] = uni2cp437_page03, [0x20] = uni2cp437_page20,
> > + [0x22] = uni2cp437_page22, [0x23] = uni2cp437_page23,
> > + [0x25] = uni2cp437_page25,
> > +};
> > +
> > +/*
> > + * Convert a Unicode code point to its CP437 equivalent for
> > + * rendering with the VGA font.
> > + * Returns '?' for characters that cannot be mapped.
> > + */
> > +int unicode_to_cp437(uint32_t codepoint)
> > +{
> > + const unsigned char *page;
> > + unsigned char hi = (codepoint >> 8) & 0xff;
> > + unsigned char lo = codepoint & 0xff;
> > +
> > + if (codepoint > 0xffff) {
> > + return '?';
> > + }
> > +
> > + page = uni2cp437_page[hi];
> > + if (page && page[lo]) {
> > + return page[lo];
> > + }
> > +
> > + return '?';
> > +}
> > diff --git a/ui/meson.build b/ui/meson.build
> > index 69404bca71a..d4d9312b98c 100644
> > --- a/ui/meson.build
> > +++ b/ui/meson.build
> > @@ -16,7 +16,7 @@ system_ss.add(files(
> > 'ui-qmp-cmds.c',
> > 'util.c',
> > ))
> > -system_ss.add(when: pixman, if_true: files('console-vc.c'), if_false: files('console-vc-stubs.c'))
> > +system_ss.add(when: pixman, if_true: files('console-vc.c', 'cp437.c'), if_false: files('console-vc-stubs.c'))
> > if dbus_display
> > system_ss.add(files('dbus-module.c'))
> > endif
> >
> > --
> > 2.53.0
> >
> >
>
> With regards,
> Daniel
> --
> |: https://berrange.com ~~ https://hachyderm.io/@berrange :|
> |: https://libvirt.org ~~ https://entangle-photo.org :|
> |: https://pixelfed.art/berrange ~~ https://fstop138.berrange.com :|
>
>
--
Marc-André Lureau
On Tue, Mar 24, 2026 at 06:17:37PM +0400, Marc-André Lureau wrote:
> Hi
>
> On Tue, Mar 24, 2026 at 6:08 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Tue, Mar 17, 2026 at 12:50:25PM +0400, Marc-André Lureau wrote:
> > > The text console receives bytes that may be UTF-8 encoded (e.g. from
> > > a guest running a modern distro), but currently treats each byte as a
> > > raw character index into the VGA/CP437 font, producing garbled output
> > > for any multi-byte sequence.
> > >
> > > Add a proper UTF-8 decoder using Bjoern Hoehrmann's DFA.
> > > The DFA inherently rejects overlong encodings, surrogates, and
> > > codepoints above U+10FFFF. Completed codepoints are then mapped to
> > > CP437, unmappable characters are displayed as '?'.
> >
> > I'm surprised we can't do a charset conversion using GLib APIs ?
> >
> > Do the g_convert family of APIs (which IIUC wrap the distro iconv)
> > not do what we would want ? If not, would direct use of iconv not
> > be an alternative ?
> >
>
> I tried to use GIconv but ran into a number of issues, as it doesn't
> operate on character level, but strings. And it uses allocation etc. I
> didn't manage with iconv either.
Looking again, the g_utf8_validate function is /almost/ what we
want, but its API design collapses both "invalid utf8" and
"incomplete character" into the same error return value, so we
can't distinguish them to decide whether to wait for more bytes
or reset the state :-(
So yeah, I can see why this is needed now.
>
> > It feels pretty wrong to need to embed UTF8 decoding code in
> > QEMU
>
> Yes, but on a standalone qemu-vnc server, is it more acceptable?
IIUC, this will be linked into regular QEMU too, right?
> > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > ---
> > > ui/cp437.h | 13 ++++
> > > ui/console-vc.c | 62 +++++++++++++++++
> > > ui/cp437.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > ui/meson.build | 2 +-
> > > 4 files changed, 281 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/ui/cp437.h b/ui/cp437.h
> > > new file mode 100644
> > > index 00000000000..81ace8317c7
> > > --- /dev/null
> > > +++ b/ui/cp437.h
> > > @@ -0,0 +1,13 @@
> > > +/*
> > > + * SPDX-License-Identifier: GPL-2.0-or-later
> > > + *
> > > + * Copyright (c) QEMU contributors
> > > + */
> > > +#ifndef QEMU_CP437_H
> > > +#define QEMU_CP437_H
> > > +
> > > +#include <stdint.h>
This include shouldn't be required, since <stdint.h> is already pulled
in by osdep.h.
> > > +
> > > +int unicode_to_cp437(uint32_t codepoint);
Perhaps this would be better named qemu_unicode_to_cp437.
> > > +
> > > +#endif /* QEMU_CP437_H */
> > > diff --git a/ui/console-vc.c b/ui/console-vc.c
> > > index 8dee1f9bd01..7bbd65dea27 100644
> > > --- a/ui/console-vc.c
> > > +++ b/ui/console-vc.c
> > > @@ -9,6 +9,7 @@
> > > #include "qemu/fifo8.h"
> > > #include "qemu/option.h"
> > > #include "ui/console.h"
> > > +#include "ui/cp437.h"
> > >
> > > #include "trace.h"
> > > #include "console-priv.h"
> > > @@ -89,6 +90,8 @@ struct VCChardev {
> > > enum TTYState state;
> > > int esc_params[MAX_ESC_PARAMS];
> > > int nb_esc_params;
> > > + uint32_t utf8_state; /* UTF-8 DFA decoder state */
> > > + uint32_t utf8_codepoint; /* accumulated UTF-8 code point */
> > > TextAttributes t_attrib; /* currently active text attributes */
> > > TextAttributes t_attrib_saved;
> > > int x_saved, y_saved;
> > > @@ -598,6 +601,47 @@ static void vc_clear_xy(VCChardev *vc, int x, int y)
> > > vc_update_xy(vc, x, y);
> > > }
> > >
> > > +/*
> > > + * UTF-8 DFA decoder by Bjoern Hoehrmann.
> > > + * Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
> > > + * See https://github.com/polijan/utf8_decode for details.
> > > + *
> > > + * SPDX-License-Identifier: MIT
> > > + */
> > > +#define UTF8_ACCEPT 0
> > > +#define UTF8_REJECT 12
This is an awfully generic define name; could we use something with
QEMU_ as a prefix, to avoid the risk of clashes with any external
headers we import?
> > > +
> > > +static const uint8_t utf8d[] = {
> > > + /* character class lookup */
> > > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> > > + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
> > > + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
> > > + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> > > + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
> > > +
> > > + /* state transition lookup */
> > > + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
> > > + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
> > > + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
> > > + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
> > > + 12,36,12,12,12,12,12,12,12,12,12,12,
> > > +};
> > > +
> > > +static uint32_t utf8_decode(uint32_t *state, uint32_t *codep, uint32_t byte)
> > > +{
> > > + uint32_t type = utf8d[byte];
> > > +
> > > + *codep = (*state != UTF8_ACCEPT) ?
> > > + (byte & 0x3fu) | (*codep << 6) :
> > > + (0xffu >> type) & (byte);
> > > +
> > > + *state = utf8d[256 + *state + type];
> > > + return *state;
> > > +}
> > > +
> > > static void vc_put_one(VCChardev *vc, int ch)
> > > {
> > > QemuTextConsole *s = vc->console;
> > > @@ -761,6 +805,24 @@ static void vc_putchar(VCChardev *vc, int ch)
> > >
> > > switch(vc->state) {
> > > case TTY_STATE_NORM:
> > > + /* Feed byte through the UTF-8 DFA decoder */
> > > + if (ch >= 0x80) {
> > > + switch (utf8_decode(&vc->utf8_state, &vc->utf8_codepoint, ch)) {
> > > + case UTF8_ACCEPT:
> > > + vc_put_one(vc, unicode_to_cp437(vc->utf8_codepoint));
> > > + break;
> > > + case UTF8_REJECT:
> > > + /* Reset state so the decoder can resync */
> > > + vc->utf8_state = UTF8_ACCEPT;
> > > + break;
> > > + default:
> > > + /* Need more bytes */
> > > + break;
> > > + }
> > > + break;
> > > + }
> > > + /* ASCII byte: abort any pending UTF-8 sequence */
> > > + vc->utf8_state = UTF8_ACCEPT;
> > > switch(ch) {
> > > case '\r': /* carriage return */
> > > s->x = 0;
With regards,
Daniel
--
|: https://berrange.com ~~ https://hachyderm.io/@berrange :|
|: https://libvirt.org ~~ https://entangle-photo.org :|
|: https://pixelfed.art/berrange ~~ https://fstop138.berrange.com :|
© 2016 - 2026 Red Hat, Inc.