[PATCH v6 30/60] i386/tdx: Wire TDX_REPORT_FATAL_ERROR with GuestPanic facility

Xiaoyao Li posted 60 patches 2 weeks, 4 days ago
[PATCH v6 30/60] i386/tdx: Wire TDX_REPORT_FATAL_ERROR with GuestPanic facility
Posted by Xiaoyao Li 2 weeks, 4 days ago
Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility

Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
---
Changes in v6:
- change error_code of GuestPanicInformationTdx from uint64_t to
  uint32_t, to only contains the bit 31:0 returned in r12.

Changes in v5:
- mention additional error information in gpa when it presents;
- refine the documentation; (Markus)

Changes in v4:
- refine the documentation; (Markus)

Changes in v3:
- Add docmentation of new type and struct; (Daniel)
- refine the error message handling; (Daniel)
---
 qapi/run-state.json   | 31 +++++++++++++++++++++--
 system/runstate.c     | 58 +++++++++++++++++++++++++++++++++++++++++++
 target/i386/kvm/tdx.c | 24 +++++++++++++++++-
 3 files changed, 110 insertions(+), 3 deletions(-)

diff --git a/qapi/run-state.json b/qapi/run-state.json
index ce95cfa46b73..c5b0b747b30d 100644
--- a/qapi/run-state.json
+++ b/qapi/run-state.json
@@ -501,10 +501,12 @@
 #
 # @s390: s390 guest panic information type (Since: 2.12)
 #
+# @tdx: tdx guest panic information type (Since: 9.0)
+#
 # Since: 2.9
 ##
 { 'enum': 'GuestPanicInformationType',
-  'data': [ 'hyper-v', 's390' ] }
+  'data': [ 'hyper-v', 's390', 'tdx' ] }
 
 ##
 # @GuestPanicInformation:
@@ -519,7 +521,8 @@
  'base': {'type': 'GuestPanicInformationType'},
  'discriminator': 'type',
  'data': {'hyper-v': 'GuestPanicInformationHyperV',
-          's390': 'GuestPanicInformationS390'}}
+          's390': 'GuestPanicInformationS390',
+          'tdx' : 'GuestPanicInformationTdx'}}
 
 ##
 # @GuestPanicInformationHyperV:
@@ -598,6 +601,30 @@
           'psw-addr': 'uint64',
           'reason': 'S390CrashReason'}}
 
+##
+# @GuestPanicInformationTdx:
+#
+# TDX Guest panic information specific to TDX, as specified in the
+# "Guest-Hypervisor Communication Interface (GHCI) Specification",
+# section TDG.VP.VMCALL<ReportFatalError>.
+#
+# @error-code: TD-specific error code
+#
+# @message: Human-readable error message provided by the guest. Not
+#     to be trusted.
+#
+# @gpa: guest-physical address of a page that contains more verbose
+#     error information, as zero-terminated string.  Present when the
+#     "GPA valid" bit (bit 63) is set in @error-code.
+#
+#
+# Since: 9.0
+##
+{'struct': 'GuestPanicInformationTdx',
+ 'data': {'error-code': 'uint32',
+          'message': 'str',
+          '*gpa': 'uint64'}}
+
 ##
 # @MEMORY_FAILURE:
 #
diff --git a/system/runstate.c b/system/runstate.c
index c2c9afa905a6..9bb8162eb28f 100644
--- a/system/runstate.c
+++ b/system/runstate.c
@@ -565,6 +565,52 @@ static void qemu_system_wakeup(void)
     }
 }
 
+static char *tdx_parse_panic_message(char *message)
+{
+    bool printable = false;
+    char *buf = NULL;
+    int len = 0, i;
+
+    /*
+     * Although message is defined as a json string, we shouldn't
+     * unconditionally treat it as is because the guest generated it and
+     * it's not necessarily trustable.
+     */
+    if (message) {
+        /* The caller guarantees the NUL-terminated string. */
+        len = strlen(message);
+
+        printable = len > 0;
+        for (i = 0; i < len; i++) {
+            if (!(0x20 <= message[i] && message[i] <= 0x7e)) {
+                printable = false;
+                break;
+            }
+        }
+    }
+
+    if (!printable && len) {
+        /* 3 = length of "%02x " */
+        buf = g_malloc(len * 3);
+        for (i = 0; i < len; i++) {
+            if (message[i] == '\0') {
+                break;
+            } else {
+                sprintf(buf + 3 * i, "%02x ", message[i]);
+            }
+        }
+        if (i > 0)
+            /* replace the last ' '(space) to NUL */
+            buf[i * 3 - 1] = '\0';
+        else
+            buf[0] = '\0';
+
+        return buf;
+    }
+
+    return message;
+}
+
 void qemu_system_guest_panicked(GuestPanicInformation *info)
 {
     qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed");
@@ -606,7 +652,19 @@ void qemu_system_guest_panicked(GuestPanicInformation *info)
                           S390CrashReason_str(info->u.s390.reason),
                           info->u.s390.psw_mask,
                           info->u.s390.psw_addr);
+        } else if (info->type == GUEST_PANIC_INFORMATION_TYPE_TDX) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "\nTDX guest reports fatal error:"
+                          " error code: 0x%" PRIx32 " error message:\"%s\"\n",
+                          info->u.tdx.error_code,
+                          tdx_parse_panic_message(info->u.tdx.message));
+            if (info->u.tdx.gpa != -1ull) {
+                qemu_log_mask(LOG_GUEST_ERROR, "Additional error information "
+                              "can be found at gpa page: 0x%" PRIx64 "\n",
+                              info->u.tdx.gpa);
+            }
         }
+
         qapi_free_GuestPanicInformation(info);
     }
 }
diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
index 3f44dfbf6585..394f1d75dc0d 100644
--- a/target/i386/kvm/tdx.c
+++ b/target/i386/kvm/tdx.c
@@ -17,6 +17,7 @@
 #include "qemu/mmap-alloc.h"
 #include "qapi/error.h"
 #include "qom/object_interfaces.h"
+#include "sysemu/runstate.h"
 #include "sysemu/sysemu.h"
 #include "exec/ramblock.h"
 
@@ -574,10 +575,25 @@ int tdx_parse_tdvf(void *flash_ptr, int size)
     return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size);
 }
 
+static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code,
+                                        char *message, uint64_t gpa)
+{
+    GuestPanicInformation *panic_info;
+
+    panic_info = g_new0(GuestPanicInformation, 1);
+    panic_info->type = GUEST_PANIC_INFORMATION_TYPE_TDX;
+    panic_info->u.tdx.error_code = (uint32_t) error_code;
+    panic_info->u.tdx.message = message;
+    panic_info->u.tdx.gpa = gpa;
+
+    qemu_system_guest_panicked(panic_info);
+}
+
 int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run)
 {
     uint64_t error_code = run->system_event.data[0];
     char *message = NULL;
+    uint64_t gpa = -1ull;
 
     if (error_code & 0xffff) {
         error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%lx",
@@ -594,7 +610,13 @@ int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run)
         message[TDX_FATAL_MESSAGE_MAX] = '\0';
     }
 
-    error_report("TD guest reports fatal error. %s", message ? : "");
+#define TDX_REPORT_FATAL_ERROR_GPA_VALID    BIT_ULL(63)
+    if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) {
+        gpa = run->system_event.data[1];
+    }
+
+    tdx_panicked_on_fatal_error(cpu, error_code, message, gpa);
+
     return -1;
 }
 
-- 
2.34.1
Re: [PATCH v6 30/60] i386/tdx: Wire TDX_REPORT_FATAL_ERROR with GuestPanic facility
Posted by Daniel P. Berrangé 2 weeks, 4 days ago
On Tue, Nov 05, 2024 at 01:23:38AM -0500, Xiaoyao Li wrote:
> Integrate TDX's TDX_REPORT_FATAL_ERROR into QEMU GuestPanic facility
> 
> Originated-from: Isaku Yamahata <isaku.yamahata@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
> ---
> Changes in v6:
> - change error_code of GuestPanicInformationTdx from uint64_t to
>   uint32_t, to only contains the bit 31:0 returned in r12.
> 
> Changes in v5:
> - mention additional error information in gpa when it presents;
> - refine the documentation; (Markus)
> 
> Changes in v4:
> - refine the documentation; (Markus)
> 
> Changes in v3:
> - Add docmentation of new type and struct; (Daniel)
> - refine the error message handling; (Daniel)
> ---
>  qapi/run-state.json   | 31 +++++++++++++++++++++--
>  system/runstate.c     | 58 +++++++++++++++++++++++++++++++++++++++++++
>  target/i386/kvm/tdx.c | 24 +++++++++++++++++-
>  3 files changed, 110 insertions(+), 3 deletions(-)
> 
> diff --git a/qapi/run-state.json b/qapi/run-state.json
> index ce95cfa46b73..c5b0b747b30d 100644
> --- a/qapi/run-state.json
> +++ b/qapi/run-state.json
> @@ -501,10 +501,12 @@
>  #
>  # @s390: s390 guest panic information type (Since: 2.12)
>  #
> +# @tdx: tdx guest panic information type (Since: 9.0)
> +#
>  # Since: 2.9
>  ##
>  { 'enum': 'GuestPanicInformationType',
> -  'data': [ 'hyper-v', 's390' ] }
> +  'data': [ 'hyper-v', 's390', 'tdx' ] }
>  
>  ##
>  # @GuestPanicInformation:
> @@ -519,7 +521,8 @@
>   'base': {'type': 'GuestPanicInformationType'},
>   'discriminator': 'type',
>   'data': {'hyper-v': 'GuestPanicInformationHyperV',
> -          's390': 'GuestPanicInformationS390'}}
> +          's390': 'GuestPanicInformationS390',
> +          'tdx' : 'GuestPanicInformationTdx'}}
>  
>  ##
>  # @GuestPanicInformationHyperV:
> @@ -598,6 +601,30 @@
>            'psw-addr': 'uint64',
>            'reason': 'S390CrashReason'}}
>  
> +##
> +# @GuestPanicInformationTdx:
> +#
> +# TDX Guest panic information specific to TDX, as specified in the
> +# "Guest-Hypervisor Communication Interface (GHCI) Specification",
> +# section TDG.VP.VMCALL<ReportFatalError>.
> +#
> +# @error-code: TD-specific error code
> +#
> +# @message: Human-readable error message provided by the guest. Not
> +#     to be trusted.
> +#
> +# @gpa: guest-physical address of a page that contains more verbose
> +#     error information, as zero-terminated string.  Present when the
> +#     "GPA valid" bit (bit 63) is set in @error-code.
> +#
> +#
> +# Since: 9.0

This is very outdated. Change to 10.0 as the next possible release
it could land it.

> +##
> +{'struct': 'GuestPanicInformationTdx',
> + 'data': {'error-code': 'uint32',
> +          'message': 'str',
> +          '*gpa': 'uint64'}}
> +
>  ##
>  # @MEMORY_FAILURE:
>  #
> diff --git a/system/runstate.c b/system/runstate.c
> index c2c9afa905a6..9bb8162eb28f 100644
> --- a/system/runstate.c
> +++ b/system/runstate.c
> @@ -565,6 +565,52 @@ static void qemu_system_wakeup(void)
>      }
>  }
>  
> +static char *tdx_parse_panic_message(char *message)
> +{
> +    bool printable = false;
> +    char *buf = NULL;
> +    int len = 0, i;
> +
> +    /*
> +     * Although message is defined as a json string, we shouldn't
> +     * unconditionally treat it as is because the guest generated it and
> +     * it's not necessarily trustable.
> +     */
> +    if (message) {
> +        /* The caller guarantees the NUL-terminated string. */
> +        len = strlen(message);
> +
> +        printable = len > 0;
> +        for (i = 0; i < len; i++) {
> +            if (!(0x20 <= message[i] && message[i] <= 0x7e)) {
> +                printable = false;
> +                break;
> +            }
> +        }
> +    }
> +
> +    if (!printable && len) {
> +        /* 3 = length of "%02x " */
> +        buf = g_malloc(len * 3);

....allocating memory

> +        for (i = 0; i < len; i++) {
> +            if (message[i] == '\0') {
> +                break;
> +            } else {
> +                sprintf(buf + 3 * i, "%02x ", message[i]);
> +            }
> +        }
> +        if (i > 0)
> +            /* replace the last ' '(space) to NUL */
> +            buf[i * 3 - 1] = '\0';
> +        else
> +            buf[0] = '\0';
> +
> +        return buf;

....returning alllocated memory

> +    }
> +
> +    return message;

....returning a pointer that came from a struct field

> +}

This is a bad design - we should require the caller to always
free memory, or never free memory - not a mix.

> +
>  void qemu_system_guest_panicked(GuestPanicInformation *info)
>  {
>      qemu_log_mask(LOG_GUEST_ERROR, "Guest crashed");
> @@ -606,7 +652,19 @@ void qemu_system_guest_panicked(GuestPanicInformation *info)
>                            S390CrashReason_str(info->u.s390.reason),
>                            info->u.s390.psw_mask,
>                            info->u.s390.psw_addr);
> +        } else if (info->type == GUEST_PANIC_INFORMATION_TYPE_TDX) {
> +            qemu_log_mask(LOG_GUEST_ERROR,
> +                          "\nTDX guest reports fatal error:"
> +                          " error code: 0x%" PRIx32 " error message:\"%s\"\n",
> +                          info->u.tdx.error_code,
> +                          tdx_parse_panic_message(info->u.tdx.message));

This is a leak in the case where tdx_parse_panic_message() returned
allocated memory.

> +            if (info->u.tdx.gpa != -1ull) {
> +                qemu_log_mask(LOG_GUEST_ERROR, "Additional error information "
> +                              "can be found at gpa page: 0x%" PRIx64 "\n",
> +                              info->u.tdx.gpa);
> +            }
>          }
> +
>          qapi_free_GuestPanicInformation(info);
>      }
>  }

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|