On 3/23/2024 2:11 AM, Paolo Bonzini wrote:
> From: Chao Peng <chao.p.peng@linux.intel.com>
>
> When geeting KVM_EXIT_MEMORY_FAULT exit, it indicates userspace needs to
Typo: s/geeting/getting/
> do the memory conversion on the RAMBlock to turn the memory into desired
> attribute, i.e., private/shared.
>
> Currently only KVM_MEMORY_EXIT_FLAG_PRIVATE in flags is valid when
> KVM_EXIT_MEMORY_FAULT happens.
>
> Note, KVM_EXIT_MEMORY_FAULT makes sense only when the RAMBlock has
> guest_memfd memory backend.
>
> Note, KVM_EXIT_MEMORY_FAULT returns with -EFAULT, so special handling is
> added.
>
> When page is converted from shared to private, the original shared
> memory can be discarded via ram_block_discard_range(). Note, shared
> memory can be discarded only when it's not back'ed by hugetlb because
> hugetlb is supposed to be pre-allocated and no need for discarding.
>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> Co-developed-by: Xiaoyao Li <xiaoyao.li@intel.com>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
>
> Message-ID: <20240320083945.991426-13-michael.roth@amd.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> include/sysemu/kvm.h | 2 +
> accel/kvm/kvm-all.c | 99 +++++++++++++++++++++++++++++++++++++-----
> accel/kvm/trace-events | 2 +
> 3 files changed, 93 insertions(+), 10 deletions(-)
>
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 2cb31925091..698f1640fe2 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -541,4 +541,6 @@ int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp);
>
> int kvm_set_memory_attributes_private(hwaddr start, hwaddr size);
> int kvm_set_memory_attributes_shared(hwaddr start, hwaddr size);
> +
> +int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private);
> #endif
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 56b17cbd8aa..afd7f992e39 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2893,6 +2893,70 @@ static void kvm_eat_signals(CPUState *cpu)
> } while (sigismember(&chkset, SIG_IPI));
> }
>
> +int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
> +{
> + MemoryRegionSection section;
> + ram_addr_t offset;
> + MemoryRegion *mr;
> + RAMBlock *rb;
> + void *addr;
> + int ret = -1;
> +
> + trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
> +
> + if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
> + !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
> + return -1;
> + }
> +
> + if (!size) {
> + return -1;
> + }
> +
> + section = memory_region_find(get_system_memory(), start, size);
> + mr = section.mr;
> + if (!mr) {
> + return -1;
> + }
> +
> + if (!memory_region_has_guest_memfd(mr)) {
> + error_report("Converting non guest_memfd backed memory region "
> + "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
> + start, size, to_private ? "private" : "shared");
> + ret = -1;
This assignment is unnecessary: ret is already initialized to -1 at the start of the function.
> + goto out_unref;
> + }
> +
> + if (to_private) {
> + ret = kvm_set_memory_attributes_private(start, size);
> + } else {
> + ret = kvm_set_memory_attributes_shared(start, size);
> + }
> + if (ret) {
> + goto out_unref;
> + }
> +
> + addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
> + rb = qemu_ram_block_from_host(addr, false, &offset);
> +
> + if (to_private) {
> + if (rb->page_size == qemu_real_host_page_size()) {
> + /*
> + * shared memory is back'ed by hugetlb, which is supposed to be
Please fix the bad indentation of this comment, as well as the extra
space before 'hugetlb'.
> + * pre-allocated and doesn't need to be discarded
> + */
> + goto out_unref;
> + }
> + ret = ram_block_discard_range(rb, offset, size);
> + } else {
> + ret = ram_block_discard_guest_memfd_range(rb, offset, size);
> + }
> +
> +out_unref:
> + memory_region_unref(section.mr);
> + return ret;
> +}
> +
> int kvm_cpu_exec(CPUState *cpu)
> {
> struct kvm_run *run = cpu->kvm_run;
> @@ -2960,18 +3024,20 @@ int kvm_cpu_exec(CPUState *cpu)
> ret = EXCP_INTERRUPT;
> break;
> }
> - fprintf(stderr, "error: kvm run failed %s\n",
> - strerror(-run_ret));
> + if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
> + fprintf(stderr, "error: kvm run failed %s\n",
> + strerror(-run_ret));
> #ifdef TARGET_PPC
> - if (run_ret == -EBUSY) {
> - fprintf(stderr,
> - "This is probably because your SMT is enabled.\n"
> - "VCPU can only run on primary threads with all "
> - "secondary threads offline.\n");
> - }
> + if (run_ret == -EBUSY) {
> + fprintf(stderr,
> + "This is probably because your SMT is enabled.\n"
> + "VCPU can only run on primary threads with all "
> + "secondary threads offline.\n");
> + }
> #endif
> - ret = -1;
> - break;
> + ret = -1;
> + break;
> + }
> }
>
> trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
> @@ -3054,6 +3120,19 @@ int kvm_cpu_exec(CPUState *cpu)
> break;
> }
> break;
> + case KVM_EXIT_MEMORY_FAULT:
> + trace_kvm_memory_fault(run->memory_fault.gpa,
> + run->memory_fault.size,
> + run->memory_fault.flags);
> + if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
> + error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
> + (uint64_t)run->memory_fault.flags);
> + ret = -1;
> + break;
> + }
> + ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
> + run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
> + break;
> default:
> ret = kvm_arch_handle_exit(cpu, run);
> break;
> diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
> index e8c52cb9e7a..681ccb667d6 100644
> --- a/accel/kvm/trace-events
> +++ b/accel/kvm/trace-events
> @@ -31,3 +31,5 @@ kvm_cpu_exec(void) ""
> kvm_interrupt_exit_request(void) ""
> kvm_io_window_exit(void) ""
> kvm_run_exit_system_event(int cpu_index, uint32_t event_type) "cpu_index %d, system_even_type %"PRIu32
> +kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s"
> +kvm_memory_fault(uint64_t start, uint64_t size, uint64_t flags) "start 0x%" PRIx64 " size 0x%" PRIx64 " flags 0x%" PRIx64