[RFC PATCH] tcg, apic: create a separate root memory region for each CPU

Bui Quang Minh posted 1 patch 7 months ago
Failed in applying to current master (apply log)
hw/intc/apic.c                   | 24 ++++++++++++++++++++++--
hw/intc/ioapic.c                 | 32 +++++++++++++++++++++++++++-----
hw/pci/pci.c                     | 25 +++++++++++++++++++++++--
include/hw/i386/apic.h           |  1 +
target/i386/cpu-sysemu.c         | 14 +++++++++++---
target/i386/tcg/sysemu/tcg-cpu.c | 13 +++++++++++++
6 files changed, 97 insertions(+), 12 deletions(-)
[RFC PATCH] tcg, apic: create a separate root memory region for each CPU
Posted by Bui Quang Minh 7 months ago
Currently, by default, every TCG cpu has root memory region to the same
system memory region. In order to support APIC MMIO relocation as well as
correctly disable APIC MMIO in disabled and x2APIC state, in this commit,
we create a separate root memory region for every CPU. The system memory
region is added to this container root memory region, later an separate
APIC MMIO is added with higher priority than system memory . With separate
APIC MMIO region per CPU, APIC MMIO relocation and disable can be done per
CPU without interfering without others.

Because the MSI base address is the same as APIC MMIO, the MMIO region
currently serves 2 purposes: APIC register access and MSI handler. In case
no interrupt remapping device is used, devices send MSI by writing to
0xfee00000 in system memory. However, as we move APIC MMIO out of system
memory we need to change the device code to call apic_send_msi instead of
writing to system memory.

This commit passes the APIC MMIO relocation test in kvm-unit-tests, still
fails APIC disable, however, I think we should treat those as pass

Before:
	FAIL: apic_disable: *0xfee00030: 50014
	FAIL: apic_disable: *0xfee00080: f0
	FAIL: apic_disable: *0xfee00030: 50014
	FAIL: apic_disable: *0xfee00080: f0
After:
	FAIL: apic_disable: *0xfee00030: 0
	FAIL: apic_disable: *0xfee00080: 0
	FAIL: apic_disable: *0xfee00030: 0
	FAIL: apic_disable: *0xfee00080: 0

Before this commit, we still can read APIC register, after this commit we
cannot. However, the test memset disabled MMIO region with 0xff and expects
to read back the 0xff value. As we disable APIC MMIO memory region, the
write is dispatched to system memory which has unassigned read/write
operation, so read returns 0 and write has no effect.

This commit is tested on booting up Linux kernel with and without Intel/AMD
IOMMU on enabled/disabled x2APIC CPUs.

The memory region tree when running without interrupt remapping device

	~ info mtree
Before:
	address-space: cpu-memory-0
	address-space: cpu-memory-1
	address-space: memory
	  0000000000000000-ffffffffffffffff (prio 0, i/o): system
After:
	address-space: cpu-memory-0
	  0000000000000000-ffffffffffffffff (prio 0, i/o): memory
	    0000000000000000-ffffffffffffffff (prio 0, i/o): alias memory
	      @system 0000000000000000-ffffffffffffffff
	    00000000fee00000-00000000feefffff (prio 4096, i/o): apic-msi

	address-space: cpu-memory-1
	  0000000000000000-ffffffffffffffff (prio 0, i/o): memory
	    0000000000000000-ffffffffffffffff (prio 0, i/o): alias memory
	      @system 0000000000000000-ffffffffffffffff
	    00000000fee00000-00000000feefffff (prio 4096, i/o): apic-msi

Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
  hw/intc/apic.c                   | 24 ++++++++++++++++++++++--
  hw/intc/ioapic.c                 | 32 +++++++++++++++++++++++++++-----
  hw/pci/pci.c                     | 25 +++++++++++++++++++++++--
  include/hw/i386/apic.h           |  1 +
  target/i386/cpu-sysemu.c         | 14 +++++++++++---
  target/i386/tcg/sysemu/tcg-cpu.c | 13 +++++++++++++
  6 files changed, 97 insertions(+), 12 deletions(-)

diff --git a/hw/intc/apic.c b/hw/intc/apic.c
index b8f56836a6..0bd5b5d1f9 100644
--- a/hw/intc/apic.c
+++ b/hw/intc/apic.c
@@ -340,17 +340,34 @@ static void apic_set_base_check(APICCommonState 
*s, uint64_t val)
          raise_exception_ra(&s->cpu->env, EXCP0D_GPF, GETPC());
  }

+/* set base address and enable/disable APIC MMIO */
+static void apic_io_memory_set_base_enabled(APICCommonState *s,
+                                            hwaddr address,
+                                            bool enabled)
+{
+    if (tcg_enabled()) {
+        qemu_mutex_lock_iothread();
+        memory_region_set_address(&s->io_memory, address);
+        memory_region_set_enabled(&s->io_memory, enabled);
+        qemu_mutex_unlock_iothread();
+    }
+}
+
  static void apic_set_base(APICCommonState *s, uint64_t val)
  {
+    hwaddr new_base = val & MSR_IA32_APICBASE_BASE;
+    bool enabled = true;
+
      apic_set_base_check(s, val);

-    s->apicbase = (val & 0xfffff000) |
+    s->apicbase = new_base |
          (s->apicbase & (MSR_IA32_APICBASE_BSP | 
MSR_IA32_APICBASE_ENABLE));
      /* if disabled, cannot be enabled again */
      if (!(val & MSR_IA32_APICBASE_ENABLE)) {
          s->apicbase &= ~MSR_IA32_APICBASE_ENABLE;
          cpu_clear_apic_feature(&s->cpu->env);
          s->spurious_vec &= ~APIC_SV_ENABLE;
+        enabled = false;
      }

      /* Transition from disabled mode to xAPIC */
@@ -368,7 +385,10 @@ static void apic_set_base(APICCommonState *s, 
uint64_t val)

          s->log_dest = ((s->initial_apic_id & 0xffff0) << 16) |
                        (1 << (s->initial_apic_id & 0xf));
+        enabled = false;
      }
+
+    apic_io_memory_set_base_enabled(s, new_base, enabled);
  }

  static void apic_set_tpr(APICCommonState *s, uint8_t val)
@@ -889,7 +909,7 @@ static uint64_t apic_mem_read(void *opaque, hwaddr 
addr, unsigned size)
      return val;
  }

-static void apic_send_msi(MSIMessage *msi)
+void apic_send_msi(MSIMessage *msi)
  {
      uint64_t addr = msi->address;
      uint32_t data = msi->data;
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 716ffc8bbb..8e85a3165a 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -35,6 +35,7 @@
  #include "hw/i386/apic-msidef.h"
  #include "hw/i386/x86-iommu.h"
  #include "trace.h"
+#include "hw/i386/apic.h"

  #define APIC_DELIVERY_MODE_SHIFT 8
  #define APIC_POLARITY_SHIFT 14
@@ -131,11 +132,32 @@ static void ioapic_service(IOAPICCommonState *s)
                  }
  #endif

-                /* No matter whether IR is enabled, we translate
-                 * the IOAPIC message into a MSI one, and its
-                 * address space will decide whether we need a
-                 * translation. */
-                stl_le_phys(ioapic_as, info.addr, info.data);
+                /*
+                 * We reuse the APIC MMIO memory region for MSI because
+                 * by default, they are at the same base address but
+                 * the register addresses are not overlapped. However,
+                 * in TCG we try to support APIC MMIO relocation and
+                 * disable per cpu APIC MMIO when that CPU switches
+                 * to different APIC state. So in case, when using TCG
+                 * and the no interrupt remapping device is used, we
+                 * call apic_send_msi to send MSI to APIC directly
+                 * instead of writing to MSI address.
+                 */
+                if (tcg_enabled() && ioapic_as == &address_space_memory) {
+                    MSIMessage msg = {
+                        .address = info.addr,
+                        .data = info.data,
+                    };
+                    apic_send_msi(&msg);
+                } else {
+                    /*
+                     * No matter whether IR is enabled, we translate
+                     * the IOAPIC message into a MSI one, and its
+                     * address space will decide whether we need a
+                     * translation.
+                     */
+                    stl_le_phys(ioapic_as, info.addr, info.data);
+                }
              }
          }
      }
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 881d774fb6..b680c3869b 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -50,6 +50,9 @@
  #include "qemu/cutils.h"
  #include "pci-internal.h"

+#include "sysemu/tcg.h"
+#include "hw/i386/apic.h"
+
  #include "hw/xen/xen.h"
  #include "hw/i386/kvm/xen_evtchn.h"

@@ -346,6 +349,7 @@ void pci_device_deassert_intx(PCIDevice *dev)
  static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
  {
      MemTxAttrs attrs = {};
+    AddressSpace *dma_as = pci_device_iommu_address_space(dev);

      /*
       * Xen uses the high bits of the address to contain some of the bits
@@ -359,8 +363,25 @@ static void pci_msi_trigger(PCIDevice *dev, 
MSIMessage msg)
          return;
      }
      attrs.requester_id = pci_requester_id(dev);
-    address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
-                         attrs, NULL);
+
+
+    /*
+     * We reuse the APIC MMIO memory region for MSI because
+     * by default, they are at the same base address but
+     * the register addresses are not overlapped. However,
+     * in TCG we try to support APIC MMIO relocation and
+     * disable per cpu APIC MMIO when that CPU switches
+     * to different APIC state. So in case, when using TCG
+     * and the no interrupt remapping device is used, we
+     * call apic_send_msi to send MSI to APIC directly
+     * instead of writing to MSI address.
+     */
+    if (tcg_enabled() && dma_as == &address_space_memory) {
+        apic_send_msi(&msg);
+    } else {
+        address_space_stl_le(&dev->bus_master_as, msg.address, msg.data,
+                             attrs, NULL);
+    }
  }

  static void pci_reset_regions(PCIDevice *dev)
diff --git a/include/hw/i386/apic.h b/include/hw/i386/apic.h
index 12aad09f4c..045eb96d0e 100644
--- a/include/hw/i386/apic.h
+++ b/include/hw/i386/apic.h
@@ -20,6 +20,7 @@ int apic_get_highest_priority_irr(DeviceState *dev);
  uint64_t apic_register_read(int index);
  void apic_register_write(int index, uint64_t val);
  bool is_x2apic_mode(DeviceState *d);
+void apic_send_msi(MSIMessage *msi);

  /* pc.c */
  DeviceState *cpu_get_current_apic(void);
diff --git a/target/i386/cpu-sysemu.c b/target/i386/cpu-sysemu.c
index 7422096737..a4c10591ef 100644
--- a/target/i386/cpu-sysemu.c
+++ b/target/i386/cpu-sysemu.c
@@ -316,14 +316,22 @@ void x86_cpu_apic_realize(X86CPU *cpu, Error **errp)

      /* Map APIC MMIO area */
      apic = APIC_COMMON(cpu->apic_state);
-    if (!apic_mmio_map_once) {
+    if (tcg_enabled()) {
+        /*
+         * Map the APIC MMIO to the private root memory of the CPU
+         */
+        memory_region_add_subregion_overlap(cpu->parent_obj.memory,
+                                            apic->apicbase &
+                                            MSR_IA32_APICBASE_BASE,
+                                            &apic->io_memory,
+                                            0x1000);
+    } else if (!apic_mmio_map_once) {
          memory_region_add_subregion_overlap(get_system_memory(),
                                              apic->apicbase &
                                              MSR_IA32_APICBASE_BASE,
                                              &apic->io_memory,
                                              0x1000);
-        apic_mmio_map_once = true;
-     }
+    }
  }

  GuestPanicInformation *x86_cpu_get_crash_info(CPUState *cs)
diff --git a/target/i386/tcg/sysemu/tcg-cpu.c 
b/target/i386/tcg/sysemu/tcg-cpu.c
index c223c0fe9b..7731c422b9 100644
--- a/target/i386/tcg/sysemu/tcg-cpu.c
+++ b/target/i386/tcg/sysemu/tcg-cpu.c
@@ -24,6 +24,7 @@
  #include "sysemu/sysemu.h"
  #include "qemu/units.h"
  #include "exec/address-spaces.h"
+#include "hw/i386/x86.h"

  #include "tcg/tcg-cpu.h"

@@ -46,6 +47,7 @@ static void tcg_cpu_machine_done(Notifier *n, void 
*unused)
  bool tcg_cpu_realizefn(CPUState *cs, Error **errp)
  {
      X86CPU *cpu = X86_CPU(cs);
+    MemoryRegion *root_memory, *system_memory_alias;

      /*
       * The realize order is important, since x86_cpu_realize() checks if
@@ -72,6 +74,17 @@ bool tcg_cpu_realizefn(CPUState *cs, Error **errp)
      memory_region_add_subregion_overlap(cpu->cpu_as_root, 0, 
cpu->cpu_as_mem, 0);
      memory_region_set_enabled(cpu->cpu_as_mem, true);

+    root_memory = g_new(MemoryRegion, 1);
+    memory_region_init(root_memory, OBJECT(cs), "memory", ~0ull);
+
+    object_property_set_link(OBJECT(cs), "memory", OBJECT(root_memory), 
errp);
+
+    system_memory_alias = g_new(MemoryRegion, 1);
+    memory_region_init_alias(system_memory_alias, OBJECT(cpu), "memory",
+                             get_system_memory(), 0, ~0ull);
+
+    memory_region_add_subregion_overlap(cs->memory, 0, 
system_memory_alias, 0);
+
      cs->num_ases = 2;
      cpu_address_space_init(cs, 0, "cpu-memory", cs->memory);
      cpu_address_space_init(cs, 1, "cpu-smm", cpu->cpu_as_root);
-- 
2.25.1