From: Svetly Todorov <svetly.todorov@memverge.com>
Using a shared-memory bytemap, validates that DC adds, releases,
and reclamations happen on extents belonging to the appropriate
host.
The MHSLD device inherits from the CXL_TYPE3 class and adds the following
configuration options:
--mhd-head=<u32>
--mhd-state_file=<str>
--mhd-init=<bool>
--mhd-head specifies the head ID of the host on the given device.
--mhd-state_file is the name of the shared-memory-backed file used
to store the MHD state.
--mhd-init indicates whether this QEMU instance should initialize
the state_file; if so, the instance will create the file if it does
not exist, ftruncate it to the appropriate size, and initialize its
header. It is assumed that the --mhd-init instance is run and allowed
to completely finish configuration before any other guests access the
shared state.
The shared state file only needs to be intialized once. Even if a guest
dies without clearing the ownership bits associated with its head-ID,
future guests with that ID will clear those bits in cxl_mhsld_realize(),
regardless of whether mhd_init is true or false.
The following command line options create an MHSLD with 4GB of
backing memory, whose state is tracked in /dev/shm/mhd_metadata.
--mhd-init=true tells this instance to initialize the state as
described above.
./qemu-system_x86-64 \
[... other options ...] \
-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \
-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \
-object memory-backend-ram,id=mem0,size=4G \
-device cxl-mhsld,bus=rp0,num-dc-regions=1,volatile-dc-memdev=mem0,id=cxl-mem0,sn=66667,mhd-head=0,mhd-state_file=mhd_metadata,mhd-init=true \
-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G \
-qmp unix:/tmp/qmp-sock-1,server,nowait
Once this guest completes setup, other guests looking to access the
device can be booted with the same configuration options, but with
--mhd-head != 0,
--mhd-init=false,
and a different QMP socket.
Signed-off-by: Gregory Price <gourry@gourry.net>
Signed-off-by: Svetly Todorov <svetly.todorov@memverge.com>
---
hw/cxl/Kconfig | 1 +
hw/cxl/meson.build | 1 +
hw/cxl/mhsld/Kconfig | 4 +
hw/cxl/mhsld/meson.build | 3 +
hw/cxl/mhsld/mhsld.c | 456 +++++++++++++++++++++++++++++++++++++++
hw/cxl/mhsld/mhsld.h | 75 +++++++
6 files changed, 540 insertions(+)
create mode 100644 hw/cxl/mhsld/Kconfig
create mode 100644 hw/cxl/mhsld/meson.build
create mode 100644 hw/cxl/mhsld/mhsld.c
create mode 100644 hw/cxl/mhsld/mhsld.h
diff --git a/hw/cxl/Kconfig b/hw/cxl/Kconfig
index e603839a62..919e59b598 100644
--- a/hw/cxl/Kconfig
+++ b/hw/cxl/Kconfig
@@ -1,3 +1,4 @@
+source mhsld/Kconfig
source vendor/Kconfig
config CXL
diff --git a/hw/cxl/meson.build b/hw/cxl/meson.build
index e8c8c1355a..394750dd19 100644
--- a/hw/cxl/meson.build
+++ b/hw/cxl/meson.build
@@ -16,4 +16,5 @@ system_ss.add(when: 'CONFIG_I2C_MCTP_CXL', if_true: files('i2c_mctp_cxl.c'))
system_ss.add(when: 'CONFIG_ALL', if_true: files('cxl-host-stubs.c'))
+subdir('mhsld')
subdir('vendor')
diff --git a/hw/cxl/mhsld/Kconfig b/hw/cxl/mhsld/Kconfig
new file mode 100644
index 0000000000..dc2be15140
--- /dev/null
+++ b/hw/cxl/mhsld/Kconfig
@@ -0,0 +1,4 @@
+config CXL_MHSLD
+ bool
+ depends on CXL_MEM_DEVICE
+ default y
diff --git a/hw/cxl/mhsld/meson.build b/hw/cxl/mhsld/meson.build
new file mode 100644
index 0000000000..c595558f8a
--- /dev/null
+++ b/hw/cxl/mhsld/meson.build
@@ -0,0 +1,3 @@
+if host_os == 'linux'
+ system_ss.add(when: 'CONFIG_CXL_MHSLD', if_true: files('mhsld.c',))
+endif
diff --git a/hw/cxl/mhsld/mhsld.c b/hw/cxl/mhsld/mhsld.c
new file mode 100644
index 0000000000..2a3023607e
--- /dev/null
+++ b/hw/cxl/mhsld/mhsld.c
@@ -0,0 +1,456 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (c) 2024 MemVerge Inc.
+ *
+ */
+
+#include <sys/file.h>
+#include "qemu/osdep.h"
+#include "qemu/bitmap.h"
+#include "hw/irq.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "hw/cxl/cxl.h"
+#include "hw/cxl/cxl_mailbox.h"
+#include "hw/cxl/cxl_device.h"
+#include "hw/pci/pcie.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/qdev-properties.h"
+#include "sysemu/hostmem.h"
+#include "mhsld.h"
+
+#define TYPE_CXL_MHSLD "cxl-mhsld"
+OBJECT_DECLARE_TYPE(CXLMHSLDState, CXLMHSLDClass, CXL_MHSLD)
+
+/*
+ * CXL r3.0 section 7.6.7.5.1 - Get Multi-Headed Info (Opcode 5500h)
+ *
+ * This command retrieves the number of heads, number of supported LDs,
+ * and Head-to-LD mapping of a Multi-Headed device.
+ */
+static CXLRetCode cmd_mhd_get_info(const struct cxl_cmd *cmd,
+ uint8_t *payload_in, size_t len_in,
+ uint8_t *payload_out, size_t *len_out,
+ CXLCCI * cci)
+{
+ CXLMHSLDState *s = CXL_MHSLD(cci->d);
+ MHDGetInfoInput *input = (void *)payload_in;
+ MHDGetInfoOutput *output = (void *)payload_out;
+
+ uint8_t start_ld = input->start_ld;
+ uint8_t ldmap_len = input->ldmap_len;
+ uint8_t i;
+
+ if (start_ld >= s->mhd_state->nr_lds) {
+ return CXL_MBOX_INVALID_INPUT;
+ }
+
+ output->nr_lds = s->mhd_state->nr_lds;
+ output->nr_heads = s->mhd_state->nr_heads;
+ output->resv1 = 0;
+ output->start_ld = start_ld;
+ output->resv2 = 0;
+
+ for (i = 0; i < ldmap_len && (start_ld + i) < output->nr_lds; i++) {
+ output->ldmap[i] = s->mhd_state->ldmap[start_ld + i];
+ }
+ output->ldmap_len = i;
+
+ *len_out = sizeof(*output) + output->ldmap_len;
+ return CXL_MBOX_SUCCESS;
+}
+
+static const struct cxl_cmd cxl_cmd_set_mhsld[256][256] = {
+ [MHSLD_MHD][GET_MHD_INFO] = {"GET_MULTI_HEADED_INFO",
+ cmd_mhd_get_info, 2, 0},
+};
+
+static Property cxl_mhsld_props[] = {
+ DEFINE_PROP_UINT32("mhd-head", CXLMHSLDState, mhd_head, ~(0)),
+ DEFINE_PROP_STRING("mhd-state_file", CXLMHSLDState, mhd_state_file),
+ DEFINE_PROP_BOOL("mhd-init", CXLMHSLDState, mhd_init, false),
+ DEFINE_PROP_END_OF_LIST(),
+};
+
+static int cxl_mhsld_state_open(const char *filename, int flags)
+{
+ char name[128];
+ snprintf(name, sizeof(name), "/%s", filename);
+ return shm_open(name, flags, 0666);
+}
+
+static int cxl_mhsld_state_unlink(const char *filename)
+{
+ char name[128];
+ snprintf(name, sizeof(name), "/%s", filename);
+ return shm_unlink(name);
+}
+
+static int cxl_mhsld_state_create(const char *filename, size_t size)
+{
+ int fd, rc;
+
+ fd = cxl_mhsld_state_open(filename, O_RDWR | O_CREAT);
+ if (fd == -1) {
+ return -1;
+ }
+
+ rc = ftruncate(fd, size);
+
+ if (rc) {
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+static bool cxl_mhsld_state_set(CXLMHSLDState *s, size_t block_start,
+ size_t block_count)
+{
+ uint8_t prev, val, *block;
+ size_t i;
+
+ val = (1 << s->mhd_head);
+
+ /*
+ * Try to claim all extents from start -> start + count;
+ * break early if a claimed extent is encountered
+ */
+ for (i = 0; i < block_count; ++i) {
+ block = &s->mhd_state->blocks[block_start + i];
+ prev = __sync_val_compare_and_swap(block, 0, val);
+ if (prev != 0) {
+ break;
+ }
+ }
+
+ if (prev == 0) {
+ return true;
+ }
+
+ /* Roll back incomplete claims */
+ for (;; --i) {
+ block = &s->mhd_state->blocks[block_start + i];
+ __sync_fetch_and_and(block, ~(1u << s->mhd_head));
+ if (i == 0) {
+ break;
+ }
+ }
+
+ return false;
+}
+
+static void cxl_mhsld_state_clear(CXLMHSLDState *s, size_t block_start,
+ size_t block_count)
+{
+ size_t i;
+ uint8_t *block;
+
+ for (i = 0; i < block_count; ++i) {
+ block = &s->mhd_state->blocks[block_start + i];
+ __sync_fetch_and_and(block, ~(1u << s->mhd_head));
+ }
+}
+
+static void cxl_mhsld_state_initialize(CXLMHSLDState *s, size_t dc_size)
+{
+ if (!s->mhd_init) {
+ cxl_mhsld_state_clear(s, 0, dc_size / MHSLD_BLOCK_SZ);
+ return;
+ }
+
+ memset(s->mhd_state, 0, s->mhd_state_size);
+ s->mhd_state->nr_heads = MHSLD_HEADS;
+ s->mhd_state->nr_lds = MHSLD_HEADS;
+ s->mhd_state->nr_blocks = dc_size / MHSLD_BLOCK_SZ;
+}
+
+/* Returns starting index of region in MHD map. */
+static inline size_t cxl_mhsld_find_dc_region_start(PCIDevice *d,
+ CXLDCRegion *r)
+{
+ CXLType3Dev *dcd = CXL_TYPE3(d);
+ size_t start = 0;
+ uint8_t rid;
+
+ for (rid = 0; rid < dcd->dc.num_regions; ++rid) {
+ if (&dcd->dc.regions[rid] == r) {
+ break;
+ }
+ start += dcd->dc.regions[rid].len / dcd->dc.regions[rid].block_size;
+ }
+
+ return start;
+}
+
+static MHSLDSharedState *cxl_mhsld_state_map(CXLMHSLDState *s)
+{
+ void *map;
+ size_t size = s->mhd_state_size;
+ int fd = s->mhd_state_fd;
+
+ if (fd < 0) {
+ return NULL;
+ }
+
+ map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (map == MAP_FAILED) {
+ return NULL;
+ }
+
+ return (MHSLDSharedState *)map;
+}
+
+/*
+ * Triggered during an add_capacity command to a CXL device:
+ * takes a list of extent records and preallocates them,
+ * in anticipation of a "dcd accept" response from the host.
+ *
+ * Extents that are not accepted by the host will be rolled
+ * back later.
+ */
+static bool cxl_mhsld_reserve_extents(PCIDevice *d,
+ CxlDynamicCapacityExtentList *records,
+ uint8_t rid)
+{
+ uint64_t len, dpa;
+ bool rc;
+
+ CXLMHSLDState *s = CXL_MHSLD(d);
+ CxlDynamicCapacityExtentList *list = records, *rollback = NULL;
+
+ CXLType3Dev *ct3d = CXL_TYPE3(d);
+ CXLDCRegion *region = &ct3d->dc.regions[rid];
+
+ for (; list; list = list->next) {
+ len = list->value->len / MHSLD_BLOCK_SZ;
+ dpa = (list->value->offset + region->base) / MHSLD_BLOCK_SZ;
+
+ rc = cxl_mhsld_state_set(s, dpa, len);
+
+ if (!rc) {
+ rollback = records;
+ break;
+ }
+ }
+
+ /* Setting the mhd state failed. Roll back the extents that were added */
+ for (; rollback; rollback = rollback->next) {
+ len = rollback->value->len / MHSLD_BLOCK_SZ;
+ dpa = (list->value->offset + region->base) / MHSLD_BLOCK_SZ;
+
+ cxl_mhsld_state_clear(s, dpa, len);
+
+ if (rollback == list) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool cxl_mhsld_reclaim_extents(PCIDevice *d,
+ CXLDCExtentGroupList *ext_groups,
+ CXLUpdateDCExtentListInPl *in)
+{
+ CXLMHSLDState *s = CXL_MHSLD(d);
+ CXLType3Dev *ct3d = CXL_TYPE3(d);
+ CXLDCExtentGroup *ext_group = QTAILQ_FIRST(ext_groups);
+ CXLDCExtent *ent;
+ CXLDCRegion *region;
+ g_autofree unsigned long *blk_bitmap = NULL;
+ uint64_t dpa, off, len, size, i;
+
+ /* Get the DCD region via the first requested extent */
+ ent = QTAILQ_FIRST(&ext_group->list);
+ dpa = ent->start_dpa;
+ len = ent->len;
+ region = cxl_find_dc_region(ct3d, dpa, len);
+ size = region->len / MHSLD_BLOCK_SZ;
+ blk_bitmap = bitmap_new(size);
+
+ /* Set all requested extents to 1 in a bitmap */
+ QTAILQ_FOREACH(ent, &ext_group->list, node) {
+ off = ent->start_dpa - region->base;
+ len = ent->len;
+ bitmap_set(blk_bitmap, off / MHSLD_BLOCK_SZ, len / MHSLD_BLOCK_SZ);
+ }
+
+ /* Clear bits associated with accepted extents */
+ for (i = 0; i < in->num_entries_updated; i++) {
+ off = in->updated_entries[i].start_dpa - region->base;
+ len = in->updated_entries[i].len;
+ bitmap_clear(blk_bitmap, off / MHSLD_BLOCK_SZ, len / MHSLD_BLOCK_SZ);
+ }
+
+ /*
+ * Reclaim only the extents that belong to unaccepted extents,
+ * i.e. those whose bits are still raised in blk_bitmap
+ */
+ for (off = find_first_bit(blk_bitmap, size); off < size;) {
+ len = find_next_zero_bit(blk_bitmap, size, off) - off;
+ cxl_mhsld_state_clear(s, off, len);
+ off = find_next_bit(blk_bitmap, size, off + len);
+ }
+
+ return true;
+}
+
+static bool cxl_mhsld_release_extent(PCIDevice *d, uint64_t dpa, uint64_t len)
+{
+ cxl_mhsld_state_clear(CXL_MHSLD(d), dpa / MHSLD_BLOCK_SZ,
+ len / MHSLD_BLOCK_SZ);
+ return true;
+}
+
+static bool cxl_mhsld_access_valid(PCIDevice *d, uint64_t addr,
+ unsigned int size)
+{
+ CXLType3Dev *ct3d = CXL_TYPE3(d);
+ CXLMHSLDState *s = CXL_MHSLD(d);
+ CXLDCRegion *r = cxl_find_dc_region(ct3d, addr, size);
+ size_t i;
+
+ addr = addr / r->block_size;
+ size = size / r->block_size;
+
+ for (i = 0; i < size; ++i) {
+ if (s->mhd_state->blocks[addr + i] != (1 << s->mhd_head)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void cxl_mhsld_realize(PCIDevice *pci_dev, Error **errp)
+{
+ CXLMHSLDState *s = CXL_MHSLD(pci_dev);
+ MemoryRegion *mr;
+ int fd = -1;
+ size_t dc_size;
+
+ ct3_realize(pci_dev, errp);
+
+ /* Get number of blocks from dcd size */
+ mr = host_memory_backend_get_memory(s->ct3d.dc.host_dc);
+ if (!mr) {
+ return;
+ }
+ dc_size = memory_region_size(mr);
+ if (!dc_size) {
+ error_setg(errp, "MHSLD does not have dynamic capacity to manage");
+ return;
+ }
+
+ s->mhd_state_size = (dc_size / MHSLD_BLOCK_SZ) + sizeof(MHSLDSharedState);
+
+ /* Sanity check the head idx */
+ if (s->mhd_head >= MHSLD_HEADS) {
+ error_setg(errp, "MHD Head ID must be between 0-7");
+ return;
+ }
+
+ /* Create the state file if this is the 'mhd_init' instance */
+ if (s->mhd_init) {
+ fd = cxl_mhsld_state_create(s->mhd_state_file, s->mhd_state_size);
+ } else {
+ fd = cxl_mhsld_state_open(s->mhd_state_file, O_RDWR);
+ }
+
+ if (fd < 0) {
+ error_setg(errp, "failed to open mhsld state errno %d", errno);
+ return;
+ }
+
+ s->mhd_state_fd = fd;
+
+ /* Map the state and initialize it as needed */
+ s->mhd_state = cxl_mhsld_state_map(s);
+ if (!s->mhd_state) {
+ error_setg(errp, "Failed to mmap mhd state file");
+ close(fd);
+ cxl_mhsld_state_unlink(s->mhd_state_file);
+ return;
+ }
+
+ cxl_mhsld_state_initialize(s, dc_size);
+
+ /* Set the LD ownership for this head to this system */
+ s->mhd_state->ldmap[s->mhd_head] = s->mhd_head;
+ return;
+}
+
+
+static void cxl_mhsld_exit(PCIDevice *pci_dev)
+{
+ CXLMHSLDState *s = CXL_MHSLD(pci_dev);
+
+ ct3_exit(pci_dev);
+
+ if (s->mhd_state_fd) {
+ munmap(s->mhd_state, s->mhd_state_size);
+ close(s->mhd_state_fd);
+ cxl_mhsld_state_unlink(s->mhd_state_file);
+ s->mhd_state = NULL;
+ }
+}
+
+static void cxl_mhsld_reset(DeviceState *d)
+{
+ CXLMHSLDState *s = CXL_MHSLD(d);
+
+ ct3d_reset(d);
+ cxl_add_cci_commands(&s->ct3d.cci, cxl_cmd_set_mhsld, 512);
+
+ cxl_mhsld_state_clear(s, 0, s->mhd_state->nr_blocks);
+}
+
+/*
+ * Example: DCD-add events need to validate that the requested extent
+ * does not already have a mapping (or, if it does, it is
+ * a shared extent with the right tagging).
+ *
+ * Since this operates on the shared state, we will need to serialize
+ * these callbacks across QEMU instances via a mutex in shared state.
+ */
+
+static void cxl_mhsld_class_init(ObjectClass *klass, void *data)
+{
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ PCIDeviceClass *pc = PCI_DEVICE_CLASS(klass);
+
+ pc->realize = cxl_mhsld_realize;
+ pc->exit = cxl_mhsld_exit;
+ device_class_set_legacy_reset(dc, cxl_mhsld_reset);
+ device_class_set_props(dc, cxl_mhsld_props);
+
+ CXLType3Class *cvc = CXL_TYPE3_CLASS(klass);
+ cvc->mhd_get_info = cmd_mhd_get_info;
+ cvc->mhd_access_valid = cxl_mhsld_access_valid;
+ cvc->mhd_reserve_extents = cxl_mhsld_reserve_extents;
+ cvc->mhd_reclaim_extents = cxl_mhsld_reclaim_extents;
+ cvc->mhd_release_extent = cxl_mhsld_release_extent;
+}
+
+static const TypeInfo cxl_mhsld_info = {
+ .name = TYPE_CXL_MHSLD,
+ .parent = TYPE_CXL_TYPE3,
+ .class_size = sizeof(struct CXLMHSLDClass),
+ .class_init = cxl_mhsld_class_init,
+ .instance_size = sizeof(CXLMHSLDState),
+ .interfaces = (InterfaceInfo[]) {
+ { INTERFACE_CXL_DEVICE },
+ { INTERFACE_PCIE_DEVICE },
+ {}
+ },
+};
+
+static void cxl_mhsld_register_types(void)
+{
+ type_register_static(&cxl_mhsld_info);
+}
+
+type_init(cxl_mhsld_register_types)
diff --git a/hw/cxl/mhsld/mhsld.h b/hw/cxl/mhsld/mhsld.h
new file mode 100644
index 0000000000..e7ead1f0d2
--- /dev/null
+++ b/hw/cxl/mhsld/mhsld.h
@@ -0,0 +1,75 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (c) 2024 MemVerge Inc.
+ *
+ */
+
+#ifndef CXL_MHSLD_H
+#define CXL_MHSLD_H
+#include <stdint.h>
+#include "hw/cxl/cxl.h"
+#include "hw/cxl/cxl_mailbox.h"
+#include "hw/cxl/cxl_device.h"
+#include "qemu/units.h"
+
+#define MHSLD_BLOCK_SZ (2 * MiB)
+
+/*
+ * We limit the number of heads to prevent the shared state
+ * region from becoming a major memory hog. We need 512MB of
+ * memory space to track 8-host ownership of 4GB of memory in
+ * blocks of 2MB. This can change if the block size is increased.
+ */
+#define MHSLD_HEADS (8)
+
+/*
+ * The shared state cannot have 2 variable sized regions
+ * so we have to max out the ldmap.
+ */
+typedef struct MHSLDSharedState {
+ uint8_t nr_heads;
+ uint8_t nr_lds;
+ uint8_t ldmap[MHSLD_HEADS];
+ uint64_t nr_blocks;
+ uint8_t blocks[];
+} MHSLDSharedState;
+
+struct CXLMHSLDState {
+ CXLType3Dev ct3d;
+ bool mhd_init;
+ char *mhd_state_file;
+ int mhd_state_fd;
+ size_t mhd_state_size;
+ uint32_t mhd_head;
+ MHSLDSharedState *mhd_state;
+};
+
+struct CXLMHSLDClass {
+ CXLType3Class parent_class;
+};
+
+enum {
+ MHSLD_MHD = 0x55,
+ #define GET_MHD_INFO 0x0
+};
+
+/*
+ * MHD Get Info Command
+ * Returns information the LD's associated with this head
+ */
+typedef struct MHDGetInfoInput {
+ uint8_t start_ld;
+ uint8_t ldmap_len;
+} QEMU_PACKED MHDGetInfoInput;
+
+typedef struct MHDGetInfoOutput {
+ uint8_t nr_lds;
+ uint8_t nr_heads;
+ uint16_t resv1;
+ uint8_t start_ld;
+ uint8_t ldmap_len;
+ uint16_t resv2;
+ uint8_t ldmap[];
+} QEMU_PACKED MHDGetInfoOutput;
+#endif
--
2.43.0
On Fri, 18 Oct 2024 12:12:52 -0400 Gregory Price <gourry@gourry.net> wrote: > From: Svetly Todorov <svetly.todorov@memverge.com> > > Using a shared-memory bytemap, validates that DC adds, releases, > and reclamations happen on extents belonging to the appropriate > host. > I'll carry this on my staging tree for now gitlab.com/jic23/qemu (latest date branch). I'll push out a bit later today. Few rebase related changes were needed. I've not tested it yet. Also I have dropped the previous prototype device Niagara on basis this should give people a better platform to work with. Jonathan
On Fri, Jan 24, 2025 at 02:12:46PM +0000, Jonathan Cameron wrote: > On Fri, 18 Oct 2024 12:12:52 -0400 > Gregory Price <gourry@gourry.net> wrote: > > > From: Svetly Todorov <svetly.todorov@memverge.com> > > > > Using a shared-memory bytemap, validates that DC adds, releases, > > and reclamations happen on extents belonging to the appropriate > > host. > > > I'll carry this on my staging tree for now gitlab.com/jic23/qemu (latest date branch). > I'll push out a bit later today. > Few rebase related changes were needed. I've not tested it yet. > > Also I have dropped the previous prototype device Niagara on basis this > should give people a better platform to work with. > Agreed. Niagara was a good step to get here. Let me know how I can help. ~Gregory
On Fri, 18 Oct 2024 12:12:52 -0400
Gregory Price <gourry@gourry.net> wrote:
> From: Svetly Todorov <svetly.todorov@memverge.com>
>
> Using a shared-memory bytemap, validates that DC adds, releases,
> and reclamations happen on extents belonging to the appropriate
> host.
>
> The MHSLD device inherits from the CXL_TYPE3 class and adds the following
> configuration options:
As in previous the -- syntax is somewhat confusing as that's
not how the parameters are applied.
> --mhd-head=<u32>
> --mhd-state_file=<str>
> --mhd-init=<bool>
>
> --mhd-head specifies the head ID of the host on the given device.
>
> --mhd-state_file is the name of the shared-memory-backed file used
> to store the MHD state.
>
> --mhd-init indicates whether this QEMU instance should initialize
> the state_file; if so, the instance will create the file if it does
> not exist, ftruncate it to the appropriate size, and initialize its
> header. It is assumed that the --mhd-init instance is run and allowed
> to completely finish configuration before any other guests access the
> shared state.
>
> The shared state file only needs to be intialized once. Even if a guest
> dies without clearing the ownership bits associated with its head-ID,
> future guests with that ID will clear those bits in cxl_mhsld_realize(),
> regardless of whether mhd_init is true or false.
That sounds like a race condition if not all hosts are brought
up before the first add.
>
> The following command line options create an MHSLD with 4GB of
> backing memory, whose state is tracked in /dev/shm/mhd_metadata.
> --mhd-init=true tells this instance to initialize the state as
> described above.
>
> ./qemu-system_x86-64 \
> [... other options ...] \
> -device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \
> -device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \
> -object memory-backend-ram,id=mem0,size=4G \
> -device cxl-mhsld,bus=rp0,num-dc-regions=1,volatile-dc-memdev=mem0,id=cxl-mem0,sn=66667,mhd-head=0,mhd-state_file=mhd_metadata,mhd-init=true \
> -M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G \
> -qmp unix:/tmp/qmp-sock-1,server,nowait
>
> Once this guest completes setup, other guests looking to access the
> device can be booted with the same configuration options, but with
> --mhd-head != 0,
> --mhd-init=false,
> and a different QMP socket.
>
> Signed-off-by: Gregory Price <gourry@gourry.net>
> Signed-off-by: Svetly Todorov <svetly.todorov@memverge.com>
A few trivial things inline.
In general the scheme looks workable but I'm not sure the contraints at setup time
etc are suitable for an upstream solution. Certainly a useful tool to have
for kernel development though so I'll try and find time in next few days to apply
this on my gitlab tree.
Longer term I think we need a more complex external program or a main / proxy
type arrangement so that ordering requirements can be enforce and we can have
richer info. Having to chat to each qmp interface independently works fine is
also a bit more complex than I think we would eventually want.
Having a solution in place though will make it much easier to move towards
an eventual upstreamable solution. This is a great place to start from!
Jonathan
> +static bool cxl_mhsld_release_extent(PCIDevice *d, uint64_t dpa, uint64_t len)
> +{
> + cxl_mhsld_state_clear(CXL_MHSLD(d), dpa / MHSLD_BLOCK_SZ,
> + len / MHSLD_BLOCK_SZ);
Trivial but align after (
> + return true;
> +}
> diff --git a/hw/cxl/mhsld/mhsld.h b/hw/cxl/mhsld/mhsld.h
> new file mode 100644
> index 0000000000..e7ead1f0d2
> --- /dev/null
> +++ b/hw/cxl/mhsld/mhsld.h
> @@ -0,0 +1,75 @@
> +/*
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * Copyright (c) 2024 MemVerge Inc.
> + *
> + */
> +
> +#ifndef CXL_MHSLD_H
> +#define CXL_MHSLD_H
> +#include <stdint.h>
> +#include "hw/cxl/cxl.h"
> +#include "hw/cxl/cxl_mailbox.h"
> +#include "hw/cxl/cxl_device.h"
> +#include "qemu/units.h"
> +
> +#define MHSLD_BLOCK_SZ (2 * MiB)
> +
> +/*
> + * We limit the number of heads to prevent the shared state
> + * region from becoming a major memory hog. We need 512MB of
> + * memory space to track 8-host ownership of 4GB of memory in
> + * blocks of 2MB. This can change if the block size is increased.
I'm lost what makes up that size?
> + */
> +#define MHSLD_HEADS (8)
> +
> +/*
> + * The shared state cannot have 2 variable sized regions
> + * so we have to max out the ldmap.
> + */
> +typedef struct MHSLDSharedState {
> + uint8_t nr_heads;
> + uint8_t nr_lds;
> + uint8_t ldmap[MHSLD_HEADS];
> + uint64_t nr_blocks;
> + uint8_t blocks[];
> +} MHSLDSharedState;
> +
> +struct CXLMHSLDState {
> + CXLType3Dev ct3d;
> + bool mhd_init;
> + char *mhd_state_file;
> + int mhd_state_fd;
> + size_t mhd_state_size;
> + uint32_t mhd_head;
> + MHSLDSharedState *mhd_state;
> +};
> +
> +struct CXLMHSLDClass {
> + CXLType3Class parent_class;
> +};
> +
> +enum {
> + MHSLD_MHD = 0x55,
> + #define GET_MHD_INFO 0x0
> +};
> +
> +/*
> + * MHD Get Info Command
> + * Returns information the LD's associated with this head
> + */
> +typedef struct MHDGetInfoInput {
> + uint8_t start_ld;
> + uint8_t ldmap_len;
> +} QEMU_PACKED MHDGetInfoInput;
> +
> +typedef struct MHDGetInfoOutput {
> + uint8_t nr_lds;
> + uint8_t nr_heads;
> + uint16_t resv1;
> + uint8_t start_ld;
> + uint8_t ldmap_len;
> + uint16_t resv2;
> + uint8_t ldmap[];
> +} QEMU_PACKED MHDGetInfoOutput;
> +#endif
On Thu, Dec 12, 2024 at 05:40:16PM +0000, Jonathan Cameron via wrote: > On Fri, 18 Oct 2024 12:12:52 -0400 > Gregory Price <gourry@gourry.net> wrote: > > > From: Svetly Todorov <svetly.todorov@memverge.com> > > > > The shared state file only needs to be intialized once. Even if a guest > > dies without clearing the ownership bits associated with its head-ID, > > future guests with that ID will clear those bits in cxl_mhsld_realize(), > > regardless of whether mhd_init is true or false. > > That sounds like a race condition if not all hosts are brought > up before the first add. > We weighed this against having to do an external setup like # SHMID = ./create_sharedmem.sh # ./launch_qemu --shmid=$SHMID Which is what the original non-generalized prototype did. So yeah, there's a race condition AND a footgun (setting init AFTER qemu instances are already using the memory will blow the state away). This was intended. As you allude to in the next chunk - the only real way to get around this is to have an entirely external process serialize access. > > > > The following command line options create an MHSLD with 4GB of > > backing memory, whose state is tracked in /dev/shm/mhd_metadata. > > --mhd-init=true tells this instance to initialize the state as > > described above. > > > > ./qemu-system_x86-64 \ > > [... other options ...] \ > > -device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \ > > -device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \ > > -object memory-backend-ram,id=mem0,size=4G \ > > -device cxl-mhsld,bus=rp0,num-dc-regions=1,volatile-dc-memdev=mem0,id=cxl-mem0,sn=66667,mhd-head=0,mhd-state_file=mhd_metadata,mhd-init=true \ > > -M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G \ > > -qmp unix:/tmp/qmp-sock-1,server,nowait > > > > Once this guest completes setup, other guests looking to access the > > device can be booted with the same configuration options, but with > > --mhd-head != 0, > > --mhd-init=false, > > and a different QMP socket. > > > > Signed-off-by: Gregory Price <gourry@gourry.net> > > Signed-off-by: Svetly Todorov <svetly.todorov@memverge.com> > > A few trivial things inline. > > In general the scheme looks workable but I'm not sure the contraints at setup time > etc are suitable for an upstream solution. Certainly a useful tool to have > for kernel development though so I'll try and find time in next few days to apply > this on my gitlab tree. > > Longer term I think we need a more complex external program or a main / proxy > type arrangement so that ordering requirements can be enforce I marginally disagree. We have to check ownership during memory use. We should try not to have an external process dependency for deferencing a pointer backed by an emulated DCD device. The current overhead is bad enough. The shared memory use here mostly limits that overhead to cache invalidation, and keeps the entire system fairly simple. All this is to say - we err'd on the side of keeping it simple, even if it has a few stupid footguns. Obviously open to ideas, though. > and we can have > richer info. Having to chat to each qmp interface independently works fine is > also a bit more complex than I think we would eventually want. > This is a small component in someone's fabric manager that translates their requests into QMP commands. Whatever we ultimately decide on, the complexity here is about the same. > Having a solution in place though will make it much easier to move towards > an eventual upstreamable solution. This is a great place to start from! > > Jonathan > > > +/* > > + * We limit the number of heads to prevent the shared state > > + * region from becoming a major memory hog. We need 512MB of > > + * memory space to track 8-host ownership of 4GB of memory in > > + * blocks of 2MB. This can change if the block size is increased. > > I'm lost what makes up that size? > I think the math is wrong here, we may have calculated based on a larger capacity. I need to go back and look at how we came to 512MB. ~Gregory
On Thu, 12 Dec 2024 14:52:02 -0500 Gregory Price <gourry@gourry.net> wrote: > On Thu, Dec 12, 2024 at 05:40:16PM +0000, Jonathan Cameron via wrote: > > On Fri, 18 Oct 2024 12:12:52 -0400 > > Gregory Price <gourry@gourry.net> wrote: > > > > > From: Svetly Todorov <svetly.todorov@memverge.com> > > > > > > The shared state file only needs to be intialized once. Even if a guest > > > dies without clearing the ownership bits associated with its head-ID, > > > future guests with that ID will clear those bits in cxl_mhsld_realize(), > > > regardless of whether mhd_init is true or false. > > > > That sounds like a race condition if not all hosts are brought > > up before the first add. > > > > We weighed this against having to do an external setup like > > # SHMID = ./create_sharedmem.sh > # ./launch_qemu --shmid=$SHMID > > Which is what the original non-generalized prototype did. > > So yeah, there's a race condition AND a footgun (setting init AFTER > qemu instances are already using the memory will blow the state away). > > This was intended. > > As you allude to in the next chunk - the only real way to get around > this is to have an entirely external process serialize access. > > > > > > > The following command line options create an MHSLD with 4GB of > > > backing memory, whose state is tracked in /dev/shm/mhd_metadata. > > > --mhd-init=true tells this instance to initialize the state as > > > described above. > > > > > > ./qemu-system_x86-64 \ > > > [... other options ...] \ > > > -device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \ > > > -device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \ > > > -object memory-backend-ram,id=mem0,size=4G \ > > > -device cxl-mhsld,bus=rp0,num-dc-regions=1,volatile-dc-memdev=mem0,id=cxl-mem0,sn=66667,mhd-head=0,mhd-state_file=mhd_metadata,mhd-init=true \ > > > -M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G \ > > > -qmp unix:/tmp/qmp-sock-1,server,nowait > > > > > > Once this guest completes setup, other guests looking to access the > > > device can be booted with the same configuration options, but with > > > --mhd-head != 0, > > > --mhd-init=false, > > > and a different QMP socket. > > > > > > Signed-off-by: Gregory Price <gourry@gourry.net> > > > Signed-off-by: Svetly Todorov <svetly.todorov@memverge.com> > > > > A few trivial things inline. > > > > In general the scheme looks workable but I'm not sure the contraints at setup time > > etc are suitable for an upstream solution. Certainly a useful tool to have > > for kernel development though so I'll try and find time in next few days to apply > > this on my gitlab tree. > > > > Longer term I think we need a more complex external program or a main / proxy > > type arrangement so that ordering requirements can be enforce > > I marginally disagree. We have to check ownership during memory use. > > We should try not to have an external process dependency for deferencing a > pointer backed by an emulated DCD device. The current overhead is bad enough. > > The shared memory use here mostly limits that overhead to cache invalidation, > and keeps the entire system fairly simple. > > All this is to say - we err'd on the side of keeping it simple, even if it > has a few stupid footguns. Obviously open to ideas, though. Need to mess around with it, but I don't in general think each host needs to see the global protection map - they just need to know what they can access. I don't think anyone is going to do 2MiB granularity on a real device (or at least not for bulk capacity) so the necessary data may not be that large. Obviously the cost of updating it will be higher, but meh, that's not that common an operation. > > > and we can have > > richer info. Having to chat to each qmp interface independently works fine is > > also a bit more complex than I think we would eventually want. > > > > This is a small component in someone's fabric manager that translates their > requests into QMP commands. Whatever we ultimately decide on, the complexity > here is about the same. It's the qmp commands to a bunch of different qemu instances pretending to be one Multi Head device. Sure we can insert a proxy to act as that one, but then we more or less have an external process mediating that. > > > Having a solution in place though will make it much easier to move towards > > an eventual upstreamable solution. This is a great place to start from! > > > > Jonathan > > > > > +/* > > > + * We limit the number of heads to prevent the shared state > > > + * region from becoming a major memory hog. We need 512MB of > > > + * memory space to track 8-host ownership of 4GB of memory in > > > + * blocks of 2MB. This can change if the block size is increased. > > > > I'm lost what makes up that size? > > > > I think the math is wrong here, we may have calculated based on > a larger capacity. I need to go back and look at how we came to 512MB. 4TB maybe? Jonathan > > ~Gregory
On Fri, Dec 13, 2024 at 04:18:36PM +0000, Jonathan Cameron wrote: > On Thu, 12 Dec 2024 14:52:02 -0500 > Gregory Price <gourry@gourry.net> wrote: > > I don't think anyone is going to do 2MiB granularity on a real device > (or at least not for bulk capacity) so the necessary data may not be that large. > Obviously the cost of updating it will be higher, but meh, that's not that > common an operation. > > > > > > and we can have > > > richer info. Having to chat to each qmp interface independently works fine is > > > also a bit more complex than I think we would eventually want. > > > > > > > This is a small component in someone's fabric manager that translates their > > requests into QMP commands. Whatever we ultimately decide on, the complexity > > here is about the same. > > It's the qmp commands to a bunch of different qemu instances pretending to be one > Multi Head device. > Sure we can insert a proxy to act as that one, but then we more or less > have an external process mediating that. > I'm not against changing this, I just think it's going to increase complexity for little value. I personally like the idea of keeping it the way it is and slapping a small proxy on top that emulates the "out of band" interface to the device - but all it does behind the scenes is send out a QMP command (that already exists) to a guest that registered itself with the proxy. This separates the complexity of implementing FMAPI functionality and the complexity of how an individual instance of QEMU go about handling add/access/remove semantics for DCD events. The annoying thought experiment here is when someone wants their in-band host interface to have access to the FMAPI great now i've made myself grumpy > > > > > Having a solution in place though will make it much easier to move towards > > > an eventual upstreamable solution. This is a great place to start from! > > > > > > Jonathan > > > > > > > +/* > > > > + * We limit the number of heads to prevent the shared state > > > > + * region from becoming a major memory hog. We need 512MB of > > > > + * memory space to track 8-host ownership of 4GB of memory in > > > > + * blocks of 2MB. This can change if the block size is increased. > > > > > > I'm lost what makes up that size? > > > > > > > I think the math is wrong here, we may have calculated based on > > a larger capacity. I need to go back and look at how we came to 512MB. > 4TB maybe? > > Jonathan Yeah that's probably what it was. I think that's what I was basing it off of at the time. ~Gregory
© 2016 - 2026 Red Hat, Inc.