[v3] cxl: Multi-headed Single Logical Device (MHSLD)

[PATCH RFC v3 3/3] mhsld: implement MHSLD device
Posted by Gregory Price 4 days, 6 hours ago
From: Svetly Todorov <svetly.todorov@memverge.com>

Using a shared-memory bytemap, validates that DC adds, releases,
and reclamations happen on extents belonging to the appropriate
host.

The MHSLD device inherits from the CXL_TYPE3 class and adds the following
configuration options:
--mhd-head=<u32>
--mhd-state_file=<str>
--mhd-init=<bool>

--mhd-head specifies the head ID of the host on the given device.

--mhd-state_file is the name of the shared-memory-backed file used
to store the MHD state.

--mhd-init indicates whether this QEMU instance should initialize
the state_file; if so, the instance will create the file if it does
not exist, ftruncate it to the appropriate size, and initialize its
header. It is assumed that the --mhd-init instance is run and allowed
to completely finish configuration before any other guests access the
shared state.

The shared state file only needs to be intialized once. Even if a guest
dies without clearing the ownership bits associated with its head-ID,
future guests with that ID will clear those bits in cxl_mhsld_realize(),
regardless of whether mhd_init is true or false.

The following command line options create an MHSLD with 4GB of
backing memory, whose state is tracked in /dev/shm/mhd_metadata.
--mhd-init=true tells this instance to initialize the state as
described above.

./qemu-system_x86-64 \
[... other options ...] \
-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \
-device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \
-object memory-backend-ram,id=mem0,size=4G \
-device cxl-mhsld,bus=rp0,num-dc-regions=1,volatile-dc-memdev=mem0,id=cxl-mem0,sn=66667,mhd-head=0,mhd-state_file=mhd_metadata,mhd-init=true \
-M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G \
-qmp unix:/tmp/qmp-sock-1,server,nowait

Once this guest completes setup, other guests looking to access the
device can be booted with the same configuration options, but with
--mhd-head != 0,
--mhd-init=false,
and a different QMP socket.

Signed-off-by: Gregory Price <gourry@gourry.net>
Signed-off-by: Svetly Todorov <svetly.todorov@memverge.com>
---
 hw/cxl/Kconfig           |   1 +
 hw/cxl/meson.build       |   1 +
 hw/cxl/mhsld/Kconfig     |   4 +
 hw/cxl/mhsld/meson.build |   3 +
 hw/cxl/mhsld/mhsld.c     | 456 +++++++++++++++++++++++++++++++++++++++
 hw/cxl/mhsld/mhsld.h     |  75 +++++++
 6 files changed, 540 insertions(+)
 create mode 100644 hw/cxl/mhsld/Kconfig
 create mode 100644 hw/cxl/mhsld/meson.build
 create mode 100644 hw/cxl/mhsld/mhsld.c
 create mode 100644 hw/cxl/mhsld/mhsld.h

diff --git a/hw/cxl/Kconfig b/hw/cxl/Kconfig
index e603839a62..919e59b598 100644
--- a/hw/cxl/Kconfig
+++ b/hw/cxl/Kconfig
@@ -1,3 +1,4 @@
+source mhsld/Kconfig
 source vendor/Kconfig
 
 config CXL
diff --git a/hw/cxl/meson.build b/hw/cxl/meson.build
index e8c8c1355a..394750dd19 100644
--- a/hw/cxl/meson.build
+++ b/hw/cxl/meson.build
@@ -16,4 +16,5 @@ system_ss.add(when: 'CONFIG_I2C_MCTP_CXL', if_true: files('i2c_mctp_cxl.c'))
 
 system_ss.add(when: 'CONFIG_ALL', if_true: files('cxl-host-stubs.c'))
 
+subdir('mhsld')
 subdir('vendor')
diff --git a/hw/cxl/mhsld/Kconfig b/hw/cxl/mhsld/Kconfig
new file mode 100644
index 0000000000..dc2be15140
--- /dev/null
+++ b/hw/cxl/mhsld/Kconfig
@@ -0,0 +1,4 @@
+config CXL_MHSLD
+    bool
+    depends on CXL_MEM_DEVICE
+    default y
diff --git a/hw/cxl/mhsld/meson.build b/hw/cxl/mhsld/meson.build
new file mode 100644
index 0000000000..c595558f8a
--- /dev/null
+++ b/hw/cxl/mhsld/meson.build
@@ -0,0 +1,3 @@
+if host_os == 'linux'
+	system_ss.add(when: 'CONFIG_CXL_MHSLD', if_true: files('mhsld.c',))
+endif
diff --git a/hw/cxl/mhsld/mhsld.c b/hw/cxl/mhsld/mhsld.c
new file mode 100644
index 0000000000..2a3023607e
--- /dev/null
+++ b/hw/cxl/mhsld/mhsld.c
@@ -0,0 +1,456 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (c) 2024 MemVerge Inc.
+ *
+ */
+
+#include <sys/file.h>
+#include "qemu/osdep.h"
+#include "qemu/bitmap.h"
+#include "hw/irq.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "hw/cxl/cxl.h"
+#include "hw/cxl/cxl_mailbox.h"
+#include "hw/cxl/cxl_device.h"
+#include "hw/pci/pcie.h"
+#include "hw/pci/pcie_port.h"
+#include "hw/qdev-properties.h"
+#include "sysemu/hostmem.h"
+#include "mhsld.h"
+
+#define TYPE_CXL_MHSLD "cxl-mhsld"
+OBJECT_DECLARE_TYPE(CXLMHSLDState, CXLMHSLDClass, CXL_MHSLD)
+
+/*
+ * CXL r3.0 section 7.6.7.5.1 - Get Multi-Headed Info (Opcode 5500h)
+ *
+ * This command retrieves the number of heads, number of supported LDs,
+ * and Head-to-LD mapping of a Multi-Headed device.
+ */
+static CXLRetCode cmd_mhd_get_info(const struct cxl_cmd *cmd,
+                                   uint8_t *payload_in, size_t len_in,
+                                   uint8_t *payload_out, size_t *len_out,
+                                   CXLCCI * cci)
+{
+    CXLMHSLDState *s = CXL_MHSLD(cci->d);
+    MHDGetInfoInput *input = (void *)payload_in;
+    MHDGetInfoOutput *output = (void *)payload_out;
+
+    uint8_t start_ld = input->start_ld;
+    uint8_t ldmap_len = input->ldmap_len;
+    uint8_t i;
+
+    if (start_ld >= s->mhd_state->nr_lds) {
+        return CXL_MBOX_INVALID_INPUT;
+    }
+
+    output->nr_lds = s->mhd_state->nr_lds;
+    output->nr_heads = s->mhd_state->nr_heads;
+    output->resv1 = 0;
+    output->start_ld = start_ld;
+    output->resv2 = 0;
+
+    for (i = 0; i < ldmap_len && (start_ld + i) < output->nr_lds; i++) {
+        output->ldmap[i] = s->mhd_state->ldmap[start_ld + i];
+    }
+    output->ldmap_len = i;
+
+    *len_out = sizeof(*output) + output->ldmap_len;
+    return CXL_MBOX_SUCCESS;
+}
+
+static const struct cxl_cmd cxl_cmd_set_mhsld[256][256] = {
+    [MHSLD_MHD][GET_MHD_INFO] = {"GET_MULTI_HEADED_INFO",
+        cmd_mhd_get_info, 2, 0},
+};
+
+static Property cxl_mhsld_props[] = {
+    DEFINE_PROP_UINT32("mhd-head", CXLMHSLDState, mhd_head, ~(0)),
+    DEFINE_PROP_STRING("mhd-state_file", CXLMHSLDState, mhd_state_file),
+    DEFINE_PROP_BOOL("mhd-init", CXLMHSLDState, mhd_init, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static int cxl_mhsld_state_open(const char *filename, int flags)
+{
+    char name[128];
+    snprintf(name, sizeof(name), "/%s", filename);
+    return shm_open(name, flags, 0666);
+}
+
+static int cxl_mhsld_state_unlink(const char *filename)
+{
+    char name[128];
+    snprintf(name, sizeof(name), "/%s", filename);
+    return shm_unlink(name);
+}
+
+static int cxl_mhsld_state_create(const char *filename, size_t size)
+{
+    int fd, rc;
+
+    fd = cxl_mhsld_state_open(filename, O_RDWR | O_CREAT);
+    if (fd == -1) {
+        return -1;
+    }
+
+    rc = ftruncate(fd, size);
+
+    if (rc) {
+        close(fd);
+        return -1;
+    }
+
+    return fd;
+}
+
+static bool cxl_mhsld_state_set(CXLMHSLDState *s, size_t block_start,
+                                size_t block_count)
+{
+    uint8_t prev, val, *block;
+    size_t i;
+
+    val = (1 << s->mhd_head);
+
+    /*
+     * Try to claim all extents from start -> start + count;
+     * break early if a claimed extent is encountered
+     */
+    for (i = 0; i < block_count; ++i) {
+        block = &s->mhd_state->blocks[block_start + i];
+        prev = __sync_val_compare_and_swap(block, 0, val);
+        if (prev != 0) {
+            break;
+        }
+    }
+
+    if (prev == 0) {
+        return true;
+    }
+
+    /* Roll back incomplete claims */
+    for (;; --i) {
+        block = &s->mhd_state->blocks[block_start + i];
+        __sync_fetch_and_and(block, ~(1u << s->mhd_head));
+        if (i == 0) {
+            break;
+        }
+    }
+
+    return false;
+}
+
+static void cxl_mhsld_state_clear(CXLMHSLDState *s, size_t block_start,
+                                  size_t block_count)
+{
+    size_t i;
+    uint8_t *block;
+
+    for (i = 0; i < block_count; ++i) {
+        block = &s->mhd_state->blocks[block_start + i];
+        __sync_fetch_and_and(block, ~(1u << s->mhd_head));
+    }
+}
+
+static void cxl_mhsld_state_initialize(CXLMHSLDState *s, size_t dc_size)
+{
+    if (!s->mhd_init) {
+        cxl_mhsld_state_clear(s, 0, dc_size / MHSLD_BLOCK_SZ);
+        return;
+    }
+
+    memset(s->mhd_state, 0, s->mhd_state_size);
+    s->mhd_state->nr_heads = MHSLD_HEADS;
+    s->mhd_state->nr_lds = MHSLD_HEADS;
+    s->mhd_state->nr_blocks = dc_size / MHSLD_BLOCK_SZ;
+}
+
+/* Returns starting index of region in MHD map. */
+static inline size_t cxl_mhsld_find_dc_region_start(PCIDevice *d,
+                                                    CXLDCRegion *r)
+{
+    CXLType3Dev *dcd = CXL_TYPE3(d);
+    size_t start = 0;
+    uint8_t rid;
+
+    for (rid = 0; rid < dcd->dc.num_regions; ++rid) {
+        if (&dcd->dc.regions[rid] == r) {
+            break;
+        }
+        start += dcd->dc.regions[rid].len / dcd->dc.regions[rid].block_size;
+    }
+
+    return start;
+}
+
+static MHSLDSharedState *cxl_mhsld_state_map(CXLMHSLDState *s)
+{
+    void *map;
+    size_t size = s->mhd_state_size;
+    int fd = s->mhd_state_fd;
+
+    if (fd < 0) {
+        return NULL;
+    }
+
+    map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED) {
+        return NULL;
+    }
+
+    return (MHSLDSharedState *)map;
+}
+
+/*
+ * Triggered during an add_capacity command to a CXL device:
+ * takes a list of extent records and preallocates them,
+ * in anticipation of a "dcd accept" response from the host.
+ *
+ * Extents that are not accepted by the host will be rolled
+ * back later.
+ */
+static bool cxl_mhsld_reserve_extents(PCIDevice *d,
+                                      CxlDynamicCapacityExtentList *records,
+                                      uint8_t rid)
+{
+    uint64_t len, dpa;
+    bool rc;
+
+    CXLMHSLDState *s = CXL_MHSLD(d);
+    CxlDynamicCapacityExtentList *list = records, *rollback = NULL;
+
+    CXLType3Dev *ct3d = CXL_TYPE3(d);
+    CXLDCRegion *region = &ct3d->dc.regions[rid];
+
+    for (; list; list = list->next) {
+        len = list->value->len / MHSLD_BLOCK_SZ;
+        dpa = (list->value->offset + region->base) / MHSLD_BLOCK_SZ;
+
+        rc = cxl_mhsld_state_set(s, dpa, len);
+
+        if (!rc) {
+            rollback = records;
+            break;
+        }
+    }
+
+    /* Setting the mhd state failed. Roll back the extents that were added */
+    for (; rollback; rollback = rollback->next) {
+        len = rollback->value->len / MHSLD_BLOCK_SZ;
+        dpa = (list->value->offset + region->base) / MHSLD_BLOCK_SZ;
+
+        cxl_mhsld_state_clear(s, dpa, len);
+
+        if (rollback == list) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool cxl_mhsld_reclaim_extents(PCIDevice *d,
+                                      CXLDCExtentGroupList *ext_groups,
+                                      CXLUpdateDCExtentListInPl *in)
+{
+    CXLMHSLDState *s = CXL_MHSLD(d);
+    CXLType3Dev *ct3d = CXL_TYPE3(d);
+    CXLDCExtentGroup *ext_group = QTAILQ_FIRST(ext_groups);
+    CXLDCExtent *ent;
+    CXLDCRegion *region;
+    g_autofree unsigned long *blk_bitmap = NULL;
+    uint64_t dpa, off, len, size, i;
+
+    /* Get the DCD region via the first requested extent */
+    ent = QTAILQ_FIRST(&ext_group->list);
+    dpa = ent->start_dpa;
+    len = ent->len;
+    region = cxl_find_dc_region(ct3d, dpa, len);
+    size = region->len / MHSLD_BLOCK_SZ;
+    blk_bitmap = bitmap_new(size);
+
+    /* Set all requested extents to 1 in a bitmap */
+    QTAILQ_FOREACH(ent, &ext_group->list, node) {
+        off = ent->start_dpa - region->base;
+        len = ent->len;
+        bitmap_set(blk_bitmap, off / MHSLD_BLOCK_SZ, len / MHSLD_BLOCK_SZ);
+    }
+
+    /* Clear bits associated with accepted extents */
+    for (i = 0; i < in->num_entries_updated; i++) {
+        off = in->updated_entries[i].start_dpa - region->base;
+        len = in->updated_entries[i].len;
+        bitmap_clear(blk_bitmap, off / MHSLD_BLOCK_SZ, len / MHSLD_BLOCK_SZ);
+    }
+
+    /*
+     * Reclaim only the extents that belong to unaccepted extents,
+     * i.e. those whose bits are still raised in blk_bitmap
+     */
+    for (off = find_first_bit(blk_bitmap, size); off < size;) {
+        len = find_next_zero_bit(blk_bitmap, size, off) - off;
+        cxl_mhsld_state_clear(s, off, len);
+        off = find_next_bit(blk_bitmap, size, off + len);
+    }
+
+    return true;
+}
+
+static bool cxl_mhsld_release_extent(PCIDevice *d, uint64_t dpa, uint64_t len)
+{
+    cxl_mhsld_state_clear(CXL_MHSLD(d), dpa / MHSLD_BLOCK_SZ,
+        len / MHSLD_BLOCK_SZ);
+    return true;
+}
+
+static bool cxl_mhsld_access_valid(PCIDevice *d, uint64_t addr,
+                                   unsigned int size)
+{
+    CXLType3Dev *ct3d = CXL_TYPE3(d);
+    CXLMHSLDState *s = CXL_MHSLD(d);
+    CXLDCRegion *r = cxl_find_dc_region(ct3d, addr, size);
+    size_t i;
+
+    addr = addr / r->block_size;
+    size = size / r->block_size;
+
+    for (i = 0; i < size; ++i) {
+        if (s->mhd_state->blocks[addr + i] != (1 << s->mhd_head)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static void cxl_mhsld_realize(PCIDevice *pci_dev, Error **errp)
+{
+    CXLMHSLDState *s = CXL_MHSLD(pci_dev);
+    MemoryRegion *mr;
+    int fd = -1;
+    size_t dc_size;
+
+    ct3_realize(pci_dev, errp);
+
+    /* Get number of blocks from dcd size */
+    mr = host_memory_backend_get_memory(s->ct3d.dc.host_dc);
+    if (!mr) {
+        return;
+    }
+    dc_size = memory_region_size(mr);
+    if (!dc_size) {
+        error_setg(errp, "MHSLD does not have dynamic capacity to manage");
+        return;
+    }
+
+    s->mhd_state_size = (dc_size / MHSLD_BLOCK_SZ) + sizeof(MHSLDSharedState);
+
+    /* Sanity check the head idx */
+    if (s->mhd_head >= MHSLD_HEADS) {
+        error_setg(errp, "MHD Head ID must be between 0-7");
+        return;
+    }
+
+    /* Create the state file if this is the 'mhd_init' instance */
+    if (s->mhd_init) {
+        fd = cxl_mhsld_state_create(s->mhd_state_file, s->mhd_state_size);
+    } else {
+        fd = cxl_mhsld_state_open(s->mhd_state_file, O_RDWR);
+    }
+
+    if (fd < 0) {
+        error_setg(errp, "failed to open mhsld state errno %d", errno);
+        return;
+    }
+
+    s->mhd_state_fd = fd;
+
+    /* Map the state and initialize it as needed */
+    s->mhd_state = cxl_mhsld_state_map(s);
+    if (!s->mhd_state) {
+        error_setg(errp, "Failed to mmap mhd state file");
+        close(fd);
+        cxl_mhsld_state_unlink(s->mhd_state_file);
+        return;
+    }
+
+    cxl_mhsld_state_initialize(s, dc_size);
+
+    /* Set the LD ownership for this head to this system */
+    s->mhd_state->ldmap[s->mhd_head] = s->mhd_head;
+    return;
+}
+
+
+static void cxl_mhsld_exit(PCIDevice *pci_dev)
+{
+    CXLMHSLDState *s = CXL_MHSLD(pci_dev);
+
+    ct3_exit(pci_dev);
+
+    if (s->mhd_state_fd) {
+        munmap(s->mhd_state, s->mhd_state_size);
+        close(s->mhd_state_fd);
+        cxl_mhsld_state_unlink(s->mhd_state_file);
+        s->mhd_state = NULL;
+    }
+}
+
+static void cxl_mhsld_reset(DeviceState *d)
+{
+    CXLMHSLDState *s = CXL_MHSLD(d);
+
+    ct3d_reset(d);
+    cxl_add_cci_commands(&s->ct3d.cci, cxl_cmd_set_mhsld, 512);
+
+    cxl_mhsld_state_clear(s, 0, s->mhd_state->nr_blocks);
+}
+
+/*
+ * Example: DCD-add events need to validate that the requested extent
+ *          does not already have a mapping (or, if it does, it is
+ *          a shared extent with the right tagging).
+ *
+ * Since this operates on the shared state, we will need to serialize
+ * these callbacks across QEMU instances via a mutex in shared state.
+ */
+
+static void cxl_mhsld_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *pc = PCI_DEVICE_CLASS(klass);
+
+    pc->realize = cxl_mhsld_realize;
+    pc->exit = cxl_mhsld_exit;
+    device_class_set_legacy_reset(dc, cxl_mhsld_reset);
+    device_class_set_props(dc, cxl_mhsld_props);
+
+    CXLType3Class *cvc = CXL_TYPE3_CLASS(klass);
+    cvc->mhd_get_info = cmd_mhd_get_info;
+    cvc->mhd_access_valid = cxl_mhsld_access_valid;
+    cvc->mhd_reserve_extents = cxl_mhsld_reserve_extents;
+    cvc->mhd_reclaim_extents = cxl_mhsld_reclaim_extents;
+    cvc->mhd_release_extent = cxl_mhsld_release_extent;
+}
+
+static const TypeInfo cxl_mhsld_info = {
+    .name = TYPE_CXL_MHSLD,
+    .parent = TYPE_CXL_TYPE3,
+    .class_size = sizeof(struct CXLMHSLDClass),
+    .class_init = cxl_mhsld_class_init,
+    .instance_size = sizeof(CXLMHSLDState),
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_CXL_DEVICE },
+        { INTERFACE_PCIE_DEVICE },
+        {}
+    },
+};
+
+static void cxl_mhsld_register_types(void)
+{
+    type_register_static(&cxl_mhsld_info);
+}
+
+type_init(cxl_mhsld_register_types)
diff --git a/hw/cxl/mhsld/mhsld.h b/hw/cxl/mhsld/mhsld.h
new file mode 100644
index 0000000000..e7ead1f0d2
--- /dev/null
+++ b/hw/cxl/mhsld/mhsld.h
@@ -0,0 +1,75 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (c) 2024 MemVerge Inc.
+ *
+ */
+
+#ifndef CXL_MHSLD_H
+#define CXL_MHSLD_H
+#include <stdint.h>
+#include "hw/cxl/cxl.h"
+#include "hw/cxl/cxl_mailbox.h"
+#include "hw/cxl/cxl_device.h"
+#include "qemu/units.h"
+
+#define MHSLD_BLOCK_SZ (2 * MiB)
+
+/*
+ * We limit the number of heads to prevent the shared state
+ * region from becoming a major memory hog.  We need 512MB of
+ * memory space to track 8-host ownership of 4GB of memory in
+ * blocks of 2MB.  This can change if the block size is increased.
+ */
+#define MHSLD_HEADS  (8)
+
+/*
+ * The shared state cannot have 2 variable sized regions
+ * so we have to max out the ldmap.
+ */
+typedef struct MHSLDSharedState {
+    uint8_t nr_heads;
+    uint8_t nr_lds;
+    uint8_t ldmap[MHSLD_HEADS];
+    uint64_t nr_blocks;
+    uint8_t blocks[];
+} MHSLDSharedState;
+
+struct CXLMHSLDState {
+    CXLType3Dev ct3d;
+    bool mhd_init;
+    char *mhd_state_file;
+    int mhd_state_fd;
+    size_t mhd_state_size;
+    uint32_t mhd_head;
+    MHSLDSharedState *mhd_state;
+};
+
+struct CXLMHSLDClass {
+    CXLType3Class parent_class;
+};
+
+enum {
+    MHSLD_MHD = 0x55,
+        #define GET_MHD_INFO 0x0
+};
+
+/*
+ * MHD Get Info Command
+ * Returns information the LD's associated with this head
+ */
+typedef struct MHDGetInfoInput {
+    uint8_t start_ld;
+    uint8_t ldmap_len;
+} QEMU_PACKED MHDGetInfoInput;
+
+typedef struct MHDGetInfoOutput {
+    uint8_t nr_lds;
+    uint8_t nr_heads;
+    uint16_t resv1;
+    uint8_t start_ld;
+    uint8_t ldmap_len;
+    uint16_t resv2;
+    uint8_t ldmap[];
+} QEMU_PACKED MHDGetInfoOutput;
+#endif
-- 
2.43.0
[PATCH RFC v3 1/3] cxl-mailbox-utils: move CXLUpdateDCExtentListInPl into header
[PATCH RFC v3 2/3] cxl_type3: add MHD callbacks
[PATCH RFC v3 3/3] mhsld: implement MHSLD device