[PATCH v5 3/3] hw/cxl: Add a performant (and correct) path for the non interleaved cases

Alireza Sanaee via qemu development posted 3 patches 1 month, 2 weeks ago
Maintainers: Jonathan Cameron <jonathan.cameron@huawei.com>, Fan Ni <fan.ni@samsung.com>
There is a newer version of this series
[PATCH v5 3/3] hw/cxl: Add a performant (and correct) path for the non interleaved cases
Posted by Alireza Sanaee via qemu development 1 month, 2 weeks ago
The CXL address to device decoding logic is complex because of the need to
correctly decode fine grained interleave. The current implementation
prevents use with KVM where executed instructions may reside in that memory
and gives very slow performance even in TCG.

In many real cases non interleaved memory configurations are useful and for
those we can use a more conventional memory region alias allowing similar
performance to other memory in the system.

Whether this fast path is applicable can be established once the full set
of HDM decoders has been committed (in whatever order the guest decides to
commit them). As such a check is performed on each commit/uncommit of HDM
decoder to establish if the alias should be added or removed.

Co-developed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Alireza Sanaee <alireza.sanaee@huawei.com>
---
 hw/cxl/cxl-component-utils.c |   6 ++
 hw/cxl/cxl-host.c            | 189 +++++++++++++++++++++++++++++++++++
 hw/mem/cxl_type3.c           |   4 +
 include/hw/cxl/cxl.h         |   1 +
 include/hw/cxl/cxl_device.h  |   4 +
 5 files changed, 204 insertions(+)

diff --git a/hw/cxl/cxl-component-utils.c b/hw/cxl/cxl-component-utils.c
index 07aabe331c..a624357978 100644
--- a/hw/cxl/cxl-component-utils.c
+++ b/hw/cxl/cxl-component-utils.c
@@ -143,6 +143,12 @@ static void dumb_hdm_handler(CXLComponentState *cxl_cstate, hwaddr offset,
         value = FIELD_DP32(value, CXL_HDM_DECODER0_CTRL, COMMITTED, 0);
     }
     stl_le_p((uint8_t *)cache_mem + offset, value);
+
+    if (should_commit) {
+        cfmws_update_non_interleaved(true);
+    } else if (should_uncommit) {
+        cfmws_update_non_interleaved(false);
+    }
 }
 
 static void bi_handler(CXLComponentState *cxl_cstate, hwaddr offset,
diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index 2dc9f77007..d51a4dc201 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -264,6 +264,195 @@ static PCIDevice *cxl_cfmws_find_device(CXLFixedWindow *fw, hwaddr addr,
     return d;
 }
 
+typedef struct CXLDirectPTState {
+    CXLType3Dev *ct3d;
+    hwaddr decoder_base;
+    hwaddr decoder_size;
+    hwaddr dpa_base;
+    unsigned int hdm_decoder_idx;
+} CXLDirectPTState;
+
+static void cxl_fmws_direct_passthrough_setup(CXLDirectPTState *state,
+                                              CXLFixedWindow *fw)
+{
+    CXLType3Dev *ct3d = state->ct3d;
+    MemoryRegion *mr = NULL;
+    uint64_t vmr_size = 0, pmr_size = 0, offset = 0;
+    MemoryRegion *direct_mr;
+    g_autofree char *direct_mr_name;
+    unsigned int idx = state->hdm_decoder_idx;
+
+    if (ct3d->hostvmem) {
+        MemoryRegion *vmr = host_memory_backend_get_memory(ct3d->hostvmem);
+
+        vmr_size = memory_region_size(vmr);
+        if (state->dpa_base < vmr_size) {
+            mr = vmr;
+            offset = state->dpa_base;
+        }
+    }
+    if (!mr && ct3d->hostpmem) {
+        MemoryRegion *pmr = host_memory_backend_get_memory(ct3d->hostpmem);
+
+        pmr_size = memory_region_size(pmr);
+        if (state->dpa_base - vmr_size < pmr_size) {
+            mr = pmr;
+            offset = state->dpa_base - vmr_size;
+        }
+    }
+    if (!mr) {
+        return;
+    }
+
+    if (ct3d->direct_mr_fw[idx]) {
+        return;
+    }
+
+    direct_mr = &ct3d->direct_mr[idx];
+    direct_mr_name = g_strdup_printf("cxl-direct-mapping-alias-%u", idx);
+    if (!direct_mr_name) {
+        return;
+    }
+
+    memory_region_init_alias(direct_mr, OBJECT(ct3d), direct_mr_name, mr,
+                             offset, state->decoder_size);
+    memory_region_add_subregion(&fw->mr,
+                                state->decoder_base - fw->base, direct_mr);
+    ct3d->direct_mr_fw[idx] = fw;
+}
+
+static void cxl_fmws_direct_passthrough_remove(CXLType3Dev *ct3d,
+                                               uint64_t decoder_base,
+                                               unsigned int idx)
+{
+    CXLFixedWindow *owner_fw = ct3d->direct_mr_fw[idx];
+    MemoryRegion *direct_mr = &ct3d->direct_mr[idx];
+
+    if (!owner_fw) {
+        return;
+    }
+
+    if (!memory_region_is_mapped(direct_mr)) {
+        return;
+    }
+
+    if (cxl_cfmws_find_device(owner_fw, decoder_base, false)) {
+        return;
+    }
+
+    memory_region_del_subregion(&owner_fw->mr, direct_mr);
+    ct3d->direct_mr_fw[idx] = NULL;
+}
+
+static int cxl_fmws_direct_passthrough(Object *obj, void *opaque)
+{
+    CXLDirectPTState *state = opaque;
+    CXLFixedWindow *fw;
+
+    if (!object_dynamic_cast(obj, TYPE_CXL_FMW)) {
+        return 0;
+    }
+
+    fw = CXL_FMW(obj);
+
+    /* Verify not interleaved */
+    if (!cxl_cfmws_find_device(fw, state->decoder_base, false)) {
+        return 0;
+    }
+
+    cxl_fmws_direct_passthrough_setup(state, fw);
+
+    return 0;
+}
+
+static int update_non_interleaved(Object *obj, void *opaque)
+{
+    const int hdm_inc = R_CXL_HDM_DECODER1_BASE_LO - R_CXL_HDM_DECODER0_BASE_LO;
+    bool commit = *(bool *)opaque;
+    CXLType3Dev *ct3d;
+    uint32_t *cache_mem;
+    unsigned int hdm_count, i;
+    uint32_t cap;
+    uint64_t dpa_base = 0;
+
+    if (!object_dynamic_cast(obj, TYPE_CXL_TYPE3)) {
+        return 0;
+    }
+
+    ct3d = CXL_TYPE3(obj);
+    cache_mem = ct3d->cxl_cstate.crb.cache_mem_registers;
+    cap = ldl_le_p(cache_mem + R_CXL_HDM_DECODER_CAPABILITY);
+    hdm_count = cxl_decoder_count_dec(FIELD_EX32(cap,
+                                                 CXL_HDM_DECODER_CAPABILITY,
+                                                 DECODER_COUNT));
+    for (i = 0; i < hdm_count; i++) {
+        uint64_t decoder_base, decoder_size, skip;
+        uint32_t hdm_ctrl, low, high;
+        int iw, committed;
+
+        hdm_ctrl = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + i * hdm_inc);
+        committed = FIELD_EX32(hdm_ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED);
+
+        /*
+         * Optimization: Looking for a fully committed path; if the type 3 HDM
+         * decoder is not commmitted, it cannot lie on such a path.
+         */
+        if (commit && !committed) {
+            return 0;
+        }
+
+        low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_DPA_SKIP_LO +
+                       i * hdm_inc);
+        high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_DPA_SKIP_HI +
+                        i * hdm_inc);
+        skip = ((uint64_t)high << 32) | (low & 0xf0000000);
+        dpa_base += skip;
+
+        low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_LO + i * hdm_inc);
+        high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_SIZE_HI + i * hdm_inc);
+        decoder_size = ((uint64_t)high << 32) | (low & 0xf0000000);
+
+        low = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_BASE_LO + i * hdm_inc);
+        high = ldl_le_p(cache_mem + R_CXL_HDM_DECODER0_BASE_HI + i * hdm_inc);
+        decoder_base = ((uint64_t)high << 32) | (low & 0xf0000000);
+
+        iw = FIELD_EX32(hdm_ctrl, CXL_HDM_DECODER0_CTRL, IW);
+
+        if (iw == 0) {
+            if (!commit) {
+                cxl_fmws_direct_passthrough_remove(ct3d, decoder_base, i);
+            } else {
+                CXLDirectPTState state = {
+                    .ct3d = ct3d,
+                    .decoder_base = decoder_base,
+                    .decoder_size = decoder_size,
+                    .dpa_base = dpa_base,
+                    .hdm_decoder_idx = i,
+                };
+
+                object_child_foreach_recursive(object_get_root(),
+                                               cxl_fmws_direct_passthrough,
+                                               &state);
+            }
+        }
+        dpa_base += decoder_size / cxl_interleave_ways_dec(iw, &error_fatal);
+    }
+
+    return false;
+}
+
+bool cfmws_update_non_interleaved(bool commit)
+{
+    /*
+     * Walk endpoints to find both committed and uncommitted decoders,
+     * then check if they are not interleaved (but the path is fully set up).
+     */
+    object_child_foreach_recursive(object_get_root(),
+                                   update_non_interleaved, &commit);
+
+    return false;
+}
+
 static MemTxResult cxl_read_cfmws(void *opaque, hwaddr addr, uint64_t *data,
                                   unsigned size, MemTxAttrs attrs)
 {
diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 4739239da3..d9fc0bec8f 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -427,6 +427,8 @@ static void hdm_decoder_commit(CXLType3Dev *ct3d, int which)
     ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED, 1);
 
     stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc, ctrl);
+
+    cfmws_update_non_interleaved(true);
 }
 
 static void hdm_decoder_uncommit(CXLType3Dev *ct3d, int which)
@@ -442,6 +444,8 @@ static void hdm_decoder_uncommit(CXLType3Dev *ct3d, int which)
     ctrl = FIELD_DP32(ctrl, CXL_HDM_DECODER0_CTRL, COMMITTED, 0);
 
     stl_le_p(cache_mem + R_CXL_HDM_DECODER0_CTRL + which * hdm_inc, ctrl);
+
+    cfmws_update_non_interleaved(false);
 }
 
 static int ct3d_qmp_uncor_err_to_cxl(CxlUncorErrorType qmp_err)
diff --git a/include/hw/cxl/cxl.h b/include/hw/cxl/cxl.h
index 998f495a98..931f5680bd 100644
--- a/include/hw/cxl/cxl.h
+++ b/include/hw/cxl/cxl.h
@@ -71,4 +71,5 @@ CXLComponentState *cxl_usp_to_cstate(CXLUpstreamPort *usp);
 typedef struct CXLDownstreamPort CXLDownstreamPort;
 DECLARE_INSTANCE_CHECKER(CXLDownstreamPort, CXL_DSP, TYPE_CXL_DSP)
 
+bool cfmws_update_non_interleaved(bool commit);
 #endif
diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h
index 393f312217..ba551fa5f9 100644
--- a/include/hw/cxl/cxl_device.h
+++ b/include/hw/cxl/cxl_device.h
@@ -685,6 +685,8 @@ typedef struct CXLSetFeatureInfo {
     size_t data_size;
 } CXLSetFeatureInfo;
 
+typedef struct CXLFixedWindow CXLFixedWindow;
+
 struct CXLSanitizeInfo;
 
 typedef struct CXLAlertConfig {
@@ -712,6 +714,8 @@ struct CXLType3Dev {
     uint64_t sn;
 
     /* State */
+    MemoryRegion direct_mr[CXL_HDM_DECODER_COUNT];
+    CXLFixedWindow *direct_mr_fw[CXL_HDM_DECODER_COUNT];
     AddressSpace hostvmem_as;
     AddressSpace hostpmem_as;
     CXLComponentState cxl_cstate;
-- 
2.43.0
Re: [PATCH v5 3/3] hw/cxl: Add a performant (and correct) path for the non interleaved cases
Posted by Gregory Price 1 month, 2 weeks ago
On Wed, Feb 25, 2026 at 06:07:18PM +0000, Alireza Sanaee via qemu development wrote:
> +static void cxl_fmws_direct_passthrough_remove(CXLType3Dev *ct3d,
> +                                               uint64_t decoder_base,
> +                                               unsigned int idx)
> +{
> +    CXLFixedWindow *owner_fw = ct3d->direct_mr_fw[idx];
> +    MemoryRegion *direct_mr = &ct3d->direct_mr[idx];
> +
> +    if (!owner_fw) {
> +        return;
> +    }
> +
> +    if (!memory_region_is_mapped(direct_mr)) {
> +        return;
> +    }
> +
> +    if (cxl_cfmws_find_device(owner_fw, decoder_base, false)) {
> +        return;
> +    }
> +
> +    memory_region_del_subregion(&owner_fw->mr, direct_mr);
> +    ct3d->direct_mr_fw[idx] = NULL;
> +}
> +

kreview flagged this, seems reasonable given other examples in the tree

The other commits looked ok, I'll try to get around to testing this soon.

~Gregory

---

Reported-by: kreview-8842242

The alias parameters (target MR, offset, size) are all guest-controlled
and can change between cycles -- the guest can reprogram DPA skip, decoder
size, or decoder base before recommitting.  There is no API to change the
alias target (mr->alias) after init, so an init-once approach would not
work here.

The VGA chain4_alias in hw/display/vga.c handles the same situation
(embedded MR alias rebuilt with different parameters on guest register
writes) using object_unparent():

    vga_update_memory_access() {
        memory_region_del_subregion(s->legacy_address_space, &s->chain4_alias);
        object_unparent(OBJECT(&s->chain4_alias));
        ...
        memory_region_init_alias(&s->chain4_alias, ...);
        memory_region_add_subregion_overlap(..., &s->chain4_alias, 2);
    }

Adding object_unparent() in remove() should fix this:

    memory_region_del_subregion(&owner_fw->mr, direct_mr);
    object_unparent(OBJECT(direct_mr));
    ct3d->direct_mr_fw[idx] = NULL;
Re: [PATCH v5 3/3] hw/cxl: Add a performant (and correct) path for the non interleaved cases
Posted by Alireza Sanaee via qemu development 1 month, 2 weeks ago
On Wed, 25 Feb 2026 15:34:27 -0500
Gregory Price <gourry@gourry.net> wrote:

Hi Gregory,

> On Wed, Feb 25, 2026 at 06:07:18PM +0000, Alireza Sanaee via qemu development wrote:
> > +static void cxl_fmws_direct_passthrough_remove(CXLType3Dev *ct3d,
> > +                                               uint64_t decoder_base,
> > +                                               unsigned int idx)
> > +{
> > +    CXLFixedWindow *owner_fw = ct3d->direct_mr_fw[idx];
> > +    MemoryRegion *direct_mr = &ct3d->direct_mr[idx];
> > +
> > +    if (!owner_fw) {
> > +        return;
> > +    }
> > +
> > +    if (!memory_region_is_mapped(direct_mr)) {
> > +        return;
> > +    }
> > +
> > +    if (cxl_cfmws_find_device(owner_fw, decoder_base, false)) {
> > +        return;
> > +    }
> > +
> > +    memory_region_del_subregion(&owner_fw->mr, direct_mr);
> > +    ct3d->direct_mr_fw[idx] = NULL;
> > +}
> > +  
> 
> kreview flagged this, seems reasonable given other examples in the tree
> 
> The other commits looked ok, I'll try to get around to testing this soon.
> 
> ~Gregory
> 
> ---
> 
> Reported-by: kreview-8842242
> 
> The alias parameters (target MR, offset, size) are all guest-controlled
> and can change between cycles -- the guest can reprogram DPA skip, decoder
> size, or decoder base before recommitting.  There is no API to change the
> alias target (mr->alias) after init, so an init-once approach would not
> work here.
> 
> The VGA chain4_alias in hw/display/vga.c handles the same situation
> (embedded MR alias rebuilt with different parameters on guest register
> writes) using object_unparent():
> 
>     vga_update_memory_access() {
>         memory_region_del_subregion(s->legacy_address_space, &s->chain4_alias);
>         object_unparent(OBJECT(&s->chain4_alias));
>         ...
>         memory_region_init_alias(&s->chain4_alias, ...);
>         memory_region_add_subregion_overlap(..., &s->chain4_alias, 2);
>     }
> 
> Adding object_unparent() in remove() should fix this:
> 
>     memory_region_del_subregion(&owner_fw->mr, direct_mr);
>     object_unparent(OBJECT(direct_mr));
>     ct3d->direct_mr_fw[idx] = NULL;
> 

This was actually my question as well, but I tested with a scenario in which I 
created a region with its alias. Then I torn down that region so that the alias 
goes away, and finally reinitialized the same region to reuse the deleted alias 
object. It worked, but still it does not mean it is correct. I can send another 
revision using unparent API.
Re: [PATCH v5 3/3] hw/cxl: Add a performant (and correct) path for the non interleaved cases
Posted by Jonathan Cameron via qemu development 1 month, 1 week ago
On Thu, 26 Feb 2026 00:14:05 +0000
Alireza Sanaee <alireza.sanaee@huawei.com> wrote:

> On Wed, 25 Feb 2026 15:34:27 -0500
> Gregory Price <gourry@gourry.net> wrote:
> 
> Hi Gregory,
> 
> > On Wed, Feb 25, 2026 at 06:07:18PM +0000, Alireza Sanaee via qemu development wrote:  
> > > +static void cxl_fmws_direct_passthrough_remove(CXLType3Dev *ct3d,
> > > +                                               uint64_t decoder_base,
> > > +                                               unsigned int idx)
> > > +{
> > > +    CXLFixedWindow *owner_fw = ct3d->direct_mr_fw[idx];
> > > +    MemoryRegion *direct_mr = &ct3d->direct_mr[idx];
> > > +
> > > +    if (!owner_fw) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (!memory_region_is_mapped(direct_mr)) {
> > > +        return;
> > > +    }
> > > +
> > > +    if (cxl_cfmws_find_device(owner_fw, decoder_base, false)) {
> > > +        return;
> > > +    }
> > > +
> > > +    memory_region_del_subregion(&owner_fw->mr, direct_mr);
> > > +    ct3d->direct_mr_fw[idx] = NULL;
> > > +}
> > > +    
> > 
> > kreview flagged this, seems reasonable given other examples in the tree
> > 
> > The other commits looked ok, I'll try to get around to testing this soon.
> > 
> > ~Gregory
> > 
> > ---
> > 
> > Reported-by: kreview-8842242
> > 
> > The alias parameters (target MR, offset, size) are all guest-controlled
> > and can change between cycles -- the guest can reprogram DPA skip, decoder
> > size, or decoder base before recommitting.  There is no API to change the
> > alias target (mr->alias) after init, so an init-once approach would not
> > work here.
> > 
> > The VGA chain4_alias in hw/display/vga.c handles the same situation
> > (embedded MR alias rebuilt with different parameters on guest register
> > writes) using object_unparent():
> > 
> >     vga_update_memory_access() {
> >         memory_region_del_subregion(s->legacy_address_space, &s->chain4_alias);
> >         object_unparent(OBJECT(&s->chain4_alias));
> >         ...
> >         memory_region_init_alias(&s->chain4_alias, ...);
> >         memory_region_add_subregion_overlap(..., &s->chain4_alias, 2);
> >     }
> > 
> > Adding object_unparent() in remove() should fix this:
> > 
> >     memory_region_del_subregion(&owner_fw->mr, direct_mr);
> >     object_unparent(OBJECT(direct_mr));
> >     ct3d->direct_mr_fw[idx] = NULL;
> >   
> 
> This was actually my question as well, but I tested with a scenario in which I 
> created a region with its alias. Then I torn down that region so that the alias 
> goes away, and finally reinitialized the same region to reuse the deleted alias 
> object. It worked, but still it does not mean it is correct. I can send another 
> revision using unparent API.

It is more elegant (and keeps the introspection stuff reporting correctly)
to unparent the object on delete. To test this you'd need to bring it up in a different
cfmws so the parent changes.  So this is good feedback.

There is a secondary issue that around the HBM decoder registers being modified at random
points that don't make sense from an OS flow point of view, but which could lead to
mismatch.  For that one we'd need to snapshot the decoders at commit time and use
that snapshot as the valid settings.

Note that form a spec viewpoint I think it's undefined what happens if the OS messes
with these registers at inappropriate moments.  Snap shot should be a valid 
implementation choice though.

Given we aren't currently that defensive against the OS doing crazy things I don't
mind pushing that snapshot thing to a follow up patch.  Maybe merge into this once
it's ready.

Jonathan