Endpoints requiring address translation might not be aware of the
system's interleaving configuration. Instead, interleaving can be
configured on an upper memory domain (from an endpoint view) and thus
is not visible to the endpoint. For region creation this might cause
an invalid interleaving config that does not match the CFMWS entries.
Use the interleaving configuration of the root decoders to create a
region which bases on CFMWS entries. This always matches the system's
interleaving configuration and is independent of the underlying memory
topology.
Signed-off-by: Robert Richter <rrichter@amd.com>
---
drivers/cxl/core/region.c | 39 ++++++++++++++++++++++++++++++++++-----
drivers/cxl/cxl.h | 2 ++
2 files changed, 36 insertions(+), 5 deletions(-)
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 6e0434eee6df..3afcc9ca06ae 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1749,6 +1749,15 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
}
}
+ if (p->interleave_ways != cxled->interleave_ways ||
+ p->interleave_granularity != cxled->interleave_granularity ) {
+ dev_dbg(&cxlr->dev, "interleaving config mismatch with %s: ways: %d:%d granularity: %d:%d\n",
+ dev_name(&cxled->cxld.dev), p->interleave_ways,
+ cxled->interleave_ways, p->interleave_granularity,
+ cxled->interleave_granularity);
+ return -ENXIO;
+ }
+
return 0;
}
@@ -1852,7 +1861,7 @@ static int match_switch_decoder_by_range(struct device *dev,
}
static int find_pos_and_ways(struct cxl_port *port, struct range *range,
- int *pos, int *ways)
+ int *pos, int *ways, int *granularity)
{
struct cxl_switch_decoder *cxlsd;
struct cxl_port *parent;
@@ -1873,6 +1882,7 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range,
}
cxlsd = to_cxl_switch_decoder(dev);
*ways = cxlsd->cxld.interleave_ways;
+ *granularity = cxlsd->cxld.interleave_granularity;
for (int i = 0; i < *ways; i++) {
if (cxlsd->target[i] == port->parent_dport) {
@@ -1896,6 +1906,8 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range,
struct cxl_interleave_context {
struct range *hpa_range;
int pos;
+ int interleave_ways;
+ int interleave_granularity;
};
/**
@@ -1914,13 +1926,17 @@ struct cxl_interleave_context {
* the topology from the endpoint to the root decoder and iteratively
* applying the function for each port.
*
+ * Calculation of interleaving ways:
+ *
+ * interleave_ways = interleave_ways * parent_ways;
+ *
* Return: position >= 0 on success
* -ENXIO on failure
*/
static int cxl_port_calc_interleave(struct cxl_port *port,
struct cxl_interleave_context *ctx)
{
- int parent_ways = 0, parent_pos = 0;
+ int parent_ways = 0, parent_pos = 0, parent_granularity = 0;
int rc;
/*
@@ -1955,12 +1971,23 @@ static int cxl_port_calc_interleave(struct cxl_port *port,
if (is_cxl_root(port))
return 0;
- rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways);
+ rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways,
+ &parent_granularity);
if (rc)
return rc;
ctx->pos = ctx->pos * parent_ways + parent_pos;
+ if (ctx->interleave_ways)
+ ctx->interleave_ways *= parent_ways;
+ else
+ ctx->interleave_ways = parent_ways;
+
+ if (ctx->interleave_granularity)
+ ctx->interleave_granularity *= ctx->interleave_ways;
+ else
+ ctx->interleave_granularity = parent_granularity;
+
return ctx->pos;
}
@@ -3407,6 +3434,8 @@ static int cxl_endpoint_decoder_initialize(struct cxl_endpoint_decoder *cxled)
cxled->cxlrd = to_cxl_root_decoder(&cxld->dev);
cxled->spa_range = hpa;
cxled->pos = ctx.pos;
+ cxled->interleave_ways = ctx.interleave_ways;
+ cxled->interleave_granularity = ctx.interleave_granularity;
return 0;
}
@@ -3508,8 +3537,8 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
}
p->res = res;
- p->interleave_ways = cxled->cxld.interleave_ways;
- p->interleave_granularity = cxled->cxld.interleave_granularity;
+ p->interleave_ways = cxled->interleave_ways;
+ p->interleave_granularity = cxled->interleave_granularity;
p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 7303aec1c31c..31afd71c3c8e 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -399,6 +399,8 @@ struct cxl_endpoint_decoder {
enum cxl_decoder_state state;
int part;
int pos;
+ int interleave_ways;
+ int interleave_granularity;
};
/**
--
2.39.5
On Tue, Feb 18, 2025 at 02:23:51PM +0100, Robert Richter wrote:
> Endpoints requiring address translation might not be aware of the
> system's interleaving configuration. Instead, interleaving can be
> configured on an upper memory domain (from an endpoint view) and thus
> is not visible to the endpoint. For region creation this might cause
> an invalid interleaving config that does not match the CFMWS entries.
>
> Use the interleaving configuration of the root decoders to create a
> region which bases on CFMWS entries. This always matches the system's
> interleaving configuration and is independent of the underlying memory
> topology.
>
> Signed-off-by: Robert Richter <rrichter@amd.com>
... snip ...
> @@ -1955,12 +1971,23 @@ static int cxl_port_calc_interleave(struct cxl_port *port,
> if (is_cxl_root(port))
> return 0;
>
> - rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways);
> + rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways,
> + &parent_granularity);
> if (rc)
> return rc;
>
> ctx->pos = ctx->pos * parent_ways + parent_pos;
>
> + if (ctx->interleave_ways)
> + ctx->interleave_ways *= parent_ways;
> + else
> + ctx->interleave_ways = parent_ways;
> +
> + if (ctx->interleave_granularity)
> + ctx->interleave_granularity *= ctx->interleave_ways;
> + else
> + ctx->interleave_granularity = parent_granularity;
> +
> return ctx->pos;
> }
The root is always build from the CFMWS, so you don't need to do any
math, you just need to take the root. Since the logic calling this will
always climb to the root, you'll always get the right value.
So I think you just want this
---
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index fe23dc106956..05b24b008a1b 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1977,17 +1977,8 @@ static int cxl_port_calc_interleave(struct cxl_port *port,
return rc;
ctx->pos = ctx->pos * parent_ways + parent_pos;
-
- if (ctx->interleave_ways)
- ctx->interleave_ways *= parent_ways;
- else
- ctx->interleave_ways = parent_ways;
-
- if (ctx->interleave_granularity)
- ctx->interleave_granularity *= ctx->interleave_ways;
- else
- ctx->interleave_granularity = parent_granularity;
-
+ ctx->interleave_ways = parent_ways;
+ ctx->interleave_granularity = parent_granularity;
return ctx->pos;
}
On Tue, Feb 18, 2025 at 02:23:51PM +0100, Robert Richter wrote:
> @@ -1955,12 +1971,23 @@ static int cxl_port_calc_interleave(struct cxl_port *port,
> if (is_cxl_root(port))
> return 0;
>
> - rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways);
> + rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways,
> + &parent_granularity);
> if (rc)
> return rc;
>
> ctx->pos = ctx->pos * parent_ways + parent_pos;
>
> + if (ctx->interleave_ways)
> + ctx->interleave_ways *= parent_ways;
> + else
> + ctx->interleave_ways = parent_ways;
> +
> + if (ctx->interleave_granularity)
> + ctx->interleave_granularity *= ctx->interleave_ways;
> + else
> + ctx->interleave_granularity = parent_granularity;
> +
> return ctx->pos;
> }
>
I have discovered on my Zen5 that either this code is incorrect, or my
decoders are programmed incorrectly.
decoderN.M | ig iw
----------------------
decoder0.0 | 2 256
decoder1.0 | 1 256
decoder3.0 | 1 256
decoder5.0 | 1 256
decoder6.0 | 1 256
region0 | 2 512 <--- Wrong
*Arch quirk aside*, everything except region is as expected.
I finally dropped a bunch of hacks from my branch, and my Zen5 stopped
bringing devices up correctly, with the error:
[]cxl region0: pci0000:d2:port1 cxl_port_setup_targets expected
iw: 1 ig: 1024 [... snip ...]
[]cxl region0: pci0000:d2:port1 cxl_port_setup_targets got
iw: 1 ig: 256 [... snip ...]
Sitting here scratching my head how I could possibly end up with an ig
of 1024 with the above set of decoders when I realized the region
inherited interleave_ways/granularity from the ENDPOINT decoder, which
is not exposed to sysfs.
Had to come back around to realize this patch set adds new
ways/granularity fields to the endpoint decoder.
struct cxl_endpoint_decoder {
struct cxl_decoder cxld;
...
int interleave_ways;
int interleave_granularity;
}
struct cxl_decoder {
...
int interleave_ways;
int interleave_granularity;
}
1) the cxl_endpoint_decoder descriptor needs to be updated to explain
why these ways/granularity differ from the cxl_decoder inside of the
cxl_endpoint_decoder. This is very, very confusing.
The reason appears to be that the endpoint decoder ways/granularity
is the region ways/granularity. So the endpoint decoder is passing
this information along.
Makes me think the region creation code should call this directly,
rather than all this indirection.
2) This calculation appears to just be plain wrong.
static int cxl_endpoint_decoder_initialize(struct cxl_endpoint_decoder *cxled)
{
ctx = (struct cxl_interleave_context) {
.hpa_range = &hpa,
};
...
while (iter && parent) {
endpoint host bridge root
decoder6.0 -> decoder3.0 -> decoder0.0
/* Convert interleave settings to next port upstream. */
rc = cxl_port_calc_interleave(iter, &ctx);
...
}
...
cxled->interleave_ways = ctx.interleave_ways;
cxled->interleave_granularity = ctx.interleave_granularity;
}
On my setup, I would expect to iterate decoder3.0 and decoder0.0
decoderN.M | ig iw
----------------------
decoder0.0 | 2 256
decoder3.0 | 1 256
on entry [iw,ig] = [0,0]
[parent_ways, parent_gran] -> [1,256]
[iw * piw, ig * piw] -> [2,512]
Looking at a normal system, we'd expect this configuration:
decoderN.M | ig iw
----------------------
decoder0.0 | 2 256
decoder1.0 | 1 512
decoder3.0 | 1 512
decoder5.0 | 2 256
decoder6.0 | 2 256
The above code produces the following:
[1,512]
[2,1024] <--- still wrong
in cxl_port_setup_targets we have this comment:
if (is_cxl_root(parent_port)) {
/*
* Root decoder IG is always set to value in CFMWS which
* may be different than this region's IG. We can use the
* region's IG here since interleave_granularity_store()
* does not allow interleaved host-bridges with
* root IG != region IG.
*/
parent_ig = p->interleave_granularity;
parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
}
Can we not just always report the parent ways/granularity, and skip all
the math? We'll always iterate to the root, and that's what we want the
region to match, right?
Better yet, can we not just do this in the region creation code, rather
than having the endpoint carry it through to the region for some reason?
Avoid adding the duplicate ways/granularity field all together.
~Gregory
On Mon, Mar 31, 2025 at 09:59:45PM -0400, Gregory Price wrote:
> I have discovered on my Zen5 that either this code is incorrect, or my
> decoders are programmed incorrectly.
>
> decoderN.M | ig iw
> ----------------------
> decoder0.0 | 2 256
> decoder3.0 | 1 256
> decoder6.0 | 1 256
> region0 | 2 512 <--- Wrong
>
> *Arch quirk aside*, everything except region is as expected.
>
... snip ...
>
> Looking at a normal system, we'd expect this configuration:
>
> decoderN.M | ig iw
> ----------------------
> decoder0.0 | 2 256
> decoder3.0 | 1 512
> decoder6.0 | 2 256
>
> The above code produces the following:
> [1,512]
> [2,1024] <--- still wrong
>
... snip ...
>
> Can we not just always report the parent ways/granularity, and skip all
> the math? We'll always iterate to the root, and that's what we want the
> region to match, right?
>
> Better yet, can we not just do this in the region creation code, rather
> than having the endpoint carry it through to the region for some reason?
> Avoid adding the duplicate ways/granularity field all together.
>
Having tested just using the root decoder data, i now get the expected
1:512, but i realized the issue is also that the regionref uses the
endpoint->decoder interleave ways/granularity
Before:
[]cxl region0: pci0000:d2:port1 cxl_port_setup_targets expected
iw: 1 ig: 1024 [... snip ...]
[]cxl region0: pci0000:d2:port1 cxl_port_setup_targets got
iw: 1 ig: 256 [... snip ...]
After:
[]cxl region0: pci0000:d2:port1 cxl_port_setup_targets expected
iw: 1 ig: 512
[]cxl region0: pci0000:d2:port1 cxl_port_setup_targets got
iw: 1 ig: 256
This makes sense, as the Zen5 quirk here is that the endpoints are
programmed with a 0-base for just their capacity, and they have no
interleave set on them - while the host bridges have 1:256 to match
the endpoint and 2:256 in the root is the only "correct" (in-spec)
programming of the topology.
I think the only choice here is to have another arch_check_interleave()
check here in `cxl_port_setup_targets()` that checks for this. I
haven't run the numbers on what the results would be if the HB had 1:512
instead of 1:256, but I imagine there lies another round of madness.
~Gregory
On Tue, 18 Feb 2025 14:23:51 +0100
Robert Richter <rrichter@amd.com> wrote:
> Endpoints requiring address translation might not be aware of the
> system's interleaving configuration. Instead, interleaving can be
> configured on an upper memory domain (from an endpoint view) and thus
> is not visible to the endpoint. For region creation this might cause
> an invalid interleaving config that does not match the CFMWS entries.
>
> Use the interleaving configuration of the root decoders to create a
> region which bases on CFMWS entries. This always matches the system's
> interleaving configuration and is independent of the underlying memory
> topology.
>
> Signed-off-by: Robert Richter <rrichter@amd.com>
> ---
> drivers/cxl/core/region.c | 39 ++++++++++++++++++++++++++++++++++-----
> drivers/cxl/cxl.h | 2 ++
> 2 files changed, 36 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 6e0434eee6df..3afcc9ca06ae 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1749,6 +1749,15 @@ static int cxl_region_validate_position(struct cxl_region *cxlr,
> }
> }
>
> + if (p->interleave_ways != cxled->interleave_ways ||
> + p->interleave_granularity != cxled->interleave_granularity ) {
> + dev_dbg(&cxlr->dev, "interleaving config mismatch with %s: ways: %d:%d granularity: %d:%d\n",
> + dev_name(&cxled->cxld.dev), p->interleave_ways,
> + cxled->interleave_ways, p->interleave_granularity,
> + cxled->interleave_granularity);
> + return -ENXIO;
> + }
> +
> return 0;
> }
>
> @@ -1852,7 +1861,7 @@ static int match_switch_decoder_by_range(struct device *dev,
> }
>
> static int find_pos_and_ways(struct cxl_port *port, struct range *range,
> - int *pos, int *ways)
> + int *pos, int *ways, int *granularity)
> {
> struct cxl_switch_decoder *cxlsd;
> struct cxl_port *parent;
> @@ -1873,6 +1882,7 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range,
> }
> cxlsd = to_cxl_switch_decoder(dev);
> *ways = cxlsd->cxld.interleave_ways;
> + *granularity = cxlsd->cxld.interleave_granularity;
>
> for (int i = 0; i < *ways; i++) {
> if (cxlsd->target[i] == port->parent_dport) {
> @@ -1896,6 +1906,8 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range,
> struct cxl_interleave_context {
> struct range *hpa_range;
> int pos;
> + int interleave_ways;
> + int interleave_granularity;
Ah. And here is our context expansion
> };
>
> /**
> @@ -1914,13 +1926,17 @@ struct cxl_interleave_context {
> * the topology from the endpoint to the root decoder and iteratively
> * applying the function for each port.
> *
> + * Calculation of interleaving ways:
> + *
> + * interleave_ways = interleave_ways * parent_ways;
> + *
> * Return: position >= 0 on success
> * -ENXIO on failure
> */
> static int cxl_port_calc_interleave(struct cxl_port *port,
> struct cxl_interleave_context *ctx)
> {
> - int parent_ways = 0, parent_pos = 0;
> + int parent_ways = 0, parent_pos = 0, parent_granularity = 0;
> int rc;
>
> /*
> @@ -1955,12 +1971,23 @@ static int cxl_port_calc_interleave(struct cxl_port *port,
> if (is_cxl_root(port))
> return 0;
>
> - rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways);
> + rc = find_pos_and_ways(port, ctx->hpa_range, &parent_pos, &parent_ways,
> + &parent_granularity);
> if (rc)
> return rc;
>
> ctx->pos = ctx->pos * parent_ways + parent_pos;
>
> + if (ctx->interleave_ways)
> + ctx->interleave_ways *= parent_ways;
> + else
> + ctx->interleave_ways = parent_ways;
> +
> + if (ctx->interleave_granularity)
> + ctx->interleave_granularity *= ctx->interleave_ways;
> + else
> + ctx->interleave_granularity = parent_granularity;
> +
> return ctx->pos;
I think Gregory called this out in earlier patch. Mixing and matching
between returning pos and use of ctx makes things hard to read. If
we need to have it in context, then make this return 0 or -ERR instead.
> }
On Tue, Feb 18, 2025 at 02:23:51PM +0100, Robert Richter wrote: ... snip ... > static int find_pos_and_ways(struct cxl_port *port, struct range *range, > - int *pos, int *ways) > + int *pos, int *ways, int *granularity) Function name starting to diverge from functionality find_interleave_config() ? ~Gregory
© 2016 - 2025 Red Hat, Inc.