drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 93 deletions(-)
The CXL specification supports multi-level interleaving "as long as
all the levels use different, but consecutive, HPA bits to select the
target and no Interleave Set has more than 8 devices" (from 3.2).
Currently the kernel expects that a decoder's "interleave granularity
is a multiple of @parent_port granularity". That is, the granularity
of a lower level is bigger than those of the parent and uses the outer
HPA bits as selector. It works e.g. for the following 8-way config:
* cross-link (cross-hostbridge config in CFMWS):
* 4-way
* 256 granularity
* Selector: HPA[8:9]
* sub-link (CXL Host bridge config of the HDM):
* 2-way
* 1024 granularity
* Selector: HPA[10]
Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way
config could look like this:
* cross-link (cross-hostbridge config in CFMWS):
* 4-way
* 512 granularity
* Selector: HPA[9:10]
* sub-link (CXL Host bridge config of the HDM):
* 2-way
* 256 granularity
* Selector: HPA[8]
The enumeration of decoders for this configuration fails then with
following error:
cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200]
cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff
cxl_port endpoint12: failed to attach decoder12.0 to region0: -6
Note that this happens only if firmware is setting up the decoders
(CXL_REGION_F_AUTO). For userspace region assembly the granularities
are chosen to increase from root down to the lower levels. That is,
outer HPA bits are always used for lower interleaving levels.
Rework the implementation to also support multi-level interleaving
with smaller granularities for lower levels. Determine the interleave
set of autodetected decoders. Check that it is a subset of the root
interleave.
The HPA selector bits are extracted for all decoders of the set and
checked that there is no overlap and bits are consecutive. All
decoders can be programmed now to use any bit range within the
region's target selector.
Signed-off-by: Robert Richter <rrichter@amd.com>
---
drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------
1 file changed, 108 insertions(+), 93 deletions(-)
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index b06fee1978ba..79d35def7c79 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1323,57 +1323,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
return 0;
}
+static inline u64 get_selector(u64 ways, u64 gran)
+{
+ if (!is_power_of_2(ways))
+ ways /= 3;
+
+ if (!is_power_of_2(ways) || !is_power_of_2(gran))
+ return 0;
+
+ return (ways - 1) * gran;
+}
+
static int cxl_port_setup_targets(struct cxl_port *port,
struct cxl_region *cxlr,
struct cxl_endpoint_decoder *cxled)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
- int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
struct cxl_region_params *p = &cxlr->params;
struct cxl_decoder *cxld = cxl_rr->decoder;
- struct cxl_switch_decoder *cxlsd;
+ struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev);
struct cxl_port *iter = port;
- u16 eig, peig;
- u8 eiw, peiw;
+ int ig, iw = cxl_rr->nr_targets, rc, inc = 0, pos = cxled->pos;
+ int distance, parent_distance;
+ u64 selector, cxlr_sel;
+ u16 eig;
+ u8 eiw;
/*
* While root level decoders support x3, x6, x12, switch level
* decoders only support powers of 2 up to x16.
*/
- if (!is_power_of_2(cxl_rr->nr_targets)) {
+ if (!is_power_of_2(iw)) {
dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
- dev_name(port->uport_dev), dev_name(&port->dev),
- cxl_rr->nr_targets);
+ dev_name(port->uport_dev), dev_name(&port->dev), iw);
return -EINVAL;
}
- cxlsd = to_cxl_switch_decoder(&cxld->dev);
- if (cxl_rr->nr_targets_set) {
- int i, distance = 1;
- struct cxl_region_ref *cxl_rr_iter;
+ if (iw > 8 || iw > cxlsd->nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s:%s: ways: %d overflows targets: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxld->dev), iw, cxlsd->nr_targets);
+ return -ENXIO;
+ }
- /*
- * The "distance" between peer downstream ports represents which
- * endpoint positions in the region interleave a given port can
- * host.
- *
- * For example, at the root of a hierarchy the distance is
- * always 1 as every index targets a different host-bridge. At
- * each subsequent switch level those ports map every Nth region
- * position where N is the width of the switch == distance.
- */
- do {
- cxl_rr_iter = cxl_rr_load(iter, cxlr);
- distance *= cxl_rr_iter->nr_targets;
- iter = to_cxl_port(iter->dev.parent);
- } while (!is_cxl_root(iter));
- distance *= cxlrd->cxlsd.cxld.interleave_ways;
+ /*
+ * Calculate the effective granularity and ways to determine
+ * HPA bits used as target selectors of the interleave set.
+ * Use this to check if the root decoder and all subsequent
+ * HDM decoders only use bits from that range as selectors.
+ *
+ * The "distance" between peer downstream ports represents which
+ * endpoint positions in the region interleave a given port can
+ * host.
+ *
+ * For example, at the root of a hierarchy the distance is
+ * always 1 as every index targets a different host-bridge. At
+ * each subsequent switch level those ports map every Nth region
+ * position where N is the width of the switch == distance.
+ */
+
+ /* Start with the root decoders selector and distance. */
+ selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways,
+ cxlrd->cxlsd.cxld.interleave_granularity);
+ distance = cxlrd->cxlsd.cxld.interleave_ways;
+ if (!is_power_of_2(distance))
+ distance /= 3;
+
+ for (iter = parent_port; !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent)) {
+ struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr);
+ struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder;
+ u64 cxld_sel;
+
+ if (cxld_iter->interleave_ways == 1)
+ continue;
+
+ cxld_sel = get_selector(cxld_iter->interleave_ways,
+ cxld_iter->interleave_granularity);
+
+ if (cxld_sel & selector) {
+ dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n",
+ dev_name(iter->uport_dev),
+ dev_name(&iter->dev), cxld_sel, selector);
+ return -ENXIO;
+ }
- for (i = 0; i < cxl_rr->nr_targets_set; i++)
+ selector |= cxld_sel;
+ distance *= cxl_rr_iter->nr_targets;
+ }
+
+ parent_distance = distance;
+ distance *= iw;
+
+ /* The combined selector bits must fit the region selector. */
+ cxlr_sel = get_selector(p->interleave_ways,
+ p->interleave_granularity);
+
+ if ((cxlr_sel & selector) != selector) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n",
+ dev_name(iter->uport_dev),
+ dev_name(&iter->dev), cxlr_sel, selector);
+ return -ENXIO;
+ }
+
+ /* Calculate remaining selector bits available for use. */
+ selector = cxlr_sel & ~selector;
+
+ if (cxl_rr->nr_targets_set) {
+ for (int i = 0; i < cxl_rr->nr_targets_set; i++)
if (ep->dport == cxlsd->target[i]) {
rc = check_last_peer(cxled, ep, cxl_rr,
distance);
@@ -1384,87 +1446,40 @@ static int cxl_port_setup_targets(struct cxl_port *port,
goto add_target;
}
- if (is_cxl_root(parent_port)) {
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+ ig = cxld->interleave_granularity;
+ else
/*
+ * Set the interleave granularity with each interleave
+ * level to a multiple of it's parent port interleave
+ * ways. Beginning with the granularity of the root
+ * decoder set to the region granularity (starting
+ * with the inner selector bits of the HPA), the
+ * granularity is increased with each level. Calculate
+ * this using the parent distance and region
+ * granularity.
+ *
* Root decoder IG is always set to value in CFMWS which
* may be different than this region's IG. We can use the
* region's IG here since interleave_granularity_store()
* does not allow interleaved host-bridges with
* root IG != region IG.
*/
- parent_ig = p->interleave_granularity;
- parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
- /*
- * For purposes of address bit routing, use power-of-2 math for
- * switch ports.
- */
- if (!is_power_of_2(parent_iw))
- parent_iw /= 3;
- } else {
- struct cxl_region_ref *parent_rr;
- struct cxl_decoder *parent_cxld;
-
- parent_rr = cxl_rr_load(parent_port, cxlr);
- parent_cxld = parent_rr->decoder;
- parent_ig = parent_cxld->interleave_granularity;
- parent_iw = parent_cxld->interleave_ways;
- }
-
- rc = granularity_to_eig(parent_ig, &peig);
- if (rc) {
- dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
- dev_name(parent_port->uport_dev),
- dev_name(&parent_port->dev), parent_ig);
- return rc;
- }
-
- rc = ways_to_eiw(parent_iw, &peiw);
- if (rc) {
- dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
- dev_name(parent_port->uport_dev),
- dev_name(&parent_port->dev), parent_iw);
- return rc;
- }
+ ig = p->interleave_granularity * parent_distance;
- iw = cxl_rr->nr_targets;
rc = ways_to_eiw(iw, &eiw);
- if (rc) {
- dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
- dev_name(port->uport_dev), dev_name(&port->dev), iw);
- return rc;
- }
-
- /*
- * Interleave granularity is a multiple of @parent_port granularity.
- * Multiplier is the parent port interleave ways.
- */
- rc = granularity_to_eig(parent_ig * parent_iw, &eig);
- if (rc) {
- dev_dbg(&cxlr->dev,
- "%s: invalid granularity calculation (%d * %d)\n",
- dev_name(&parent_port->dev), parent_ig, parent_iw);
- return rc;
- }
-
- rc = eig_to_granularity(eig, &ig);
- if (rc) {
- dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
- dev_name(port->uport_dev), dev_name(&port->dev),
- 256 << eig);
- return rc;
- }
+ if (!rc)
+ rc = granularity_to_eig(ig, &eig);
- if (iw > 8 || iw > cxlsd->nr_targets) {
- dev_dbg(&cxlr->dev,
- "%s:%s:%s: ways: %d overflows targets: %d\n",
+ if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n",
dev_name(port->uport_dev), dev_name(&port->dev),
- dev_name(&cxld->dev), iw, cxlsd->nr_targets);
+ iw, ig, selector);
return -ENXIO;
}
if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
if (cxld->interleave_ways != iw ||
- (iw > 1 && cxld->interleave_granularity != ig) ||
!region_res_match_cxl_range(p, &cxld->hpa_range) ||
((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
dev_err(&cxlr->dev,
--
2.47.3
Hi Robert, kernel test robot noticed the following build errors: [auto build test ERROR on cxl/next] [also build test ERROR on linus/master v6.18-rc3 next-20251030] [cannot apply to cxl/pending] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Robert-Richter/cxl-region-Support-multi-level-interleaving-with-smaller-granularities-for-lower-levels/20251028-175032 base: https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git next patch link: https://lore.kernel.org/r/20251028094754.72816-1-rrichter%40amd.com patch subject: [PATCH] cxl/region: Support multi-level interleaving with smaller granularities for lower levels config: i386-randconfig-141-20251031 (https://download.01.org/0day-ci/archive/20251031/202510311413.HdTizWHK-lkp@intel.com/config) compiler: gcc-14 (Debian 14.2.0-19) 14.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251031/202510311413.HdTizWHK-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202510311413.HdTizWHK-lkp@intel.com/ All errors (new ones prefixed by >>, old ones prefixed by <<): >> ERROR: modpost: "__udivdi3" [drivers/cxl/core/cxl_core.ko] undefined! -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
On Fri, Oct 31, 2025 at 03:03:10PM +0800, kernel test robot wrote: > Hi Robert, > > All errors (new ones prefixed by >>, old ones prefixed by <<): > > >> ERROR: modpost: "__udivdi3" [drivers/cxl/core/cxl_core.ko] undefined! > IIRC this happens when you do 64-bit division because it's annoying and complicated. Ran into this madness in mempolicy. ~Gregory
On 10/28/25 09:47, Robert Richter wrote:
> The CXL specification supports multi-level interleaving "as long as
> all the levels use different, but consecutive, HPA bits to select the
> target and no Interleave Set has more than 8 devices" (from 3.2).
>
> Currently the kernel expects that a decoder's "interleave granularity
> is a multiple of @parent_port granularity". That is, the granularity
> of a lower level is bigger than those of the parent and uses the outer
> HPA bits as selector. It works e.g. for the following 8-way config:
>
> * cross-link (cross-hostbridge config in CFMWS):
> * 4-way
> * 256 granularity
> * Selector: HPA[8:9]
> * sub-link (CXL Host bridge config of the HDM):
> * 2-way
> * 1024 granularity
> * Selector: HPA[10]
>
> Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way
> config could look like this:
>
> * cross-link (cross-hostbridge config in CFMWS):
> * 4-way
> * 512 granularity
> * Selector: HPA[9:10]
> * sub-link (CXL Host bridge config of the HDM):
> * 2-way
> * 256 granularity
> * Selector: HPA[8]
>
> The enumeration of decoders for this configuration fails then with
> following error:
>
> cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200]
> cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff
> cxl_port endpoint12: failed to attach decoder12.0 to region0: -6
>
> Note that this happens only if firmware is setting up the decoders
> (CXL_REGION_F_AUTO). For userspace region assembly the granularities
> are chosen to increase from root down to the lower levels. That is,
> outer HPA bits are always used for lower interleaving levels.
>
> Rework the implementation to also support multi-level interleaving
> with smaller granularities for lower levels. Determine the interleave
> set of autodetected decoders. Check that it is a subset of the root
> interleave.
>
> The HPA selector bits are extracted for all decoders of the set and
> checked that there is no overlap and bits are consecutive. All
> decoders can be programmed now to use any bit range within the
> region's target selector.
>
> Signed-off-by: Robert Richter <rrichter@amd.com>
> ---
> drivers/cxl/core/region.c | 201 ++++++++++++++++++++------------------
> 1 file changed, 108 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index b06fee1978ba..79d35def7c79 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -1323,57 +1323,119 @@ static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
> return 0;
> }
>
> +static inline u64 get_selector(u64 ways, u64 gran)
> +{
> + if (!is_power_of_2(ways))
> + ways /= 3;
> +
> + if (!is_power_of_2(ways) || !is_power_of_2(gran))
> + return 0;
> +
> + return (ways - 1) * gran;
> +}
> +
> static int cxl_port_setup_targets(struct cxl_port *port,
> struct cxl_region *cxlr,
> struct cxl_endpoint_decoder *cxled)
> {
> struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
> - int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
> struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
> struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
> struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
> struct cxl_region_params *p = &cxlr->params;
> struct cxl_decoder *cxld = cxl_rr->decoder;
> - struct cxl_switch_decoder *cxlsd;
> + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(&cxld->dev);
> struct cxl_port *iter = port;
> - u16 eig, peig;
> - u8 eiw, peiw;
> + int ig, iw = cxl_rr->nr_targets, rc, inc = 0, pos = cxled->pos;
> + int distance, parent_distance;
> + u64 selector, cxlr_sel;
> + u16 eig;
> + u8 eiw;
>
Hi Robert,
The number of local variables suggest to me maybe it is worth to
refactor this code for simplicity, factoring out the selector and
distance calculation. You added a new helper above which should include
most of this, IMO.
Alejandro
> /*
> * While root level decoders support x3, x6, x12, switch level
> * decoders only support powers of 2 up to x16.
> */
> - if (!is_power_of_2(cxl_rr->nr_targets)) {
> + if (!is_power_of_2(iw)) {
> dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
> - dev_name(port->uport_dev), dev_name(&port->dev),
> - cxl_rr->nr_targets);
> + dev_name(port->uport_dev), dev_name(&port->dev), iw);
> return -EINVAL;
> }
>
> - cxlsd = to_cxl_switch_decoder(&cxld->dev);
> - if (cxl_rr->nr_targets_set) {
> - int i, distance = 1;
> - struct cxl_region_ref *cxl_rr_iter;
> + if (iw > 8 || iw > cxlsd->nr_targets) {
> + dev_dbg(&cxlr->dev,
> + "%s:%s:%s: ways: %d overflows targets: %d\n",
> + dev_name(port->uport_dev), dev_name(&port->dev),
> + dev_name(&cxld->dev), iw, cxlsd->nr_targets);
> + return -ENXIO;
> + }
>
> - /*
> - * The "distance" between peer downstream ports represents which
> - * endpoint positions in the region interleave a given port can
> - * host.
> - *
> - * For example, at the root of a hierarchy the distance is
> - * always 1 as every index targets a different host-bridge. At
> - * each subsequent switch level those ports map every Nth region
> - * position where N is the width of the switch == distance.
> - */
> - do {
> - cxl_rr_iter = cxl_rr_load(iter, cxlr);
> - distance *= cxl_rr_iter->nr_targets;
> - iter = to_cxl_port(iter->dev.parent);
> - } while (!is_cxl_root(iter));
> - distance *= cxlrd->cxlsd.cxld.interleave_ways;
> + /*
> + * Calculate the effective granularity and ways to determine
> + * HPA bits used as target selectors of the interleave set.
> + * Use this to check if the root decoder and all subsequent
> + * HDM decoders only use bits from that range as selectors.
> + *
> + * The "distance" between peer downstream ports represents which
> + * endpoint positions in the region interleave a given port can
> + * host.
> + *
> + * For example, at the root of a hierarchy the distance is
> + * always 1 as every index targets a different host-bridge. At
> + * each subsequent switch level those ports map every Nth region
> + * position where N is the width of the switch == distance.
> + */
> +
> + /* Start with the root decoders selector and distance. */
> + selector = get_selector(cxlrd->cxlsd.cxld.interleave_ways,
> + cxlrd->cxlsd.cxld.interleave_granularity);
> + distance = cxlrd->cxlsd.cxld.interleave_ways;
> + if (!is_power_of_2(distance))
> + distance /= 3;
> +
> + for (iter = parent_port; !is_cxl_root(iter);
> + iter = to_cxl_port(iter->dev.parent)) {
> + struct cxl_region_ref *cxl_rr_iter = cxl_rr_load(iter, cxlr);
> + struct cxl_decoder *cxld_iter = cxl_rr_iter->decoder;
> + u64 cxld_sel;
> +
> + if (cxld_iter->interleave_ways == 1)
> + continue;
> +
> + cxld_sel = get_selector(cxld_iter->interleave_ways,
> + cxld_iter->interleave_granularity);
> +
> + if (cxld_sel & selector) {
> + dev_dbg(&cxlr->dev, "%s:%s: overlapping selectors: %#llx:%#llx\n",
> + dev_name(iter->uport_dev),
> + dev_name(&iter->dev), cxld_sel, selector);
> + return -ENXIO;
> + }
>
> - for (i = 0; i < cxl_rr->nr_targets_set; i++)
> + selector |= cxld_sel;
> + distance *= cxl_rr_iter->nr_targets;
> + }
> +
> + parent_distance = distance;
> + distance *= iw;
> +
> + /* The combined selector bits must fit the region selector. */
> + cxlr_sel = get_selector(p->interleave_ways,
> + p->interleave_granularity);
> +
> + if ((cxlr_sel & selector) != selector) {
> + dev_dbg(&cxlr->dev, "%s:%s: invalid selectors: %#llx:%#llx\n",
> + dev_name(iter->uport_dev),
> + dev_name(&iter->dev), cxlr_sel, selector);
> + return -ENXIO;
> + }
> +
> + /* Calculate remaining selector bits available for use. */
> + selector = cxlr_sel & ~selector;
> +
> + if (cxl_rr->nr_targets_set) {
> + for (int i = 0; i < cxl_rr->nr_targets_set; i++)
> if (ep->dport == cxlsd->target[i]) {
> rc = check_last_peer(cxled, ep, cxl_rr,
> distance);
> @@ -1384,87 +1446,40 @@ static int cxl_port_setup_targets(struct cxl_port *port,
> goto add_target;
> }
>
> - if (is_cxl_root(parent_port)) {
> + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
> + ig = cxld->interleave_granularity;
> + else
> /*
> + * Set the interleave granularity with each interleave
> + * level to a multiple of it's parent port interleave
> + * ways. Beginning with the granularity of the root
> + * decoder set to the region granularity (starting
> + * with the inner selector bits of the HPA), the
> + * granularity is increased with each level. Calculate
> + * this using the parent distance and region
> + * granularity.
> + *
> * Root decoder IG is always set to value in CFMWS which
> * may be different than this region's IG. We can use the
> * region's IG here since interleave_granularity_store()
> * does not allow interleaved host-bridges with
> * root IG != region IG.
> */
> - parent_ig = p->interleave_granularity;
> - parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
> - /*
> - * For purposes of address bit routing, use power-of-2 math for
> - * switch ports.
> - */
> - if (!is_power_of_2(parent_iw))
> - parent_iw /= 3;
> - } else {
> - struct cxl_region_ref *parent_rr;
> - struct cxl_decoder *parent_cxld;
> -
> - parent_rr = cxl_rr_load(parent_port, cxlr);
> - parent_cxld = parent_rr->decoder;
> - parent_ig = parent_cxld->interleave_granularity;
> - parent_iw = parent_cxld->interleave_ways;
> - }
> -
> - rc = granularity_to_eig(parent_ig, &peig);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
> - dev_name(parent_port->uport_dev),
> - dev_name(&parent_port->dev), parent_ig);
> - return rc;
> - }
> -
> - rc = ways_to_eiw(parent_iw, &peiw);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
> - dev_name(parent_port->uport_dev),
> - dev_name(&parent_port->dev), parent_iw);
> - return rc;
> - }
> + ig = p->interleave_granularity * parent_distance;
>
> - iw = cxl_rr->nr_targets;
> rc = ways_to_eiw(iw, &eiw);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
> - dev_name(port->uport_dev), dev_name(&port->dev), iw);
> - return rc;
> - }
> -
> - /*
> - * Interleave granularity is a multiple of @parent_port granularity.
> - * Multiplier is the parent port interleave ways.
> - */
> - rc = granularity_to_eig(parent_ig * parent_iw, &eig);
> - if (rc) {
> - dev_dbg(&cxlr->dev,
> - "%s: invalid granularity calculation (%d * %d)\n",
> - dev_name(&parent_port->dev), parent_ig, parent_iw);
> - return rc;
> - }
> -
> - rc = eig_to_granularity(eig, &ig);
> - if (rc) {
> - dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
> - dev_name(port->uport_dev), dev_name(&port->dev),
> - 256 << eig);
> - return rc;
> - }
> + if (!rc)
> + rc = granularity_to_eig(ig, &eig);
>
> - if (iw > 8 || iw > cxlsd->nr_targets) {
> - dev_dbg(&cxlr->dev,
> - "%s:%s:%s: ways: %d overflows targets: %d\n",
> + if (rc || (iw > 1 && ~selector & get_selector(iw, ig))) {
> + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d:%d:%#llx\n",
> dev_name(port->uport_dev), dev_name(&port->dev),
> - dev_name(&cxld->dev), iw, cxlsd->nr_targets);
> + iw, ig, selector);
> return -ENXIO;
> }
>
> if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
> if (cxld->interleave_ways != iw ||
> - (iw > 1 && cxld->interleave_granularity != ig) ||
> !region_res_match_cxl_range(p, &cxld->hpa_range) ||
> ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
> dev_err(&cxlr->dev,
On Tue, 28 Oct 2025 10:47:53 +0100 Robert Richter <rrichter@amd.com> wrote: > The CXL specification supports multi-level interleaving "as long as > all the levels use different, but consecutive, HPA bits to select the > target and no Interleave Set has more than 8 devices" (from 3.2). > > Currently the kernel expects that a decoder's "interleave granularity > is a multiple of @parent_port granularity". That is, the granularity > of a lower level is bigger than those of the parent and uses the outer > HPA bits as selector. It works e.g. for the following 8-way config: > > * cross-link (cross-hostbridge config in CFMWS): > * 4-way > * 256 granularity > * Selector: HPA[8:9] > * sub-link (CXL Host bridge config of the HDM): > * 2-way > * 1024 granularity > * Selector: HPA[10] > > Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way > config could look like this: > > * cross-link (cross-hostbridge config in CFMWS): > * 4-way > * 512 granularity > * Selector: HPA[9:10] > * sub-link (CXL Host bridge config of the HDM): > * 2-way > * 256 granularity > * Selector: HPA[8] > > The enumeration of decoders for this configuration fails then with > following error: > > cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200] > cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff > cxl_port endpoint12: failed to attach decoder12.0 to region0: -6 > > Note that this happens only if firmware is setting up the decoders > (CXL_REGION_F_AUTO). For userspace region assembly the granularities > are chosen to increase from root down to the lower levels. That is, > outer HPA bits are always used for lower interleaving levels. > > Rework the implementation to also support multi-level interleaving > with smaller granularities for lower levels. Determine the interleave > set of autodetected decoders. Check that it is a subset of the root > interleave. > > The HPA selector bits are extracted for all decoders of the set and > checked that there is no overlap and bits are consecutive. All > decoders can be programmed now to use any bit range within the > region's target selector. > > Signed-off-by: Robert Richter <rrichter@amd.com> We debated this back when the original support was added. At the time the discussion pretty much concluded there was no reason anyone would ever actually do this because it mostly doesn't provide much/any performance benefit and might result in significantly worse performance by not spreading out accesses as early as possible. So do we have any idea why a bios is doing this? Something about the host interconnect? Thanks, Jonathan
On 28.10.25 14:27:27, Jonathan Cameron wrote: > On Tue, 28 Oct 2025 10:47:53 +0100 > Robert Richter <rrichter@amd.com> wrote: > > > The CXL specification supports multi-level interleaving "as long as > > all the levels use different, but consecutive, HPA bits to select the > > target and no Interleave Set has more than 8 devices" (from 3.2). > > > > Currently the kernel expects that a decoder's "interleave granularity > > is a multiple of @parent_port granularity". That is, the granularity > > of a lower level is bigger than those of the parent and uses the outer > > HPA bits as selector. It works e.g. for the following 8-way config: > > > > * cross-link (cross-hostbridge config in CFMWS): > > * 4-way > > * 256 granularity > > * Selector: HPA[8:9] > > * sub-link (CXL Host bridge config of the HDM): > > * 2-way > > * 1024 granularity > > * Selector: HPA[10] > > > > Now, if the outer HPA bits are used for the cross-hostbridge, an 8-way > > config could look like this: > > > > * cross-link (cross-hostbridge config in CFMWS): > > * 4-way > > * 512 granularity > > * Selector: HPA[9:10] > > * sub-link (CXL Host bridge config of the HDM): > > * 2-way > > * 256 granularity > > * Selector: HPA[8] > > > > The enumeration of decoders for this configuration fails then with > > following error: > > > > cxl region0: pci0000:00:port1 cxl_port_setup_targets expected iw: 2 ig: 1024 [mem 0x10000000000-0x1ffffffffff flags 0x200] > > cxl region0: pci0000:00:port1 cxl_port_setup_targets got iw: 2 ig: 256 state: enabled 0x10000000000:0x1ffffffffff > > cxl_port endpoint12: failed to attach decoder12.0 to region0: -6 > > > > Note that this happens only if firmware is setting up the decoders > > (CXL_REGION_F_AUTO). For userspace region assembly the granularities > > are chosen to increase from root down to the lower levels. That is, > > outer HPA bits are always used for lower interleaving levels. > > > > Rework the implementation to also support multi-level interleaving > > with smaller granularities for lower levels. Determine the interleave > > set of autodetected decoders. Check that it is a subset of the root > > interleave. > > > > The HPA selector bits are extracted for all decoders of the set and > > checked that there is no overlap and bits are consecutive. All > > decoders can be programmed now to use any bit range within the > > region's target selector. > > > > Signed-off-by: Robert Richter <rrichter@amd.com> > > We debated this back when the original support was added. At the time the > discussion pretty much concluded there was no reason anyone would ever actually > do this because it mostly doesn't provide much/any performance benefit and > might result in significantly worse performance by not spreading out accesses as > early as possible. > > So do we have any idea why a bios is doing this? Something about the > host interconnect? There could be platform constraints that require the cross-hostbridge logic to have bigger granularities. The sublinks have then a smaller granularity. There are existing platforms with that config that would be broken otherwise. It is not possible to handle this differently by the bios and reconfigure that. -Robert
© 2016 - 2026 Red Hat, Inc.