[PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation

Robert Richter posted 11 patches 2 weeks, 6 days ago
[PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Robert Richter 2 weeks, 6 days ago
Systems that need address translation have the endpoint decoders
programmed for a different address space. Host physical addresses
(HPA) are different from their system physical addresses (SPA). The
decoder's address range and interleaving configuration of such
endpoints cannot be used to determine the region parameters. The
region's address range must be SPA which the decoder does not
provide. In addition, an endpoint's incoming HPA is already converted
to the device's physical address (DPA). Thus it has interleaving
disabled.

Address translation may provide different ways to determine an
endpoint's SPA, e.g. it may support a firmware call. This allows the
determination of the region's parameters without inspecting the
endpoint decoders.

Implement the setup of address translation given there is a function
to convert an endpoint's HPA (which is identical to its DPA) to an
SPA. Use the previously introduced cxl_to_hpa_fn callback for this.
Convert the decoder's address range and ensure it is 256MB aligned.

Identify the region's interleaving ways by inspecting the address
ranges. Also determine the interleaving granularity using the address
translation callback. Note that the position of the chunk from one
interleaving block to the next may vary and thus cannot be considered
constant. Address offsets larger than the interleaving block size
cannot be used to calculate the granularity. Thus, probe the
granularity using address translation for various HPAs in the same
interleaving block.

Note that this patch does not yet enable address translation as
callbacks have not been initialized.

Signed-off-by: Robert Richter <rrichter@amd.com>
---
 drivers/cxl/core/region.c | 95 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 94 insertions(+), 1 deletion(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 57697504410b..9fb1e9508213 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3422,16 +3422,109 @@ struct cxl_region_context {
 	int interleave_granularity;
 };
 
+static int setup_address_translation(struct cxl_endpoint_decoder *cxled,
+				     struct cxl_region_context *ctx)
+{
+	struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent->parent);
+	struct cxl_decoder *cxld = &cxled->cxld;
+	struct range range = ctx->hpa_range;
+	u64 spa_len, len = range_len(&range);
+	u64 addr, base = range.start;
+	int ways, gran;
+
+	if (!len || !port->to_hpa)
+		return 0;
+
+	if (!IS_ALIGNED(range.start, SZ_256M) ||
+	    !IS_ALIGNED(range.end + 1, SZ_256M)) {
+		dev_warn(&port->dev,
+			"CXL address translation: Unaligned decoder HPA range: %#llx-%#llx(%s)\n",
+			range.start, range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	/* Translate HPA range to SPA. */
+	range.start = port->to_hpa(cxld, range.start);
+	range.end = port->to_hpa(cxld, range.end);
+
+	if (range.start == ULLONG_MAX || range.end == ULLONG_MAX) {
+		dev_warn(&port->dev,
+			"CXL address translation: Failed to translate HPA range: %#llx-%#llx:%#llx-%#llx(%s)\n",
+			range.start, range.end, ctx->hpa_range.start,
+			ctx->hpa_range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	/*
+	 * Since translated addresses include the interleaving
+	 * offsets, align the range to 256 MB.
+	 */
+	range.start = ALIGN_DOWN(range.start, SZ_256M);
+	range.end = ALIGN(range.end, SZ_256M) - 1;
+
+	spa_len = range_len(&range);
+	if (!len || !spa_len || spa_len % len) {
+		dev_warn(&port->dev,
+			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
+			range.start, range.end, ctx->hpa_range.start,
+			ctx->hpa_range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	ways = spa_len / len;
+	gran = SZ_256;
+
+	/*
+	 * Determine interleave granularity
+	 *
+	 * Note: The position of the chunk from one interleaving block
+	 * to the next may vary and thus cannot be considered
+	 * constant. Address offsets larger than the interleaving
+	 * block size cannot be used to calculate the granularity.
+	 */
+	while (ways > 1 && gran <= SZ_16M) {
+		addr = port->to_hpa(cxld, base + gran);
+		if (addr != base + gran)
+			break;
+		gran <<= 1;
+	}
+
+	if (gran > SZ_16M) {
+		dev_warn(&port->dev,
+			"CXL address translation: Cannot determine granularity: %#llx-%#llx:%#llx-%#llx(%s)\n",
+			range.start, range.end, ctx->hpa_range.start,
+			ctx->hpa_range.end, dev_name(&cxld->dev));
+		return -ENXIO;
+	}
+
+	ctx->hpa_range = range;
+	ctx->interleave_ways = ways;
+	ctx->interleave_granularity = gran;
+
+	dev_dbg(&cxld->dev,
+		"address mapping found for %s (hpa -> spa): %#llx+%#llx -> %#llx+%#llx ways:%d granularity:%d\n",
+		dev_name(ctx->cxlmd->dev.parent), base, len, range.start,
+		spa_len, ways, gran);
+
+	return 0;
+}
+
 static int setup_region_params(struct cxl_endpoint_decoder *cxled,
 			       struct cxl_region_context *ctx)
 {
+	int rc;
+
 	ctx->cxled = cxled;
 	ctx->cxlmd = cxled_to_memdev(cxled);
 	ctx->hpa_range = cxled->cxld.hpa_range;
 	ctx->interleave_ways = cxled->cxld.interleave_ways;
 	ctx->interleave_granularity = cxled->cxld.interleave_granularity;
 
-	return 0;
+	rc = setup_address_translation(cxled, ctx);
+	if (rc)
+		return rc;
+
+	return rc;
 }
 
 static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
-- 
2.39.5
Re: [PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Jonathan Cameron 2 weeks, 3 days ago
On Fri, 12 Sep 2025 16:45:10 +0200
Robert Richter <rrichter@amd.com> wrote:

> Systems that need address translation have the endpoint decoders
> programmed for a different address space. Host physical addresses
> (HPA) are different from their system physical addresses (SPA). The
> decoder's address range and interleaving configuration of such
> endpoints cannot be used to determine the region parameters. The
> region's address range must be SPA which the decoder does not
> provide. In addition, an endpoint's incoming HPA is already converted
> to the devices physical address (DPA). Thus it has interleaving
> disabled.
> 
> Address translation may provide different ways to determine an
> endpoint's SPA, e.g. it may support a firmware call. This allows the
> determination of the region's parameters without inspecting the
> endpoint decoders.
> 
> Implement the setup of address translation given there is a function
> to convert an endpoint's HPA (which is identical to its DPA) to an
> SPA. Use the previously introduced cxl_to_hpa_fn callback for this.
> Convert the decoder's address range and ensure it is 256MB aligned.
> 
> Identify the region's interleaving ways by inspecting the address
> ranges. Also determine the interleaving granularity using the address
> translation callback. Note that the position of the chunk from one
> interleaving block to the next may vary and thus cannot be considered
> constant. Address offsets larger than the interleaving block size
> cannot be used to calculate the granularity. Thus, probe the
> granularity using address translation for various HPAs in the same
> interleaving block.
> 
> Note that this patch does not yet enable address translation as
> callbacks have not been initialized.
> 
> Signed-off-by: Robert Richter <rrichter@amd.com>
> ---
>  drivers/cxl/core/region.c | 95 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 94 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 57697504410b..9fb1e9508213 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3422,16 +3422,109 @@ struct cxl_region_context {
>  	int interleave_granularity;
>  };
>  
> +static int setup_address_translation(struct cxl_endpoint_decoder *cxled,
> +				     struct cxl_region_context *ctx)
> +{
> +	struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent->parent);

When there is a parent->parent it always makes me nervous that I haven't
reasoned out what port this actually is. A comment would help or
a more specific macro where the name lets us know what we are getting.

> +	struct cxl_decoder *cxld = &cxled->cxld;
> +	struct range range = ctx->hpa_range;
> +	u64 spa_len, len = range_len(&range);
> +	u64 addr, base = range.start;
> +	int ways, gran;
> +
> +	if (!len || !port->to_hpa)
> +		return 0;
> +
> +	if (!IS_ALIGNED(range.start, SZ_256M) ||
> +	    !IS_ALIGNED(range.end + 1, SZ_256M)) {
> +		dev_warn(&port->dev,
> +			"CXL address translation: Unaligned decoder HPA range: %#llx-%#llx(%s)\n",
> +			range.start, range.end, dev_name(&cxld->dev));
> +		return -ENXIO;
> +	}
> +
> +	/* Translate HPA range to SPA. */
> +	range.start = port->to_hpa(cxld, range.start);

This is where the generic naming as 'range' gets really confusing.
hpa_range etc., with a separate struct range for each, would definitely help.

For the checks and inputs maybe just use ctx->hpa_range directly.


> +	range.end = port->to_hpa(cxld, range.end);
Perhaps use the DEFINE_RANGE macro or 
	range = (struct range) {
		.start = ...
style as per earlier patches.

> +
> +	if (range.start == ULLONG_MAX || range.end == ULLONG_MAX) {
> +		dev_warn(&port->dev,
> +			"CXL address translation: Failed to translate HPA range: %#llx-%#llx:%#llx-%#llx(%s)\n",
> +			range.start, range.end, ctx->hpa_range.start,
> +			ctx->hpa_range.end, dev_name(&cxld->dev));
> +		return -ENXIO;
> +	}
> +
> +	/*
> +	 * Since translated addresses include the interleaving
> +	 * offsets, align the range to 256 MB.

So we pass in an HPA range without interleaving offsets and get back
one with them?  Is that unavoidable, or can we potentially push
this bit into the callback?  Probably with separate callbacks to
get the interleave details.

Overall I'm not really following what is going on here.  Maybe
some ascii art would help?

> +	 */
> +	range.start = ALIGN_DOWN(range.start, SZ_256M);
> +	range.end = ALIGN(range.end, SZ_256M) - 1;
> +
> +	spa_len = range_len(&range);
> +	if (!len || !spa_len || spa_len % len) {
> +		dev_warn(&port->dev,
> +			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
> +			range.start, range.end, ctx->hpa_range.start,
> +			ctx->hpa_range.end, dev_name(&cxld->dev));
> +		return -ENXIO;
> +	}
> +
> +	ways = spa_len / len;
> +	gran = SZ_256;
> +
> +	/*
> +	 * Determine interleave granularity
> +	 *
> +	 * Note: The position of the chunk from one interleaving block
> +	 * to the next may vary and thus cannot be considered
> +	 * constant. Address offsets larger than the interleaving
> +	 * block size cannot be used to calculate the granularity.
> +	 */
> +	while (ways > 1 && gran <= SZ_16M) {
> +		addr = port->to_hpa(cxld, base + gran);
> +		if (addr != base + gran)
> +			break;
> +		gran <<= 1;
> +	}
> +
> +	if (gran > SZ_16M) {
> +		dev_warn(&port->dev,
> +			"CXL address translation: Cannot determine granularity: %#llx-%#llx:%#llx-%#llx(%s)\n",
> +			range.start, range.end, ctx->hpa_range.start,
> +			ctx->hpa_range.end, dev_name(&cxld->dev));
> +		return -ENXIO;
> +	}
> +
> +	ctx->hpa_range = range;
> +	ctx->interleave_ways = ways;
> +	ctx->interleave_granularity = gran;
> +
> +	dev_dbg(&cxld->dev,
> +		"address mapping found for %s (hpa -> spa): %#llx+%#llx -> %#llx+%#llx ways:%d granularity:%d\n",
> +		dev_name(ctx->cxlmd->dev.parent), base, len, range.start,
> +		spa_len, ways, gran);
> +
> +	return 0;
> +}
> +
>  static int setup_region_params(struct cxl_endpoint_decoder *cxled,
>  			       struct cxl_region_context *ctx)
>  {
> +	int rc;
> +
>  	ctx->cxled = cxled;
>  	ctx->cxlmd = cxled_to_memdev(cxled);
>  	ctx->hpa_range = cxled->cxld.hpa_range;
>  	ctx->interleave_ways = cxled->cxld.interleave_ways;
>  	ctx->interleave_granularity = cxled->cxld.interleave_granularity;
>  
> -	return 0;
> +	rc = setup_address_translation(cxled, ctx);

A quick search suggested nothing new gets added after this. As such
	return setup_address_translation(...);
is probably appropriate here.


> +	if (rc)
> +		return rc;
> +
> +	return rc;
>  }
>  
>  static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
Re: [PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Gregory Price 2 weeks ago
On Mon, Sep 15, 2025 at 11:46:14AM +0100, Jonathan Cameron wrote:
> > +	/*
> > +	 * Since translated addresses include the interleaving
> > +	 * offsets, align the range to 256 MB.
> 
> So we pass in an HPA range without interleaving offsets and get back
> one with them?  Is that unavoidable, or can we potentially push
> this bit into the callback?  Probably with separate callbacks to
> get the interleave details.
> 
> Overall I'm not really following what is going on here.  Maybe
> some ascii art would help?
>

The endpoints in this case are programmed with "normalized" (base-0)
addresses and a size covering only the memory they provide. As a result,
the decoder interleave settings will always be passthrough (iw=1,
ig=ignored).

This chunk translates the normalized address region to the relevant SPA
region, and translates the IW/IG to what it actually is (i.e. what it 
*would have* been on a "normal" system).

It took me a while to work this out when I originally reviewed and tested
this set.

Example - this is how you'd expect a real system supported by this code
to be programmed:

region {
    .start = 0x20000000
    .end   = 0x3fffffff
    .iw    = 2
    .ig    = 256
}

endpoint1_decoder {
    .start = 0x0
    .end   = 0xfffffff
    .iw    = 1
    .ig    = 256
}

endpoint2_decoder {
    .start = 0x0
    .end   = 0xfffffff
    .iw    = 1
    .ig    = 256
}

when you do the translation from either decoder's hpa start/end,
you want the following output:

range {
    .start = 0x20000000
    .end   = 0x3fffffff
    .iw    = 2
    .ig    = 256
}

If you assume a "normal" system, these are the settings the decoders
would have been programmed with in the first place.

You have to do the alignment because the translation function may
only work at granularity alignment.

Example:
endpoint1->to_hpa(0)         => 0x0
endpoint1->to_hpa(0xfffffff) => 0xffffe00
endpoint2->to_hpa(0)         => 0x100
endpoint2->to_hpa(0xfffffff) => 0xfffff00

So this code applies the appropriate alignment and returns the
translated iw/ig for use elsewhere in the stack when validating the rest
of the decoders.

(haven't gotten to later commits, but iirc it was eventually used)

~Gregory

> > +	 */
> > +	range.start = ALIGN_DOWN(range.start, SZ_256M);
> > +	range.end = ALIGN(range.end, SZ_256M) - 1;
> > +
> > +	spa_len = range_len(&range);
> > +	if (!len || !spa_len || spa_len % len) {
> > +		dev_warn(&port->dev,
> > +			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
> > +			range.start, range.end, ctx->hpa_range.start,
> > +			ctx->hpa_range.end, dev_name(&cxld->dev));
> > +		return -ENXIO;
> > +	}
> > +
> > +	ways = spa_len / len;
> > +	gran = SZ_256;
> > +
Re: [PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Gregory Price 2 weeks ago
On Wed, Sep 17, 2025 at 04:51:37PM -0400, Gregory Price wrote:
> You have to do the alignment because the translation function (may)
> only work on granularity alignment.
> 
> Example:
> endpoint1->to_hpa(0)         => 0x0
> endpoint1->to_hpa(0xfffffff) => 0xffffe00
                                  0x3ffffe00
> endpoint2->to_hpa(0)         => 0x100
> endpoint2->to_hpa(0xfffffff) => 0xfffff00
                                  0x3fffff00

minor corrections above if intending to use for documentation
Re: [PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Dave Jiang 2 weeks ago

On 9/17/25 1:51 PM, Gregory Price wrote:
> On Mon, Sep 15, 2025 at 11:46:14AM +0100, Jonathan Cameron wrote:
>>> +	/*
>>> +	 * Since translated addresses include the interleaving
>>> +	 * offsets, align the range to 256 MB.
>>
>> So we pass in an HPA range without interleaving offsets and get back
>> one with them?  Is that unavoidable, or can we potentially push
>> this bit into the callback?  Probably with separate callbacks to
>> get the interleave details.
>>
>> Overall I'm not really following what is going on here.  Maybe
>> some ascii art would help?
>>
> 
> The endpoints in this case are encoded with "normalized" (base-0) with
> a size of only the memory they provide. As a result, the decoder
> interleave settings will always be passthrough (iw=1, ig=ignored).
> 
> This chunk translates the normalized address region to the relevant SPA
> region, and translates the IW/IG to what it actually is (i.e. what it 
> *would have* been on a "normal" system).
> 
> Took me a while when i originally reviewed and tested this set.
> 
> Example - this is how you'd expect a real system supported by this code
> to be programmed:
> 
> region {
>     .start = 0x20000000
>     .end   = 0x3fffffff
>     .iw    = 2
>     .ig    = 256
> }
> 
> endpoint1_decoder {
>     .start = 0x0
>     .end   = 0xfffffff
>     .iw    = 1
>     .ig    = 256
> }
> 
> endpoint2_decoder {
>     .start = 0x0
>     .end   = 0xfffffff
>     .iw    = 1
>     .ig    = 256
> }
> 
> when you do the translation from either decoder's hpa start/end,
> you want the following output:
> 
> range {
>     .start = 0x20000000
>     .end   = 0x3fffffff
>     .iw    = 2
>     .ig    = 256
> }
> 
> If you assume a "normal" system - this is the settings the decoders
> would have been programmed with in the first place.
> 
> You have to do the alignment because the translation function (may)
> only work on granularity alignment.
> 
> Example:
> endpoint1->to_hpa(0)         => 0x0
> endpoint1->to_hpa(0xfffffff) => 0xffffe00
> endpoint2->to_hpa(0)         => 0x100
> endpoint2->to_hpa(0xfffffff) => 0xfffff00
> 
> So this code applies the appropriate alignment and returns the
> translated iw/ig for use elsewhere in the stack when validating the rest
> of the decoders.

It would be good to have this explanation added to the Conventions document.

> 
> (haven't gotten to later commits, but iirc it was eventually used)
> 
> ~Gregory
> 
>>> +	 */
>>> +	range.start = ALIGN_DOWN(range.start, SZ_256M);
>>> +	range.end = ALIGN(range.end, SZ_256M) - 1;
>>> +
>>> +	spa_len = range_len(&range);
>>> +	if (!len || !spa_len || spa_len % len) {
>>> +		dev_warn(&port->dev,
>>> +			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
>>> +			range.start, range.end, ctx->hpa_range.start,
>>> +			ctx->hpa_range.end, dev_name(&cxld->dev));
>>> +		return -ENXIO;
>>> +	}
>>> +
>>> +	ways = spa_len / len;
>>> +	gran = SZ_256;
>>> +
Re: [PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Robert Richter 2 weeks, 1 day ago
On 15.09.25 11:46:14, Jonathan Cameron wrote:
> On Fri, 12 Sep 2025 16:45:10 +0200
> Robert Richter <rrichter@amd.com> wrote:
> 
> > Systems that need address translation have the endpoint decoders
> > programmed for a different address space. Host physical addresses
> > (HPA) are different from their system physical addresses (SPA). The
> > decoder's address range and interleaving configuration of such
> > endpoints cannot be used to determine the region parameters. The
> > region's address range must be SPA which the decoder does not
> > provide. In addition, an endpoint's incoming HPA is already converted
> > to the devices physical address (DPA). Thus it has interleaving
> > disabled.
> > 
> > Address translation may provide different ways to determine an
> > endpoint's SPA, e.g. it may support a firmware call. This allows the
> > determination of the region's parameters without inspecting the
> > endpoint decoders.
> > 
> > Implement the setup of address translation given there is a function
> > to convert an endpoint's HPA (which is identical to its DPA) to an
> > SPA. Use the previously introduced cxl_to_hpa_fn callback for this.
> > Convert the decoder's address range and ensure it is 256MB aligned.
> > 
> > Identify the region's interleaving ways by inspecting the address
> > ranges. Also determine the interleaving granularity using the address
> > translation callback. Note that the position of the chunk from one
> > interleaving block to the next may vary and thus cannot be considered
> > constant. Address offsets larger than the interleaving block size
> > cannot be used to calculate the granularity. Thus, probe the
> > granularity using address translation for various HPAs in the same
> > interleaving block.
> > 
> > Note that this patch does not yet enable address translation as
> > callbacks have not been initialized.
> > 
> > Signed-off-by: Robert Richter <rrichter@amd.com>
> > ---
> >  drivers/cxl/core/region.c | 95 ++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 94 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index 57697504410b..9fb1e9508213 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -3422,16 +3422,109 @@ struct cxl_region_context {
> >  	int interleave_granularity;
> >  };
> >  
> > +static int setup_address_translation(struct cxl_endpoint_decoder *cxled,
> > +				     struct cxl_region_context *ctx)
> > +{
> > +	struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent->parent);
> 
> When there is a parent->parent it always makes me nervous that I haven't
> reasoned out what port this actually is. A comment would help or
> a more specific macro where the name lets us know what we are getting.

Yes, will improve that. Since the implementation will be changed to
be more specific and only translate cxled -> host_bridge, this section
will become more readable as well.

> 
> > +	struct cxl_decoder *cxld = &cxled->cxld;
> > +	struct range range = ctx->hpa_range;
> > +	u64 spa_len, len = range_len(&range);
> > +	u64 addr, base = range.start;
> > +	int ways, gran;
> > +
> > +	if (!len || !port->to_hpa)
> > +		return 0;
> > +
> > +	if (!IS_ALIGNED(range.start, SZ_256M) ||
> > +	    !IS_ALIGNED(range.end + 1, SZ_256M)) {
> > +		dev_warn(&port->dev,
> > +			"CXL address translation: Unaligned decoder HPA range: %#llx-%#llx(%s)\n",
> > +			range.start, range.end, dev_name(&cxld->dev));
> > +		return -ENXIO;
> > +	}
> > +
> > +	/* Translate HPA range to SPA. */
> > +	range.start = port->to_hpa(cxld, range.start);
> 
> This is where the generic naming as 'range' gets really confusing.
> hpa_range etc with separate struct range for each would definitely help
> 
> For the checks and inputs maybe just use ctx->hpa_range directly.
> 
> 
> > +	range.end = port->to_hpa(cxld, range.end);
> Perhaps use the DEFINE_RANGE macro or 
> 	range = (struct range) {
> 		.start = ...
> style as per earlier patches.

Ok.


> 
> > +
> > +	if (range.start == ULLONG_MAX || range.end == ULLONG_MAX) {
> > +		dev_warn(&port->dev,
> > +			"CXL address translation: Failed to translate HPA range: %#llx-%#llx:%#llx-%#llx(%s)\n",
> > +			range.start, range.end, ctx->hpa_range.start,
> > +			ctx->hpa_range.end, dev_name(&cxld->dev));
> > +		return -ENXIO;
> > +	}
> > +
> > +	/*
> > +	 * Since translated addresses include the interleaving
> > +	 * offsets, align the range to 256 MB.
> 
> So we pass in an HPA range without interleaving offsets and get back
> one with them?  Is that unavoidable, or can we potentially push
> this bit into the callback?  Probably with separate callbacks to
> get the interleave details.

While the translation is used here to get the HPA range for the whole
region regardless of the specific endpoint, the call should also
provide translated addresses of the endpoint, especially for later use
in tracing and error reporting. As this function extracts the range, do
the alignment here too and not in the callback.

> 
> Overall I'm not really following what is going on here.  Maybe
> some ascii art would help?

Uh, how about this:

   ___ Start of region
  /                                                  End of region ___
 /                                                                    \
|----------------------------------------------------------------------|
| chunk 1 | chunk 2 | ... |  ................ | chunk1 | chunk 2 | ... |
|----------------------------------------------------------------------|
\          \                                 /        /
 \          \___ Start HPA EP2              /        /
  \___  Start HPA EP1                      /        /
                          End HPA EP1 ____/        /
                                  End HPA EP2 ____/

As regions are 256MB aligned, use that alignment instead of the
gran * ways block size.

> 
> > +	 */
> > +	range.start = ALIGN_DOWN(range.start, SZ_256M);
> > +	range.end = ALIGN(range.end, SZ_256M) - 1;
> > +
> > +	spa_len = range_len(&range);
> > +	if (!len || !spa_len || spa_len % len) {
> > +		dev_warn(&port->dev,
> > +			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
> > +			range.start, range.end, ctx->hpa_range.start,
> > +			ctx->hpa_range.end, dev_name(&cxld->dev));
> > +		return -ENXIO;
> > +	}
> > +
> > +	ways = spa_len / len;
> > +	gran = SZ_256;
> > +
> > +	/*
> > +	 * Determine interleave granularity
> > +	 *
> > +	 * Note: The position of the chunk from one interleaving block
> > +	 * to the next may vary and thus cannot be considered
> > +	 * constant. Address offsets larger than the interleaving
> > +	 * block size cannot be used to calculate the granularity.
> > +	 */
> > +	while (ways > 1 && gran <= SZ_16M) {
> > +		addr = port->to_hpa(cxld, base + gran);
> > +		if (addr != base + gran)
> > +			break;
> > +		gran <<= 1;
> > +	}
> > +
> > +	if (gran > SZ_16M) {
> > +		dev_warn(&port->dev,
> > +			"CXL address translation: Cannot determine granularity: %#llx-%#llx:%#llx-%#llx(%s)\n",
> > +			range.start, range.end, ctx->hpa_range.start,
> > +			ctx->hpa_range.end, dev_name(&cxld->dev));
> > +		return -ENXIO;
> > +	}
> > +
> > +	ctx->hpa_range = range;
> > +	ctx->interleave_ways = ways;
> > +	ctx->interleave_granularity = gran;
> > +
> > +	dev_dbg(&cxld->dev,
> > +		"address mapping found for %s (hpa -> spa): %#llx+%#llx -> %#llx+%#llx ways:%d granularity:%d\n",
> > +		dev_name(ctx->cxlmd->dev.parent), base, len, range.start,
> > +		spa_len, ways, gran);
> > +
> > +	return 0;
> > +}
> > +
> >  static int setup_region_params(struct cxl_endpoint_decoder *cxled,
> >  			       struct cxl_region_context *ctx)
> >  {
> > +	int rc;
> > +
> >  	ctx->cxled = cxled;
> >  	ctx->cxlmd = cxled_to_memdev(cxled);
> >  	ctx->hpa_range = cxled->cxld.hpa_range;
> >  	ctx->interleave_ways = cxled->cxld.interleave_ways;
> >  	ctx->interleave_granularity = cxled->cxld.interleave_granularity;
> >  
> > -	return 0;
> > +	rc = setup_address_translation(cxled, ctx);
> 
> A quick search suggested nothing new gets added after this. As such
> 	return setup_address_translation(...);
> is probably appropriate here.
> 
> 
> > +	if (rc)
> > +		return rc;
> > +
> > +	return rc;

I considered a tail call here too but dropped that idea, as it would
suggest the function should not be extended. The function is meant to
be open to extending the setup, e.g. for Low Memory Hole handling.
However, there are advantages to both approaches and I can change that.

Thanks for review,

-Robert

> >  }
> >  
> >  static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
>
Re: [PATCH v3 08/11] cxl/region: Implement endpoint decoder address translation
Posted by Dave Jiang 2 weeks, 3 days ago

On 9/15/25 3:46 AM, Jonathan Cameron wrote:
> On Fri, 12 Sep 2025 16:45:10 +0200
> Robert Richter <rrichter@amd.com> wrote:
> 
>> Systems that need address translation have the endpoint decoders
>> programmed for a different address space. Host physical addresses
>> (HPA) are different from their system physical addresses (SPA). The
>> decoder's address range and interleaving configuration of such
>> endpoints cannot be used to determine the region parameters. The
>> region's address range must be SPA which the decoder does not
>> provide. In addition, an endpoint's incoming HPA is already converted
>> to the devices physical address (DPA). Thus it has interleaving
>> disabled.
>>
>> Address translation may provide different ways to determine an
>> endpoint's SPA, e.g. it may support a firmware call. This allows the
>> determination of the region's parameters without inspecting the
>> endpoint decoders.
>>
>> Implement the setup of address translation given there is a function
>> to convert an endpoint's HPA (which is identical to its DPA) to an
>> SPA. Use the previously introduced cxl_to_hpa_fn callback for this.
>> Convert the decoder's address range and ensure it is 256MB aligned.
>>
>> Identify the region's interleaving ways by inspecting the address
>> ranges. Also determine the interleaving granularity using the address
>> translation callback. Note that the position of the chunk from one
>> interleaving block to the next may vary and thus cannot be considered
>> constant. Address offsets larger than the interleaving block size
>> cannot be used to calculate the granularity. Thus, probe the
>> granularity using address translation for various HPAs in the same
>> interleaving block.
>>
>> Note that this patch does not yet enable address translation as
>> callbacks have not been initialized.
>>
>> Signed-off-by: Robert Richter <rrichter@amd.com>
>> ---
>>  drivers/cxl/core/region.c | 95 ++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 94 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 57697504410b..9fb1e9508213 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -3422,16 +3422,109 @@ struct cxl_region_context {
>>  	int interleave_granularity;
>>  };
>>  
>> +static int setup_address_translation(struct cxl_endpoint_decoder *cxled,
>> +				     struct cxl_region_context *ctx)
>> +{
>> +	struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent->parent);
> 
> When there is a parent->parent it always makes me nervous that I haven't
> reasoned out what port this actually is. A comment would help or
> a more specific macro where the name lets us know what we are getting.

I was also going to suggest renaming 'port' to 'parent_port' to make it clear what it is.

DJ

> 
>> +	struct cxl_decoder *cxld = &cxled->cxld;
>> +	struct range range = ctx->hpa_range;
>> +	u64 spa_len, len = range_len(&range);
>> +	u64 addr, base = range.start;
>> +	int ways, gran;
>> +
>> +	if (!len || !port->to_hpa)
>> +		return 0;
>> +
>> +	if (!IS_ALIGNED(range.start, SZ_256M) ||
>> +	    !IS_ALIGNED(range.end + 1, SZ_256M)) {
>> +		dev_warn(&port->dev,
>> +			"CXL address translation: Unaligned decoder HPA range: %#llx-%#llx(%s)\n",
>> +			range.start, range.end, dev_name(&cxld->dev));
>> +		return -ENXIO;
>> +	}
>> +
>> +	/* Translate HPA range to SPA. */
>> +	range.start = port->to_hpa(cxld, range.start);
> 
> This is where the generic naming as 'range' gets really confusing.
> hpa_range etc with separate struct range for each would definitely help
> 
> For the checks and inputs maybe just use ctx->hpa_range directly.
> 
> 
>> +	range.end = port->to_hpa(cxld, range.end);
> Perhaps use the DEFINE_RANGE macro or 
> 	range = (struct range) {
> 		.start = ...
> style as per earlier patches.
> 
>> +
>> +	if (range.start == ULLONG_MAX || range.end == ULLONG_MAX) {
>> +		dev_warn(&port->dev,
>> +			"CXL address translation: Failed to translate HPA range: %#llx-%#llx:%#llx-%#llx(%s)\n",
>> +			range.start, range.end, ctx->hpa_range.start,
>> +			ctx->hpa_range.end, dev_name(&cxld->dev));
>> +		return -ENXIO;
>> +	}
>> +
>> +	/*
>> +	 * Since translated addresses include the interleaving
>> +	 * offsets, align the range to 256 MB.
> 
> So we pass in an HPA range without interleaving offsets and get back
> one with them?  Is that unavoidable, or can we potentially push
> this bit into the callback?  Probably with separate callbacks to
> get the interleave details.
> 
> Overall I'm not really following what is going on here.  Maybe
> some ascii art would help?
> 
>> +	 */
>> +	range.start = ALIGN_DOWN(range.start, SZ_256M);
>> +	range.end = ALIGN(range.end, SZ_256M) - 1;
>> +
>> +	spa_len = range_len(&range);
>> +	if (!len || !spa_len || spa_len % len) {
>> +		dev_warn(&port->dev,
>> +			"CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n",
>> +			range.start, range.end, ctx->hpa_range.start,
>> +			ctx->hpa_range.end, dev_name(&cxld->dev));
>> +		return -ENXIO;
>> +	}
>> +
>> +	ways = spa_len / len;
>> +	gran = SZ_256;
>> +
>> +	/*
>> +	 * Determine interleave granularity
>> +	 *
>> +	 * Note: The position of the chunk from one interleaving block
>> +	 * to the next may vary and thus cannot be considered
>> +	 * constant. Address offsets larger than the interleaving
>> +	 * block size cannot be used to calculate the granularity.
>> +	 */
>> +	while (ways > 1 && gran <= SZ_16M) {
>> +		addr = port->to_hpa(cxld, base + gran);
>> +		if (addr != base + gran)
>> +			break;
>> +		gran <<= 1;
>> +	}
>> +
>> +	if (gran > SZ_16M) {
>> +		dev_warn(&port->dev,
>> +			"CXL address translation: Cannot determine granularity: %#llx-%#llx:%#llx-%#llx(%s)\n",
>> +			range.start, range.end, ctx->hpa_range.start,
>> +			ctx->hpa_range.end, dev_name(&cxld->dev));
>> +		return -ENXIO;
>> +	}
>> +
>> +	ctx->hpa_range = range;
>> +	ctx->interleave_ways = ways;
>> +	ctx->interleave_granularity = gran;
>> +
>> +	dev_dbg(&cxld->dev,
>> +		"address mapping found for %s (hpa -> spa): %#llx+%#llx -> %#llx+%#llx ways:%d granularity:%d\n",
>> +		dev_name(ctx->cxlmd->dev.parent), base, len, range.start,
>> +		spa_len, ways, gran);
>> +
>> +	return 0;
>> +}
>> +
>>  static int setup_region_params(struct cxl_endpoint_decoder *cxled,
>>  			       struct cxl_region_context *ctx)
>>  {
>> +	int rc;
>> +
>>  	ctx->cxled = cxled;
>>  	ctx->cxlmd = cxled_to_memdev(cxled);
>>  	ctx->hpa_range = cxled->cxld.hpa_range;
>>  	ctx->interleave_ways = cxled->cxld.interleave_ways;
>>  	ctx->interleave_granularity = cxled->cxld.interleave_granularity;
>>  
>> -	return 0;
>> +	rc = setup_address_translation(cxled, ctx);
> 
> A quick search suggested nothing new gets added after this. As such
> 	return setup_address_translation(...);
> is probably appropriate here.
> 
> 
>> +	if (rc)
>> +		return rc;
>> +
>> +	return rc;
>>  }
>>  
>>  static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
> 
>