[PATCH] DAX: warn when kmem regions are truncated for memory block alignment.

Gregory Price posted 1 patch 10 months, 3 weeks ago
There is a newer version of this series
drivers/dax/kmem.c | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
[PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 3 weeks ago
Device capacity intended for use as system ram should be aligned to the
architecture-defined memory block size or that capacity will be silently
truncated and capacity stranded.

As hotplug dax memory becomes more prevelant, the memory block size
alignment becomes more important for platform and device vendors to
pay attention to - so this truncation should not be silent.

This issue is particularly relevant for CXL Dynamic Capacity devices,
whose capacity may arrive in spec-aligned but block-misaligned chunks.

Example:
 [...] kmem dax0.0: dax region truncated 2684354560 bytes - alignment
 [...] kmem dax1.0: dax region truncated 1610612736 bytes - alignment

Signed-off-by: Gregory Price <gourry@gourry.net>
---
 drivers/dax/kmem.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index e97d47f42ee2..15b6807b703d 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -28,7 +28,8 @@ static const char *kmem_name;
 /* Set if any memory will remain added when the driver will be unloaded. */
 static bool any_hotremove_failed;
 
-static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
+static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r,
+			  unsigned long *truncated)
 {
 	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
 	struct range *range = &dax_range->range;
@@ -41,6 +42,9 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
 		r->end = range->end;
 		return -ENOSPC;
 	}
+
+	if (truncated && (r->start != range->start || r->end != range->end))
+		*truncated = (r->start - range->start) + (range->end - r->end);
 	return 0;
 }
 
@@ -75,6 +79,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	mhp_t mhp_flags;
 	int numa_node;
 	int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;
+	unsigned long ttl_trunc = 0;
 
 	/*
 	 * Ensure good NUMA information for the persistent memory.
@@ -97,7 +102,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	for (i = 0; i < dev_dax->nr_range; i++) {
 		struct range range;
 
-		rc = dax_kmem_range(dev_dax, i, &range);
+		rc = dax_kmem_range(dev_dax, i, &range, NULL);
 		if (rc) {
 			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
 					i, range.start, range.end);
@@ -130,8 +135,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 	for (i = 0; i < dev_dax->nr_range; i++) {
 		struct resource *res;
 		struct range range;
+		unsigned long truncated = 0;
 
-		rc = dax_kmem_range(dev_dax, i, &range);
+		rc = dax_kmem_range(dev_dax, i, &range, &truncated);
 		if (rc)
 			continue;
 
@@ -180,8 +186,12 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
 				continue;
 			goto err_request_mem;
 		}
+
+		ttl_trunc += truncated;
 		mapped++;
 	}
+	if (ttl_trunc)
+		dev_warn(dev, "dax region truncated %ld bytes - alignment\n", ttl_trunc);
 
 	dev_set_drvdata(dev, data);
 
@@ -216,7 +226,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
 		struct range range;
 		int rc;
 
-		rc = dax_kmem_range(dev_dax, i, &range);
+		rc = dax_kmem_range(dev_dax, i, &range, NULL);
 		if (rc)
 			continue;
 
-- 
2.48.1
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by David Hildenbrand 10 months, 1 week ago
On 21.03.25 19:07, Gregory Price wrote:
> Device capacity intended for use as system ram should be aligned to the
> architecture-defined memory block size or that capacity will be silently
> truncated and capacity stranded.
> 
> As hotplug dax memory becomes more prevelant, the memory block size
> alignment becomes more important for platform and device vendors to
> pay attention to - so this truncation should not be silent.
> 
> This issue is particularly relevant for CXL Dynamic Capacity devices,
> whose capacity may arrive in spec-aligned but block-misaligned chunks.
> 
> Example:
>   [...] kmem dax0.0: dax region truncated 2684354560 bytes - alignment
>   [...] kmem dax1.0: dax region truncated 1610612736 bytes - alignment
> 
> Signed-off-by: Gregory Price <gourry@gourry.net>
> ---
>   drivers/dax/kmem.c | 18 ++++++++++++++----
>   1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index e97d47f42ee2..15b6807b703d 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -28,7 +28,8 @@ static const char *kmem_name;
>   /* Set if any memory will remain added when the driver will be unloaded. */
>   static bool any_hotremove_failed;
>   
> -static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
> +static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r,
> +			  unsigned long *truncated)
>   {
>   	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
>   	struct range *range = &dax_range->range;
> @@ -41,6 +42,9 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
>   		r->end = range->end;
>   		return -ENOSPC;
>   	}
> +
> +	if (truncated && (r->start != range->start || r->end != range->end))
> +		*truncated = (r->start - range->start) + (range->end - r->end);
>   	return 0;
>   }
>   
> @@ -75,6 +79,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   	mhp_t mhp_flags;
>   	int numa_node;
>   	int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;
> +	unsigned long ttl_trunc = 0;
>   
>   	/*
>   	 * Ensure good NUMA information for the persistent memory.
> @@ -97,7 +102,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   	for (i = 0; i < dev_dax->nr_range; i++) {
>   		struct range range;
>   
> -		rc = dax_kmem_range(dev_dax, i, &range);
> +		rc = dax_kmem_range(dev_dax, i, &range, NULL);
>   		if (rc) {
>   			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
>   					i, range.start, range.end);
> @@ -130,8 +135,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   	for (i = 0; i < dev_dax->nr_range; i++) {
>   		struct resource *res;
>   		struct range range;
> +		unsigned long truncated = 0;
>   
> -		rc = dax_kmem_range(dev_dax, i, &range);
> +		rc = dax_kmem_range(dev_dax, i, &range, &truncated);
>   		if (rc)
>   			continue;
>   
> @@ -180,8 +186,12 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   				continue;
>   			goto err_request_mem;
>   		}
> +
> +		ttl_trunc += truncated;
>   		mapped++;
>   	}
> +	if (ttl_trunc)
> +		dev_warn(dev, "dax region truncated %ld bytes - alignment\n", ttl_trunc);
>   
>   	dev_set_drvdata(dev, data);
>   
> @@ -216,7 +226,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
>   		struct range range;
>   		int rc;
>   
> -		rc = dax_kmem_range(dev_dax, i, &range);
> +		rc = dax_kmem_range(dev_dax, i, &range, NULL);
>   		if (rc)
>   			continue;
>   

Can't that be done a bit simpler?

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index e97d47f42ee2e..23a68ff809cdf 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -67,8 +67,8 @@ static void kmem_put_memory_types(void)
  
  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
  {
+       unsigned long total_len = 0, orig_len = 0;
         struct device *dev = &dev_dax->dev;
-       unsigned long total_len = 0;
         struct dax_kmem_data *data;
         struct memory_dev_type *mtype;
         int i, rc, mapped = 0;
@@ -97,6 +97,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
         for (i = 0; i < dev_dax->nr_range; i++) {
                 struct range range;
  
+               orig_len += range_len(&dev_dax->ranges[i].range);
                 rc = dax_kmem_range(dev_dax, i, &range);
                 if (rc) {
                         dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
@@ -109,6 +110,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
         if (!total_len) {
                 dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
                 return -EINVAL;
+       } else if (total_len != orig_len) {
+               dev_warn(dev, "DAX region truncated by %lu bytes due to alignment\n",
+                        orig_len - total_len);
         }
  
         init_node_memory_type(numa_node, mtype);


-- 
Cheers,

David / dhildenb
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Dan Williams 10 months, 1 week ago
David Hildenbrand wrote:
> On 21.03.25 19:07, Gregory Price wrote:
> > Device capacity intended for use as system ram should be aligned to the
> > architecture-defined memory block size or that capacity will be silently
> > truncated and capacity stranded.
> > 
> > As hotplug dax memory becomes more prevelant, the memory block size
> > alignment becomes more important for platform and device vendors to
> > pay attention to - so this truncation should not be silent.
> > 
> > This issue is particularly relevant for CXL Dynamic Capacity devices,
> > whose capacity may arrive in spec-aligned but block-misaligned chunks.
> > 
> > Example:
> >   [...] kmem dax0.0: dax region truncated 2684354560 bytes - alignment
> >   [...] kmem dax1.0: dax region truncated 1610612736 bytes - alignment
> > 
> > Signed-off-by: Gregory Price <gourry@gourry.net>
> > ---
> >   drivers/dax/kmem.c | 18 ++++++++++++++----
> >   1 file changed, 14 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> > index e97d47f42ee2..15b6807b703d 100644
> > --- a/drivers/dax/kmem.c
> > +++ b/drivers/dax/kmem.c
> > @@ -28,7 +28,8 @@ static const char *kmem_name;
> >   /* Set if any memory will remain added when the driver will be unloaded. */
> >   static bool any_hotremove_failed;
> >   
> > -static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
> > +static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r,
> > +			  unsigned long *truncated)
> >   {
> >   	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
> >   	struct range *range = &dax_range->range;
> > @@ -41,6 +42,9 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
> >   		r->end = range->end;
> >   		return -ENOSPC;
> >   	}
> > +
> > +	if (truncated && (r->start != range->start || r->end != range->end))
> > +		*truncated = (r->start - range->start) + (range->end - r->end);
> >   	return 0;
> >   }
> >   
> > @@ -75,6 +79,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >   	mhp_t mhp_flags;
> >   	int numa_node;
> >   	int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;
> > +	unsigned long ttl_trunc = 0;
> >   
> >   	/*
> >   	 * Ensure good NUMA information for the persistent memory.
> > @@ -97,7 +102,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >   	for (i = 0; i < dev_dax->nr_range; i++) {
> >   		struct range range;
> >   
> > -		rc = dax_kmem_range(dev_dax, i, &range);
> > +		rc = dax_kmem_range(dev_dax, i, &range, NULL);
> >   		if (rc) {
> >   			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
> >   					i, range.start, range.end);
> > @@ -130,8 +135,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >   	for (i = 0; i < dev_dax->nr_range; i++) {
> >   		struct resource *res;
> >   		struct range range;
> > +		unsigned long truncated = 0;
> >   
> > -		rc = dax_kmem_range(dev_dax, i, &range);
> > +		rc = dax_kmem_range(dev_dax, i, &range, &truncated);
> >   		if (rc)
> >   			continue;
> >   
> > @@ -180,8 +186,12 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >   				continue;
> >   			goto err_request_mem;
> >   		}
> > +
> > +		ttl_trunc += truncated;
> >   		mapped++;
> >   	}
> > +	if (ttl_trunc)
> > +		dev_warn(dev, "dax region truncated %ld bytes - alignment\n", ttl_trunc);
> >   
> >   	dev_set_drvdata(dev, data);
> >   
> > @@ -216,7 +226,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
> >   		struct range range;
> >   		int rc;
> >   
> > -		rc = dax_kmem_range(dev_dax, i, &range);
> > +		rc = dax_kmem_range(dev_dax, i, &range, NULL);
> >   		if (rc)
> >   			continue;
> >   
> 
> Can't that be done a bit simpler?
> 
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index e97d47f42ee2e..23a68ff809cdf 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -67,8 +67,8 @@ static void kmem_put_memory_types(void)
>   
>   static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>   {
> +       unsigned long total_len = 0, orig_len = 0;
>          struct device *dev = &dev_dax->dev;
> -       unsigned long total_len = 0;
>          struct dax_kmem_data *data;
>          struct memory_dev_type *mtype;
>          int i, rc, mapped = 0;
> @@ -97,6 +97,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>          for (i = 0; i < dev_dax->nr_range; i++) {
>                  struct range range;
>   
> +               orig_len += range_len(&dev_dax->ranges[i].range);
>                  rc = dax_kmem_range(dev_dax, i, &range);
>                  if (rc) {
>                          dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
> @@ -109,6 +110,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>          if (!total_len) {
>                  dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
>                  return -EINVAL;
> +       } else if (total_len != orig_len) {
> +               dev_warn(dev, "DAX region truncated by %lu bytes due to alignment\n",
> +                        orig_len - total_len);

This looks good, I agree with it being a warn because the user has lost
usable capacity and maybe this eventually pressures platform BIOS to
avoid causing Linux warnings.

In terms of making that loss easier for people to report / understand
how about use string_get_size() to convert raw bytes to power of 10 and
power of 2 values? I.e.

"DAX region truncated by X.XX GiB (Y.YY GB) due to alignment."
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 1 week ago
On Tue, Apr 01, 2025 at 10:47:09AM -0700, Dan Williams wrote:
> David Hildenbrand wrote:
> > diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> > index e97d47f42ee2e..23a68ff809cdf 100644
> > --- a/drivers/dax/kmem.c
> > +++ b/drivers/dax/kmem.c
> > @@ -67,8 +67,8 @@ static void kmem_put_memory_types(void)
> >   
> >   static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >   {
> > +       unsigned long total_len = 0, orig_len = 0;
> >          struct device *dev = &dev_dax->dev;
> > -       unsigned long total_len = 0;
> >          struct dax_kmem_data *data;
> >          struct memory_dev_type *mtype;
> >          int i, rc, mapped = 0;
> > @@ -97,6 +97,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >          for (i = 0; i < dev_dax->nr_range; i++) {
> >                  struct range range;
> >   
> > +               orig_len += range_len(&dev_dax->ranges[i].range);
> >                  rc = dax_kmem_range(dev_dax, i, &range);
> >                  if (rc) {
> >                          dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
> > @@ -109,6 +110,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
> >          if (!total_len) {
> >                  dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
> >                  return -EINVAL;
> > +       } else if (total_len != orig_len) {
> > +               dev_warn(dev, "DAX region truncated by %lu bytes due to alignment\n",
> > +                        orig_len - total_len);
> 
> This looks good, I agree with it being a warn because the user has lost
> usable capacity and maybe this eventually pressures platform BIOS to
> avoid causing Linux warnings.
> 
> In terms of making that loss easier for people to report / understand
> how about use string_get_size() to convert raw bytes to power of 10 and
> power of 2 values? I.e.
> 
> "DAX region truncated by X.XX GiB (Y.YY GB) due to alignment."

Will pick this up in v2.
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 1 week ago
On Tue, Apr 01, 2025 at 11:47:32AM +0200, David Hildenbrand wrote:
> 
> Can't that be done a bit simpler?

Yes, this is better, lets do this.  Thank you!

> 
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index e97d47f42ee2e..23a68ff809cdf 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -67,8 +67,8 @@ static void kmem_put_memory_types(void)
>  static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>  {
> +       unsigned long total_len = 0, orig_len = 0;
>         struct device *dev = &dev_dax->dev;
> -       unsigned long total_len = 0;
>         struct dax_kmem_data *data;
>         struct memory_dev_type *mtype;
>         int i, rc, mapped = 0;
> @@ -97,6 +97,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>         for (i = 0; i < dev_dax->nr_range; i++) {
>                 struct range range;
> +               orig_len += range_len(&dev_dax->ranges[i].range);
>                 rc = dax_kmem_range(dev_dax, i, &range);
>                 if (rc) {
>                         dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
> @@ -109,6 +110,9 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
>         if (!total_len) {
>                 dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
>                 return -EINVAL;
> +       } else if (total_len != orig_len) {
> +               dev_warn(dev, "DAX region truncated by %lu bytes due to alignment\n",
> +                        orig_len - total_len);
>         }
>         init_node_memory_type(numa_node, mtype);
> 
> 
> -- 
> Cheers,
> 
> David / dhildenb
>
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 2 weeks ago
On Fri, Mar 21, 2025 at 02:07:31PM -0400, Gregory Price wrote:
> Device capacity intended for use as system ram should be aligned to the
> architecture-defined memory block size or that capacity will be silently
> truncated and capacity stranded.
> 
> As hotplug dax memory becomes more prevelant, the memory block size
> alignment becomes more important for platform and device vendors to
> pay attention to - so this truncation should not be silent.
> 
> This issue is particularly relevant for CXL Dynamic Capacity devices,
> whose capacity may arrive in spec-aligned but block-misaligned chunks.
> 
> Example:
>  [...] kmem dax0.0: dax region truncated 2684354560 bytes - alignment
>  [...] kmem dax1.0: dax region truncated 1610612736 bytes - alignment
> 
> Signed-off-by: Gregory Price <gourry@gourry.net>

Gentle pokes.  There were a couple questions last week whether we should
warn here or actually fix something in memory-hotplug.

Notes from CXL Boot to Bash session discussions:


We discussed [1] how this auto-sizing can cause 1GB huge page
allocation failures (assuming you online as ZONE_NORMAL). That means
ACPI-informed sizing by default would potentially be harmful to existing
systems and adding yet-another-boot-option just seems nasty.

I've since dropped acpi-informed block size patch[2].  If there are opinions
otherwise, I can continue pushing it.


We also discussed[3] variable-sized blocks having some nasty corner cases.
Not unsolvable, but doesn't help users in the short term.


There was some brief discussion about whether a hotplug memblock with a
portion as offline pages would be possible.  This seems hacky?  There
was another patch set discussing this, but I can't seem to find it.


I debated whether to warn here or in ACPI.  This seemed more accurate,
as platforms could simply over-reserve HPA space to avoid the issue.

Thoughts?
~Gregory

[1] https://lore.kernel.org/all/bda4cf52-d81a-4935-b45a-09e9439e33b6@redhat.com/
[2] https://lore.kernel.org/linux-mm/20250127153405.3379117-1-gourry@gourry.net/
[3]https://lore.kernel.org/all/b4b312c8-1117-45cd-a3c3-c8747aca51bd@redhat.com/
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by David Hildenbrand 10 months, 1 week ago
On 31.03.25 20:27, Gregory Price wrote:
> On Fri, Mar 21, 2025 at 02:07:31PM -0400, Gregory Price wrote:
>> Device capacity intended for use as system ram should be aligned to the
>> architecture-defined memory block size or that capacity will be silently
>> truncated and capacity stranded.
>>
>> As hotplug dax memory becomes more prevelant, the memory block size
>> alignment becomes more important for platform and device vendors to
>> pay attention to - so this truncation should not be silent.
>>
>> This issue is particularly relevant for CXL Dynamic Capacity devices,
>> whose capacity may arrive in spec-aligned but block-misaligned chunks.
>>
>> Example:
>>   [...] kmem dax0.0: dax region truncated 2684354560 bytes - alignment
>>   [...] kmem dax1.0: dax region truncated 1610612736 bytes - alignment
>>
>> Signed-off-by: Gregory Price <gourry@gourry.net>
> 
> Gentle pokes.  There were a couple questions last week whether we should
> warn here or actually fix something in memory-hotplug.
> 
> Notes from CXL Boot to Bash session discussions:
> 
> 
> We discussed [1] how this auto-sizing can cause 1GB huge page
> allocation failures (assuming you online as ZONE_NORMAL). That means
> ACPI-informed sizing by default would potentially be harmful to existing
> systems and adding yet-another-boot-option just seems nasty.
> 
> I've since dropped acpi-informed block size patch[2].  If there are opinions
> otherwise, I can continue pushing it.

Oh, I thought we would be going forward with that. What's the reason we 
would not want to do that?

> 
> 
> We also discussed[3] variable-sized blocks having some nasty corner cases.
> Not unsolvable, but doesn't help users in the short term.
> 
> 
> There was some brief discussion about whether a hotplug memblock with a
> portion as offline pages would be possible.  This seems hacky?  There
> was another patch set discussing this, but I can't seem to find it.

Yeah, I proposed something like that as well when I started working on 
virtio-mem and did not really understand the whole hot(un)plug model in 
Linux properly. Someone else proposed it again a couple of years ago, 
but it's just wrong and should not be done that way.

One could implement something like virtio-mem, whereby parts of a Linux 
memory block can be added/removed independently ("fake offlined"). But 
the whole idea of virtio-mem is that all memory in the Linux memory 
block range belongs to it. So it doesn't quite apply to DAX where parts 
of a Linux memory block might be from something completely different 
(e.g., boot memory etc).

-- 
Cheers,

David / dhildenb
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 1 week ago
On Tue, Apr 01, 2025 at 11:33:59AM +0200, David Hildenbrand wrote:
> On 31.03.25 20:27, Gregory Price wrote:
> > We discussed [1] how this auto-sizing can cause 1GB huge page
> > allocation failures (assuming you online as ZONE_NORMAL). That means
> > ACPI-informed sizing by default would potentially be harmful to existing
> > systems and adding yet-another-boot-option just seems nasty.
> > 
> > I've since dropped acpi-informed block size patch[2].  If there are opinions
> > otherwise, I can continue pushing it.
> 
> Oh, I thought we would be going forward with that. What's the reason we
> would not want to do that?
> 

It seemed like having it reduce block size by default would make 1GB huge
pages less reliable to allocate. If you think this isn't a large concern,
I can update and push again.  I suppose I could make it a build option.

Any opinions here are welcome.

~Gregory
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by David Hildenbrand 10 months, 1 week ago
On 01.04.25 16:43, Gregory Price wrote:
> On Tue, Apr 01, 2025 at 11:33:59AM +0200, David Hildenbrand wrote:
>> On 31.03.25 20:27, Gregory Price wrote:
>>> We discussed [1] how this auto-sizing can cause 1GB huge page
>>> allocation failures (assuming you online as ZONE_NORMAL). That means
>>> ACPI-informed sizing by default would potentially be harmful to existing
>>> systems and adding yet-another-boot-option just seems nasty.
>>>
>>> I've since dropped acpi-informed block size patch[2].  If there are opinions
>>> otherwise, I can continue pushing it.
>>
>> Oh, I thought we would be going forward with that. What's the reason we
>> would not want to do that?
>>
> 
> It seemed like having it reduce block size by default would make 1GB huge
> pages less reliable to allocate. If you think this isn't a large concern,
> I can update and push again.

Oh, you mean with the whole memmap_on_memory thing. Even with that, 
using 2GB memory blocks would only fit a single 1GB memory block ... and 
it requires ZONE_NORMAL.

For ordinary boot memory, the 1GB behavior should be independent of the 
memory block size (a 1GB page can span multiple blocks as long as they 
are in the same zone), which is the most important thing.

So I don't think it's a concern for DAX right now. Whoever needs that, 
can disable the memmap_on_memory option.

-- 
Cheers,

David / dhildenb
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 1 week ago
On Tue, Apr 01, 2025 at 04:50:40PM +0200, David Hildenbrand wrote:
> 
> Oh, you mean with the whole memmap_on_memory thing. Even with that, using
> 2GB memory blocks would only fit a single 1GB memory block ... and it
> requires ZONE_NORMAL.
> 
> For ordinary boot memory, the 1GB behavior should be independent of the
> memory block size (a 1GB page can span multiple blocks as long as they are
> in the same zone), which is the most important thing.
> 
> So I don't think it's a concern for DAX right now. Whoever needs that, can
> disable the memmap_on_memory option.
>

If we think it's not a major issue then I'll rebase onto latest and push
a v9.  I think there was one minor nit left.

I suppose folks can complain to their vendors about alignment if they
don't want 60000 memoryN entries on their huge-memory-systems.

Probably we still want this warning?  Silent truncation still seems
undesirable.

~Gregory
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by David Hildenbrand 10 months, 1 week ago
On 01.04.25 17:16, Gregory Price wrote:
> On Tue, Apr 01, 2025 at 04:50:40PM +0200, David Hildenbrand wrote:
>>
>> Oh, you mean with the whole memmap_on_memory thing. Even with that, using
>> 2GB memory blocks would only fit a single 1GB memory block ... and it
>> requires ZONE_NORMAL.
>>
>> For ordinary boot memory, the 1GB behavior should be independent of the
>> memory block size (a 1GB page can span multiple blocks as long as they are
>> in the same zone), which is the most important thing.
>>
>> So I don't think it's a concern for DAX right now. Whoever needs that, can
>> disable the memmap_on_memory option.
>>
> 
> If we think it's not a major issue then I'll rebase onto latest and push
> a v9.  I think there was one minor nit left.
> 
> I suppose folks can complain to their vendors about alignment if they
> don't want 60000 memoryN entries on their huge-memory-systems.
> 
> Probably we still want this warning?  Silent truncation still seems
> undesirable.

Yes, it's valuable I think. But should it be a warning or rather an info?

-- 
Cheers,

David / dhildenb
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by Gregory Price 10 months, 1 week ago
On Tue, Apr 01, 2025 at 05:19:28PM +0200, David Hildenbrand wrote:
> 
> Yes, it's valuable I think. But should it be a warning or rather an info?
> 

dev_warn, but yeah I think so?  A user expects to get their memory in
full, that means we're slightly misbehaving.  I'm fine with either.

~Gregory
Re: [PATCH] DAX: warn when kmem regions are truncated for memory block alignment.
Posted by David Hildenbrand 10 months, 1 week ago
On 01.04.25 17:26, Gregory Price wrote:
> On Tue, Apr 01, 2025 at 05:19:28PM +0200, David Hildenbrand wrote:
>>
>> Yes, it's valuable I think. But should it be a warning or rather an info?
>>
> 
> dev_warn, but yeah I think so?  A user expects to get their memory in
> full, that means we're slightly misbehaving.  I'm fine with either.

Fine with me :)

-- 
Cheers,

David / dhildenb