[PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes

Jingqi Liu posted 1 patch 3 years, 12 months ago
Test docker-mingw@fedora passed
Test docker-quick@centos7 passed
Test checkpatch passed
Test FreeBSD passed
Test asan passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20200401031314.11592-1-jingqi.liu@intel.com
Maintainers: Richard Henderson <rth@twiddle.net>, Paolo Bonzini <pbonzini@redhat.com>
exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 45 insertions(+), 1 deletion(-)
[PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Jingqi Liu 3 years, 12 months ago
If the backend file is devdax pmem character device, the alignment
specified by the option 'align=NUM' in the '-object memory-backend-file'
needs to match the alignment requirement of the devdax pmem character device.

This patch fetches the devdax pmem file 'align', so that we can compare
it with the NUM of 'align=NUM'.
The NUM needs to be larger than or equal to the devdax pmem file 'align'.

It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.

Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
---
 exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/exec.c b/exec.c
index de9d949902..8221abffec 100644
--- a/exec.c
+++ b/exec.c
@@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
     return size;
 }
 
+static int64_t get_file_align(int fd)
+{
+    int64_t align = -1;
+#if defined(__linux__)
+    struct stat st;
+
+    if (fstat(fd, &st) < 0) {
+        return -errno;
+    }
+
+    /* Special handling for devdax character devices */
+    if (S_ISCHR(st.st_mode)) {
+        g_autofree char *subsystem_path = NULL;
+        g_autofree char *subsystem = NULL;
+
+        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
+                                         major(st.st_rdev), minor(st.st_rdev));
+        subsystem = g_file_read_link(subsystem_path, NULL);
+
+        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
+            g_autofree char *align_path = NULL;
+            g_autofree char *align_str = NULL;
+
+            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
+                                    major(st.st_rdev), minor(st.st_rdev));
+
+            if (g_file_get_contents(align_path, &align_str, NULL, NULL)) {
+                return g_ascii_strtoll(align_str, NULL, 0);
+            }
+        }
+    }
+#endif /* defined(__linux__) */
+
+    return align;
+}
+
 static int file_ram_open(const char *path,
                          const char *region_name,
                          bool *created,
@@ -2275,7 +2311,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
 {
     RAMBlock *new_block;
     Error *local_err = NULL;
-    int64_t file_size;
+    int64_t file_size, file_align;
 
     /* Just support these ram flags by now. */
     assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
@@ -2311,6 +2347,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
         return NULL;
     }
 
+    file_align = get_file_align(fd);
+    if (file_align > 0 && mr && file_align > mr->align) {
+        error_setg(errp, "backing store align 0x%" PRIx64
+                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
+                   file_align, mr->align);
+        return NULL;
+    }
+
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
     new_block->used_length = size;
-- 
2.17.1


Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Joao Martins 3 years, 11 months ago

On 4/1/20 4:13 AM, Jingqi Liu wrote:
> If the backend file is devdax pmem character device, the alignment
> specified by the option 'align=NUM' in the '-object memory-backend-file'
> needs to match the alignment requirement of the devdax pmem character device.
> 
> This patch fetches the devdax pmem file 'align', so that we can compare
> it with the NUM of 'align=NUM'.
> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
> 
> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
> 
> Cc: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
> ---
>  exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 45 insertions(+), 1 deletion(-)
> 
> diff --git a/exec.c b/exec.c
> index de9d949902..8221abffec 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>      return size;
>  }
>  
> +static int64_t get_file_align(int fd)
> +{
> +    int64_t align = -1;
> +#if defined(__linux__)
> +    struct stat st;
> +
> +    if (fstat(fd, &st) < 0) {
> +        return -errno;
> +    }
> +
> +    /* Special handling for devdax character devices */
> +    if (S_ISCHR(st.st_mode)) {
> +        g_autofree char *subsystem_path = NULL;
> +        g_autofree char *subsystem = NULL;
> +
> +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
> +                                         major(st.st_rdev), minor(st.st_rdev));
> +        subsystem = g_file_read_link(subsystem_path, NULL);
> +
> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
> +            g_autofree char *align_path = NULL;
> +            g_autofree char *align_str = NULL;
> +
> +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
> +                                    major(st.st_rdev), minor(st.st_rdev));
> +

Perhaps, you meant instead:

	/sys/dev/char/%d:%d/align

 ?

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Paolo Bonzini 3 years, 11 months ago
On 07/04/20 12:59, Joao Martins wrote:
> Perhaps, you meant instead:
> 
> 	/sys/dev/char/%d:%d/align
> 
>  ?
> 

So it works with that change?

Paolo


Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Joao Martins 3 years, 11 months ago
On 4/7/20 3:31 PM, Paolo Bonzini wrote:
> On 07/04/20 12:59, Joao Martins wrote:
>> Perhaps, you meant instead:
>>
>> 	/sys/dev/char/%d:%d/align
>>
>>  ?
>>
> 
> So it works with that change?

Yeah.

	Joao

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Liu, Jingqi 3 years, 11 months ago
On 4/7/2020 11:51 PM, Joao Martins wrote:
> On 4/7/20 3:31 PM, Paolo Bonzini wrote:
>> On 07/04/20 12:59, Joao Martins wrote:
>>> Perhaps, you meant instead:
>>>
>>> 	/sys/dev/char/%d:%d/align
>>>
>>>   ?
>>>
Hi Joao,

In my machine with real NVDIMM, the devdax device is as follows:

$ ll /dev/dax0.0
crw------- 1 root root 250, 6 3月  25 15:16 /dev/dax0.0

$ ls /sys/dev/char/250\:6/align
ls: cannot access '/sys/dev/char/250:6/align': No such file or directory

$ ls /sys/dev/char/250\:6/device/align

/sys/dev/char/250:6/device/align

So:
The file of "/sys/dev/char/%d:%d/align" does not exist.
It should be "/sys/dev/char/%d:%d/device/align".

Anyone has a real NVDIMM can help double check. Thanks.

Hi Dan,

You may have a real NVDIMM, in what directory is the 'align' file ?

Thanks,

Jingqi

>> So it works with that change?
> Yeah.
>
> 	Joao

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Joao Martins 3 years, 11 months ago
On 4/8/20 2:16 AM, Liu, Jingqi wrote:
> On 4/7/2020 11:51 PM, Joao Martins wrote:
>> On 4/7/20 3:31 PM, Paolo Bonzini wrote:
>>> On 07/04/20 12:59, Joao Martins wrote:
>>>> Perhaps, you meant instead:
>>>>
>>>> 	/sys/dev/char/%d:%d/align
>>>>
>>>>   ?
>>>>
> Hi Joao,
> 
> In my machine with real NVDIMM, the devdax device is as follows:
> 
> $ ll /dev/dax0.0
> crw------- 1 root root 250, 6 3月  25 15:16 /dev/dax0.0
> 
> $ ls /sys/dev/char/250\:6/align
> ls: cannot access '/sys/dev/char/250:6/align': No such file or directory
> 
> $ ls /sys/dev/char/250\:6/device/align
> 
> /sys/dev/char/250:6/device/align
> 
> So:
> The file of "/sys/dev/char/%d:%d/align" does not exist.
> It should be "/sys/dev/char/%d:%d/device/align".
> 
The 'align' (without the ../) was my mistake as I was testing with other wip
patches.

Albeit, still retain my comment to 'device/align' as it is relying on an
deprecated path, so perhaps we should look other alternatives.

I had the deprecated dax class disabled (DEV_DAX_PMEM_COMPAT=n), and 'device'
does not exist on that cases.



> Anyone has a real NVDIMM can help double check. Thanks.
> 
> Hi Dan,
> 
> You may have a real NVDIMM, in what directory is the 'align' file ?
> 
> Thanks,
> 
> Jingqi
> 
>>> So it works with that change?
>> Yeah.
>>
>> 	Joao

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Dan Williams 3 years, 11 months ago
On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>
>
>
> On 4/1/20 4:13 AM, Jingqi Liu wrote:
> > If the backend file is devdax pmem character device, the alignment
> > specified by the option 'align=NUM' in the '-object memory-backend-file'
> > needs to match the alignment requirement of the devdax pmem character device.
> >
> > This patch fetches the devdax pmem file 'align', so that we can compare
> > it with the NUM of 'align=NUM'.
> > The NUM needs to be larger than or equal to the devdax pmem file 'align'.
> >
> > It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
> > when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
> >
> > Cc: Dan Williams <dan.j.williams@intel.com>
> > Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
> > ---
> >  exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 45 insertions(+), 1 deletion(-)
> >
> > diff --git a/exec.c b/exec.c
> > index de9d949902..8221abffec 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
> >      return size;
> >  }
> >
> > +static int64_t get_file_align(int fd)
> > +{
> > +    int64_t align = -1;
> > +#if defined(__linux__)
> > +    struct stat st;
> > +
> > +    if (fstat(fd, &st) < 0) {
> > +        return -errno;
> > +    }
> > +
> > +    /* Special handling for devdax character devices */
> > +    if (S_ISCHR(st.st_mode)) {
> > +        g_autofree char *subsystem_path = NULL;
> > +        g_autofree char *subsystem = NULL;
> > +
> > +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
> > +                                         major(st.st_rdev), minor(st.st_rdev));
> > +        subsystem = g_file_read_link(subsystem_path, NULL);
> > +
> > +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
> > +            g_autofree char *align_path = NULL;
> > +            g_autofree char *align_str = NULL;
> > +
> > +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
> > +                                    major(st.st_rdev), minor(st.st_rdev));
> > +
>
> Perhaps, you meant instead:
>
>         /sys/dev/char/%d:%d/align
>

Hmm, are you sure that's working? I expect the alignment to be found
in the region device:

/sys/class/dax:
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
$(readlink -f /sys/dev/char/253\:263)/../align
$(readlink -f /sys/dev/char/253\:263)/device/align


/sys/bus/dax:
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
$(readlink -f /sys/dev/char/253\:265)/../align
$(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file

The use of the /sys/dev/char/%d:%d/device is only supported by the
deprecated /sys/class/dax. The current /sys/bus/dax device-model can
be a drop in replacement as long as software is not written to the
/sys/class sysfs layout, i.e. it uses ../ instead of device/ to walk
to the region properties.

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Joao Martins 3 years, 11 months ago
On 4/7/20 5:55 PM, Dan Williams wrote:
> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>> On 4/1/20 4:13 AM, Jingqi Liu wrote:
>>> If the backend file is devdax pmem character device, the alignment
>>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>>> needs to match the alignment requirement of the devdax pmem character device.
>>>
>>> This patch fetches the devdax pmem file 'align', so that we can compare
>>> it with the NUM of 'align=NUM'.
>>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>>
>>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>>
>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>>> ---
>>>  exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 45 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/exec.c b/exec.c
>>> index de9d949902..8221abffec 100644
>>> --- a/exec.c
>>> +++ b/exec.c
>>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>>      return size;
>>>  }
>>>
>>> +static int64_t get_file_align(int fd)
>>> +{
>>> +    int64_t align = -1;
>>> +#if defined(__linux__)
>>> +    struct stat st;
>>> +
>>> +    if (fstat(fd, &st) < 0) {
>>> +        return -errno;
>>> +    }
>>> +
>>> +    /* Special handling for devdax character devices */
>>> +    if (S_ISCHR(st.st_mode)) {
>>> +        g_autofree char *subsystem_path = NULL;
>>> +        g_autofree char *subsystem = NULL;
>>> +
>>> +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>>> +                                         major(st.st_rdev), minor(st.st_rdev));
>>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>>> +
>>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>>> +            g_autofree char *align_path = NULL;
>>> +            g_autofree char *align_str = NULL;
>>> +
>>> +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>>> +                                    major(st.st_rdev), minor(st.st_rdev));
>>> +
>>
>> Perhaps, you meant instead:
>>
>>         /sys/dev/char/%d:%d/align
>>
> 
> Hmm, are you sure that's working? 

It is, except that I made the slight mistake of testing with a bunch of wip
patches on top which one of them actually adds the 'align' to child dax device.

Argh, my apologies - and thanks for noticing.

> I expect the alignment to be found
> in the region device:
> 
> /sys/class/dax:
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
> $(readlink -f /sys/dev/char/253\:263)/../align
> $(readlink -f /sys/dev/char/253\:263)/device/align
> 
> 
> /sys/bus/dax:
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
> $(readlink -f /sys/dev/char/253\:265)/../align
> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
> 
> The use of the /sys/dev/char/%d:%d/device is only supported by the
> deprecated /sys/class/dax. 

I don't have the deprecated dax class enabled as could you tell, so the second
case is what I was testing. Except it wasn't a namespace/nvdimm but rather an
hmem device-dax.

'../align' though covers only one case? What about hmem which '../align' returns
ENOENT; perhaps using '../dax_region/align' instead which is common to both?
Albeit that wouldn't address the sub-division devices (that I mention above)

> The current /sys/bus/dax device-model can
> be a drop in replacement as long as software is not written to the
> /sys/class sysfs layout, i.e. it uses ../ instead of device/ to walk
> to the region properties.
> 
/nods

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Paolo Bonzini 3 years, 11 months ago
On 07/04/20 20:28, Joao Martins wrote:
> On 4/7/20 5:55 PM, Dan Williams wrote:
>> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 4/1/20 4:13 AM, Jingqi Liu wrote:
>>>> If the backend file is devdax pmem character device, the alignment
>>>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>>>> needs to match the alignment requirement of the devdax pmem character device.
>>>>
>>>> This patch fetches the devdax pmem file 'align', so that we can compare
>>>> it with the NUM of 'align=NUM'.
>>>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>>>
>>>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>>>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>>>
>>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>>>> ---
>>>>  exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>>>  1 file changed, 45 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/exec.c b/exec.c
>>>> index de9d949902..8221abffec 100644
>>>> --- a/exec.c
>>>> +++ b/exec.c
>>>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>>>      return size;
>>>>  }
>>>>
>>>> +static int64_t get_file_align(int fd)
>>>> +{
>>>> +    int64_t align = -1;
>>>> +#if defined(__linux__)
>>>> +    struct stat st;
>>>> +
>>>> +    if (fstat(fd, &st) < 0) {
>>>> +        return -errno;
>>>> +    }
>>>> +
>>>> +    /* Special handling for devdax character devices */
>>>> +    if (S_ISCHR(st.st_mode)) {
>>>> +        g_autofree char *subsystem_path = NULL;
>>>> +        g_autofree char *subsystem = NULL;
>>>> +
>>>> +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>>>> +                                         major(st.st_rdev), minor(st.st_rdev));
>>>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>>>> +
>>>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>>>> +            g_autofree char *align_path = NULL;
>>>> +            g_autofree char *align_str = NULL;
>>>> +
>>>> +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>>>> +                                    major(st.st_rdev), minor(st.st_rdev));
>>>> +
>>>
>>> Perhaps, you meant instead:
>>>
>>>         /sys/dev/char/%d:%d/align
>>>
>>
>> Hmm, are you sure that's working? 
> 
> It is, except that I made the slight mistake of testing with a bunch of wip
> patches on top which one of them actually adds the 'align' to child dax device.
> 
> Argh, my apologies - and thanks for noticing.
> 
>> I expect the alignment to be found
>> in the region device:
>>
>> /sys/class/dax:
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
>> $(readlink -f /sys/dev/char/253\:263)/../align
>> $(readlink -f /sys/dev/char/253\:263)/device/align
>>
>>
>> /sys/bus/dax:
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
>> $(readlink -f /sys/dev/char/253\:265)/../align
>> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
>>
>> The use of the /sys/dev/char/%d:%d/device is only supported by the
>> deprecated /sys/class/dax. 
> 
> I don't have the deprecated dax class enabled as could you tell, so the second
> case is what I was testing. Except it wasn't a namespace/nvdimm but rather an
> hmem device-dax.
> 
> '../align' though covers only one case? What about hmem which '../align' returns
> ENOENT; perhaps using '../dax_region/align' instead which is common to both?
> Albeit that wouldn't address the sub-division devices (that I mention above)

Clearly a 5.1 patch then. :)

Paolo



Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Liu, Jingqi 3 years, 11 months ago
On 4/8/2020 2:28 AM, Joao Martins wrote:
> On 4/7/20 5:55 PM, Dan Williams wrote:
>> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>> On 4/1/20 4:13 AM, Jingqi Liu wrote:
>>>> If the backend file is devdax pmem character device, the alignment
>>>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>>>> needs to match the alignment requirement of the devdax pmem character device.
>>>>
>>>> This patch fetches the devdax pmem file 'align', so that we can compare
>>>> it with the NUM of 'align=NUM'.
>>>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>>>
>>>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>>>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>>>
>>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>>>> ---
>>>>   exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>>>   1 file changed, 45 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/exec.c b/exec.c
>>>> index de9d949902..8221abffec 100644
>>>> --- a/exec.c
>>>> +++ b/exec.c
>>>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>>>       return size;
>>>>   }
>>>>
>>>> +static int64_t get_file_align(int fd)
>>>> +{
>>>> +    int64_t align = -1;
>>>> +#if defined(__linux__)
>>>> +    struct stat st;
>>>> +
>>>> +    if (fstat(fd, &st) < 0) {
>>>> +        return -errno;
>>>> +    }
>>>> +
>>>> +    /* Special handling for devdax character devices */
>>>> +    if (S_ISCHR(st.st_mode)) {
>>>> +        g_autofree char *subsystem_path = NULL;
>>>> +        g_autofree char *subsystem = NULL;
>>>> +
>>>> +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>>>> +                                         major(st.st_rdev), minor(st.st_rdev));
>>>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>>>> +
>>>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>>>> +            g_autofree char *align_path = NULL;
>>>> +            g_autofree char *align_str = NULL;
>>>> +
>>>> +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>>>> +                                    major(st.st_rdev), minor(st.st_rdev));
>>>> +
>>> Perhaps, you meant instead:
>>>
>>>          /sys/dev/char/%d:%d/align
>>>
>> Hmm, are you sure that's working?
> It is, except that I made the slight mistake of testing with a bunch of wip
> patches on top which one of them actually adds the 'align' to child dax device.
>
> Argh, my apologies - and thanks for noticing.
>
>> I expect the alignment to be found
>> in the region device:
>>
>> /sys/class/dax:
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
>> $(readlink -f /sys/dev/char/253\:263)/../align
>> $(readlink -f /sys/dev/char/253\:263)/device/align
>>
>>
>> /sys/bus/dax:
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
>> $(readlink -f /sys/dev/char/253\:265)/../align
>> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
>>
>> The use of the /sys/dev/char/%d:%d/device is only supported by the
>> deprecated /sys/class/dax.

Hi Dan,

Thanks for your comments.

Seems it is a mistake.

It should be: $(readlink -f /sys/dev/char/253\:263)/../../align

> I don't have the deprecated dax class enabled as could you tell, so the second
> case is what I was testing. Except it wasn't a namespace/nvdimm but rather an
> hmem device-dax.
>
> '../align' though covers only one case? What about hmem which '../align' returns
> ENOENT; perhaps using '../dax_region/align' instead which is common to both?
> Albeit that wouldn't address the sub-division devices (that I mention above)

Seems that you mean to use $(readlink -f 
/sys/dev/char/253\:263)/../../dax_region/align.

Right ?

Thanks,

Jingqi

>> The current /sys/bus/dax device-model can
>> be a drop in replacement as long as software is not written to the
>> /sys/class sysfs layout, i.e. it uses ../ instead of device/ to walk
>> to the region properties.
>>
> /nods

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Joao Martins 3 years, 11 months ago
On 4/8/20 3:25 AM, Liu, Jingqi wrote:
> On 4/8/2020 2:28 AM, Joao Martins wrote:
>> On 4/7/20 5:55 PM, Dan Williams wrote:
>>> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>> On 4/1/20 4:13 AM, Jingqi Liu wrote:
>>>>> If the backend file is devdax pmem character device, the alignment
>>>>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>>>>> needs to match the alignment requirement of the devdax pmem character device.
>>>>>
>>>>> This patch fetches the devdax pmem file 'align', so that we can compare
>>>>> it with the NUM of 'align=NUM'.
>>>>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>>>>
>>>>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>>>>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>>>>
>>>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>>>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>>>>> ---
>>>>>   exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>>>>   1 file changed, 45 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/exec.c b/exec.c
>>>>> index de9d949902..8221abffec 100644
>>>>> --- a/exec.c
>>>>> +++ b/exec.c
>>>>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>>>>       return size;
>>>>>   }
>>>>>
>>>>> +static int64_t get_file_align(int fd)
>>>>> +{
>>>>> +    int64_t align = -1;
>>>>> +#if defined(__linux__)
>>>>> +    struct stat st;
>>>>> +
>>>>> +    if (fstat(fd, &st) < 0) {
>>>>> +        return -errno;
>>>>> +    }
>>>>> +
>>>>> +    /* Special handling for devdax character devices */
>>>>> +    if (S_ISCHR(st.st_mode)) {
>>>>> +        g_autofree char *subsystem_path = NULL;
>>>>> +        g_autofree char *subsystem = NULL;
>>>>> +
>>>>> +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>>>>> +                                         major(st.st_rdev), minor(st.st_rdev));
>>>>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>>>>> +
>>>>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>>>>> +            g_autofree char *align_path = NULL;
>>>>> +            g_autofree char *align_str = NULL;
>>>>> +
>>>>> +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>>>>> +                                    major(st.st_rdev), minor(st.st_rdev));
>>>>> +
>>>> Perhaps, you meant instead:
>>>>
>>>>          /sys/dev/char/%d:%d/align
>>>>
>>> Hmm, are you sure that's working?
>> It is, except that I made the slight mistake of testing with a bunch of wip
>> patches on top which one of them actually adds the 'align' to child dax device.
>>
>> Argh, my apologies - and thanks for noticing.
>>
>>> I expect the alignment to be found
>>> in the region device:
>>>
>>> /sys/class/dax:
>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
>>> $(readlink -f /sys/dev/char/253\:263)/../align
>>> $(readlink -f /sys/dev/char/253\:263)/device/align
>>>
>>>
>>> /sys/bus/dax:
>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
>>> $(readlink -f /sys/dev/char/253\:265)/../align
>>> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
>>>
>>> The use of the /sys/dev/char/%d:%d/device is only supported by the
>>> deprecated /sys/class/dax.
> 
> Hi Dan,
> 
> Thanks for your comments.
> 
> Seems it is a mistake.
> 
> It should be: $(readlink -f /sys/dev/char/253\:263)/../../align
> 

Hmm, perhaps you have an extra '../' in the path? This works for me:

# ls $(readlink -f /sys/dev/char/252\:0/../align)
/sys/devices/platform/e820_pmem/ndbus0/region0/dax0.0/dax0.0/../align
# cat $(readlink -f /sys/dev/char/252\:0)/../align
2097152
# cat /sys/dev/char/252\:0/../align
2097152

>> I don't have the deprecated dax class enabled as could you tell, so the second
>> case is what I was testing. Except it wasn't a namespace/nvdimm but rather an
>> hmem device-dax.
>>
>> '../align' though covers only one case? What about hmem which '../align' returns
>> ENOENT; perhaps using '../dax_region/align' instead which is common to both?
>> Albeit that wouldn't address the sub-division devices (that I mention above)
> 
> Seems that you mean to use $(readlink -f 
> /sys/dev/char/253\:263)/../../dax_region/align.
> 
> Right ?
> 

An extra '../' ?

# ls $(readlink -f /sys/dev/char/252\:0/../dax_region/align)
/sys/devices/platform/e820_pmem/ndbus0/region0/dax0.0/dax0.0/../align
# cat $(readlink -f /sys/dev/char/252\:0)/../dax_region/align
2097152
# cat /sys/dev/char/252\:0/../dax_region/align
2097152

For HMAT/hmem devdax, though, only 'dax_region/align' is available for now:

# ls $(readlink -f /sys/dev/char/252:0)/../align
ls: cannot access /sys/devices/platform/hmem.0/dax0.0/../align: No such file or
directory
# ls $(readlink -f /sys/dev/char/252:0)/../dax_region/align
/sys/devices/platform/hmem.0/dax0.0/../dax_region/align
# cat $(readlink -f /sys/dev/char/252:0)/../dax_region/align
2097152

The 'dax_region/align' was just an idea mainly because it's common to both
device-dax devices -- not sure how others feel about it.

	Joao

> Thanks,
> 
> Jingqi
> 
>>> The current /sys/bus/dax device-model can
>>> be a drop in replacement as long as software is not written to the
>>> /sys/class sysfs layout, i.e. it uses ../ instead of device/ to walk
>>> to the region properties.
>>>
>> /nods

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Liu, Jingqi 3 years, 11 months ago
On 4/8/2020 5:42 PM, Joao Martins wrote:
> On 4/8/20 3:25 AM, Liu, Jingqi wrote:
>> On 4/8/2020 2:28 AM, Joao Martins wrote:
>>> On 4/7/20 5:55 PM, Dan Williams wrote:
>>>> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>> Perhaps, you meant instead:
>>>>>
>>>>>           /sys/dev/char/%d:%d/align
>>>>>
>>>> Hmm, are you sure that's working?
>>> It is, except that I made the slight mistake of testing with a bunch of wip
>>> patches on top which one of them actually adds the 'align' to child dax device.
>>>
>>> Argh, my apologies - and thanks for noticing.
>>>
>>>> I expect the alignment to be found
>>>> in the region device:
>>>>
>>>> /sys/class/dax:
>>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
>>>> $(readlink -f /sys/dev/char/253\:263)/../align
>>>> $(readlink -f /sys/dev/char/253\:263)/device/align
>>>>
>>>>
>>>> /sys/bus/dax:
>>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
>>>> $(readlink -f /sys/dev/char/253\:265)/../align
>>>> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
>>>>
>>>> The use of the /sys/dev/char/%d:%d/device is only supported by the
>>>> deprecated /sys/class/dax.
>> Hi Dan,
>>
>> Thanks for your comments.
>>
>> Seems it is a mistake.
>>
>> It should be: $(readlink -f /sys/dev/char/253\:263)/../../align
>>
> Hmm, perhaps you have an extra '../' in the path? This works for me:
>
> # ls $(readlink -f /sys/dev/char/252\:0/../align)
> /sys/devices/platform/e820_pmem/ndbus0/region0/dax0.0/dax0.0/../align
> # cat $(readlink -f /sys/dev/char/252\:0)/../align
> 2097152
> # cat /sys/dev/char/252\:0/../align
> 2097152

Hi Joao,

Hmm, I need to have an extra '../' in the path. The details are as follows:

# ll /dev/dax2.0
crw------- 1 root root 251, 5 Mar 20 13:35 /dev/dax2.0
# uname -r
5.6.0-rc1-00044-gb19e8c684703
# readlink -f /sys/dev/char/251\:5/
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0
# ls $(readlink -f /sys/dev/char/251\:5)/../align
ls: cannot access 
'/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../align': 
No such file or directory
# ls $(readlink -f /sys/dev/char/251\:5)/../dax_region/align
ls: cannot access 
'/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../dax_region/align': 
No such file or directory
# ls $(readlink -f /sys/dev/char/251\:5)/../../align
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../../align
# ls $(readlink -f /sys/dev/char/251\:5)/../../dax_region/align
/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../../dax_region/align
# lsmod|grep pmem
dax_pmem_compat        16384  0
device_dax             20480  1 dax_pmem_compat
dax_pmem_core          16384  1 dax_pmem_compat
# lsmod|grep dax
dax_pmem_compat        16384  0
device_dax             20480  1 dax_pmem_compat
dax_pmem_core          16384  1 dax_pmem_compat

Seems some configurations are different ?

Can you share your info as above ? Thanks.

>>> I don't have the deprecated dax class enabled as could you tell, so the second
>>> case is what I was testing. Except it wasn't a namespace/nvdimm but rather an
>>> hmem device-dax.
>>>
>>> '../align' though covers only one case? What about hmem which '../align' returns
>>> ENOENT; perhaps using '../dax_region/align' instead which is common to both?
>>> Albeit that wouldn't address the sub-division devices (that I mention above)
>> Seems that you mean to use $(readlink -f
>> /sys/dev/char/253\:263)/../../dax_region/align.
>>
>> Right ?
>>
> An extra '../' ?
>
> # ls $(readlink -f /sys/dev/char/252\:0/../dax_region/align)
> /sys/devices/platform/e820_pmem/ndbus0/region0/dax0.0/dax0.0/../align
> # cat $(readlink -f /sys/dev/char/252\:0)/../dax_region/align
> 2097152
> # cat /sys/dev/char/252\:0/../dax_region/align
> 2097152
>
> For HMAT/hmem devdax, though, only 'dax_region/align' is available for now:
>
> # ls $(readlink -f /sys/dev/char/252:0)/../align
> ls: cannot access /sys/devices/platform/hmem.0/dax0.0/../align: No such file or
> directory
> # ls $(readlink -f /sys/dev/char/252:0)/../dax_region/align
> /sys/devices/platform/hmem.0/dax0.0/../dax_region/align
> # cat $(readlink -f /sys/dev/char/252:0)/../dax_region/align
> 2097152
>
> The 'dax_region/align' was just an idea mainly because it's common to both
> device-dax devices -- not sure how others feel about it.

Seems it's reasonable. I need to sync the above path with yours.

Thanks,

Jingqi

>
> 	Joao
>
>> Thanks,
>>
>> Jingqi
>>
>>>> The current /sys/bus/dax device-model can
>>>> be a drop in replacement as long as software is not written to the
>>>> /sys/class sysfs layout, i.e. it uses ../ instead of device/ to walk
>>>> to the region properties.
>>>>
>>> /nods

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Dan Williams 3 years, 11 months ago
On Thu, Apr 9, 2020 at 7:33 AM Liu, Jingqi <jingqi.liu@intel.com> wrote:
>
> On 4/8/2020 5:42 PM, Joao Martins wrote:
> > On 4/8/20 3:25 AM, Liu, Jingqi wrote:
> >> On 4/8/2020 2:28 AM, Joao Martins wrote:
> >>> On 4/7/20 5:55 PM, Dan Williams wrote:
> >>>> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
> >>>>> Perhaps, you meant instead:
> >>>>>
> >>>>>           /sys/dev/char/%d:%d/align
> >>>>>
> >>>> Hmm, are you sure that's working?
> >>> It is, except that I made the slight mistake of testing with a bunch of wip
> >>> patches on top which one of them actually adds the 'align' to child dax device.
> >>>
> >>> Argh, my apologies - and thanks for noticing.
> >>>
> >>>> I expect the alignment to be found
> >>>> in the region device:
> >>>>
> >>>> /sys/class/dax:
> >>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
> >>>> $(readlink -f /sys/dev/char/253\:263)/../align
> >>>> $(readlink -f /sys/dev/char/253\:263)/device/align
> >>>>
> >>>>
> >>>> /sys/bus/dax:
> >>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
> >>>> $(readlink -f /sys/dev/char/253\:265)/../align
> >>>> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
> >>>>
> >>>> The use of the /sys/dev/char/%d:%d/device is only supported by the
> >>>> deprecated /sys/class/dax.
> >> Hi Dan,
> >>
> >> Thanks for your comments.
> >>
> >> Seems it is a mistake.
> >>
> >> It should be: $(readlink -f /sys/dev/char/253\:263)/../../align
> >>
> > Hmm, perhaps you have an extra '../' in the path? This works for me:
> >
> > # ls $(readlink -f /sys/dev/char/252\:0/../align)
> > /sys/devices/platform/e820_pmem/ndbus0/region0/dax0.0/dax0.0/../align
> > # cat $(readlink -f /sys/dev/char/252\:0)/../align
> > 2097152
> > # cat /sys/dev/char/252\:0/../align
> > 2097152
>
> Hi Joao,
>
> Hmm, I need to have an extra '../' in the path. The details are as follows:
>
> # ll /dev/dax2.0
> crw------- 1 root root 251, 5 Mar 20 13:35 /dev/dax2.0
> # uname -r
> 5.6.0-rc1-00044-gb19e8c684703
> # readlink -f /sys/dev/char/251\:5/
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0
> # ls $(readlink -f /sys/dev/char/251\:5)/../align
> ls: cannot access
> '/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../align':
> No such file or directory
> # ls $(readlink -f /sys/dev/char/251\:5)/../dax_region/align
> ls: cannot access
> '/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../dax_region/align':
> No such file or directory
> # ls $(readlink -f /sys/dev/char/251\:5)/../../align
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../../align
> # ls $(readlink -f /sys/dev/char/251\:5)/../../dax_region/align
> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../../dax_region/align
> # lsmod|grep pmem
> dax_pmem_compat        16384  0
> device_dax             20480  1 dax_pmem_compat
> dax_pmem_core          16384  1 dax_pmem_compat
> # lsmod|grep dax
> dax_pmem_compat        16384  0
> device_dax             20480  1 dax_pmem_compat
> dax_pmem_core          16384  1 dax_pmem_compat
>
> Seems some configurations are different ?
>
> Can you share your info as above ? Thanks.

Alternatively maybe you can use libdaxctl that automatically handles
the ABI differences between compat-dax-class and dax-bus layouts? I
didn't recommend it before because I was not sure about qemu's policy
about taking on new library dependencies, but with libdaxctl you could
do something like (untested):

path = g_strdup_printf("/sys/dev/char/%d:%d", major(st.st_rdev),
minor(st.st_rdev));
rpath = realpath(path, NULL);
daxctl_region_foreach(ctx, region)
    if (strstr(daxctl_region_get_path(region), rpath)) {
        align = daxctl_region_get_align(region);
        break;
    }

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Liu, Jingqi 3 years, 11 months ago
On 4/10/2020 12:46 AM, Dan Williams wrote:
> On Thu, Apr 9, 2020 at 7:33 AM Liu, Jingqi <jingqi.liu@intel.com> wrote:
>> On 4/8/2020 5:42 PM, Joao Martins wrote:
>>> On 4/8/20 3:25 AM, Liu, Jingqi wrote:
>>>> On 4/8/2020 2:28 AM, Joao Martins wrote:
>>>>> On 4/7/20 5:55 PM, Dan Williams wrote:
>>>>>> On Tue, Apr 7, 2020 at 4:01 AM Joao Martins <joao.m.martins@oracle.com> wrote:
>>>>>>> Perhaps, you meant instead:
>>>>>>>
>>>>>>>            /sys/dev/char/%d:%d/align
>>>>>>>
>>>>>> Hmm, are you sure that's working?
>>>>> It is, except that I made the slight mistake of testing with a bunch of wip
>>>>> patches on top which one of them actually adds the 'align' to child dax device.
>>>>>
>>>>> Argh, my apologies - and thanks for noticing.
>>>>>
>>>>>> I expect the alignment to be found
>>>>>> in the region device:
>>>>>>
>>>>>> /sys/class/dax:
>>>>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.1/dax1.0
>>>>>> $(readlink -f /sys/dev/char/253\:263)/../align
>>>>>> $(readlink -f /sys/dev/char/253\:263)/device/align
>>>>>>
>>>>>>
>>>>>> /sys/bus/dax:
>>>>>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus1/region1/dax1.0/dax1.0
>>>>>> $(readlink -f /sys/dev/char/253\:265)/../align
>>>>>> $(readlink -f /sys/dev/char/253\:265)/device/align <-- No such file
>>>>>>
>>>>>> The use of the /sys/dev/char/%d:%d/device is only supported by the
>>>>>> deprecated /sys/class/dax.
>>>> Hi Dan,
>>>>
>>>> Thanks for your comments.
>>>>
>>>> Seems it is a mistake.
>>>>
>>>> It should be: $(readlink -f /sys/dev/char/253\:263)/../../align
>>>>
>>> Hmm, perhaps you have an extra '../' in the path? This works for me:
>>>
>>> # ls $(readlink -f /sys/dev/char/252\:0/../align)
>>> /sys/devices/platform/e820_pmem/ndbus0/region0/dax0.0/dax0.0/../align
>>> # cat $(readlink -f /sys/dev/char/252\:0)/../align
>>> 2097152
>>> # cat /sys/dev/char/252\:0/../align
>>> 2097152
>> Hi Joao,
>>
>> Hmm, I need to have an extra '../' in the path. The details are as follows:
>>
>> # ll /dev/dax2.0
>> crw------- 1 root root 251, 5 Mar 20 13:35 /dev/dax2.0
>> # uname -r
>> 5.6.0-rc1-00044-gb19e8c684703
>> # readlink -f /sys/dev/char/251\:5/
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0
>> # ls $(readlink -f /sys/dev/char/251\:5)/../align
>> ls: cannot access
>> '/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../align':
>> No such file or directory
>> # ls $(readlink -f /sys/dev/char/251\:5)/../dax_region/align
>> ls: cannot access
>> '/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../dax_region/align':
>> No such file or directory
>> # ls $(readlink -f /sys/dev/char/251\:5)/../../align
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../../align
>> # ls $(readlink -f /sys/dev/char/251\:5)/../../dax_region/align
>> /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/region2/dax2.1/dax/dax2.0/../../dax_region/align
>> # lsmod|grep pmem
>> dax_pmem_compat        16384  0
>> device_dax             20480  1 dax_pmem_compat
>> dax_pmem_core          16384  1 dax_pmem_compat
>> # lsmod|grep dax
>> dax_pmem_compat        16384  0
>> device_dax             20480  1 dax_pmem_compat
>> dax_pmem_core          16384  1 dax_pmem_compat
>>
>> Seems some configurations are different ?
>>
>> Can you share your info as above ? Thanks.
> Alternatively maybe you can use libdaxctl that automatically handles
> the ABI differences between compat-dax-class and dax-bus layouts? I
> didn't recommend it before because I was not sure about qemu's policy
> about taking on new library dependencies, but with libdaxctl you could
> do something like (untested):
>
> path = g_strdup_printf("/sys/dev/char/%d:%d", major(st.st_rdev),
> minor(st.st_rdev));
> rpath = realpath(path, NULL);
> daxctl_region_foreach(ctx, region)
>      if (strstr(daxctl_region_get_path(region), rpath)) {
>          align = daxctl_region_get_align(region);
>          break;
>      }

Thanks for your suggestion. I'll test it.

Thanks,

Jingqi


Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Paolo Bonzini 3 years, 11 months ago
On 09/04/20 18:46, Dan Williams wrote:
> Alternatively maybe you can use libdaxctl that automatically handles
> the ABI differences between compat-dax-class and dax-bus layouts? I
> didn't recommend it before because I was not sure about qemu's policy
> about taking on new library dependencies

In this case I think the new dependency is more than justified, thanks!

Paolo


Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Liu, Jingqi 3 years, 11 months ago
Ping.

Any comments are appreciated.

Hi Paolo, Richard,

Any comments about this ?


Thanks,

Jingqi

On 4/1/2020 11:13 AM, Liu, Jingqi wrote:
> If the backend file is devdax pmem character device, the alignment
> specified by the option 'align=NUM' in the '-object memory-backend-file'
> needs to match the alignment requirement of the devdax pmem character device.
>
> This patch fetches the devdax pmem file 'align', so that we can compare
> it with the NUM of 'align=NUM'.
> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>
> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
> ---
>   exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 45 insertions(+), 1 deletion(-)
>
> diff --git a/exec.c b/exec.c
> index de9d949902..8221abffec 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>       return size;
>   }
>   
> +static int64_t get_file_align(int fd)
> +{
> +    int64_t align = -1;
> +#if defined(__linux__)
> +    struct stat st;
> +
> +    if (fstat(fd, &st) < 0) {
> +        return -errno;
> +    }
> +
> +    /* Special handling for devdax character devices */
> +    if (S_ISCHR(st.st_mode)) {
> +        g_autofree char *subsystem_path = NULL;
> +        g_autofree char *subsystem = NULL;
> +
> +        subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
> +                                         major(st.st_rdev), minor(st.st_rdev));
> +        subsystem = g_file_read_link(subsystem_path, NULL);
> +
> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
> +            g_autofree char *align_path = NULL;
> +            g_autofree char *align_str = NULL;
> +
> +            align_path = g_strdup_printf("/sys/dev/char/%d:%d/device/align",
> +                                    major(st.st_rdev), minor(st.st_rdev));
> +
> +            if (g_file_get_contents(align_path, &align_str, NULL, NULL)) {
> +                return g_ascii_strtoll(align_str, NULL, 0);
> +            }
> +        }
> +    }
> +#endif /* defined(__linux__) */
> +
> +    return align;
> +}
> +
>   static int file_ram_open(const char *path,
>                            const char *region_name,
>                            bool *created,
> @@ -2275,7 +2311,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>   {
>       RAMBlock *new_block;
>       Error *local_err = NULL;
> -    int64_t file_size;
> +    int64_t file_size, file_align;
>   
>       /* Just support these ram flags by now. */
>       assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
> @@ -2311,6 +2347,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
>           return NULL;
>       }
>   
> +    file_align = get_file_align(fd);
> +    if (file_align > 0 && mr && file_align > mr->align) {
> +        error_setg(errp, "backing store align 0x%" PRIx64
> +                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
> +                   file_align, mr->align);
> +        return NULL;
> +    }
> +
>       new_block = g_malloc0(sizeof(*new_block));
>       new_block->mr = mr;
>       new_block->used_length = size;

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Paolo Bonzini 3 years, 11 months ago
On 07/04/20 09:29, Liu, Jingqi wrote:
> Ping.
> 
> Any comments are appreciated.
> 
> Hi Paolo, Richard,
> 
> Any comments about this ?

I was hoping to get a review from someone else because I have no way to
test it.  But I've now queued the patch, thanks.

Paolo

> 
> Thanks,
> 
> Jingqi
> 
> On 4/1/2020 11:13 AM, Liu, Jingqi wrote:
>> If the backend file is devdax pmem character device, the alignment
>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>> needs to match the alignment requirement of the devdax pmem character
>> device.
>>
>> This patch fetches the devdax pmem file 'align', so that we can compare
>> it with the NUM of 'align=NUM'.
>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>
>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>
>> Cc: Dan Williams <dan.j.williams@intel.com>
>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>> ---
>>   exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 45 insertions(+), 1 deletion(-)
>>
>> diff --git a/exec.c b/exec.c
>> index de9d949902..8221abffec 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>       return size;
>>   }
>>   +static int64_t get_file_align(int fd)
>> +{
>> +    int64_t align = -1;
>> +#if defined(__linux__)
>> +    struct stat st;
>> +
>> +    if (fstat(fd, &st) < 0) {
>> +        return -errno;
>> +    }
>> +
>> +    /* Special handling for devdax character devices */
>> +    if (S_ISCHR(st.st_mode)) {
>> +        g_autofree char *subsystem_path = NULL;
>> +        g_autofree char *subsystem = NULL;
>> +
>> +        subsystem_path =
>> g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>> +                                         major(st.st_rdev),
>> minor(st.st_rdev));
>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>> +
>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>> +            g_autofree char *align_path = NULL;
>> +            g_autofree char *align_str = NULL;
>> +
>> +            align_path =
>> g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>> +                                    major(st.st_rdev),
>> minor(st.st_rdev));
>> +
>> +            if (g_file_get_contents(align_path, &align_str, NULL,
>> NULL)) {
>> +                return g_ascii_strtoll(align_str, NULL, 0);
>> +            }
>> +        }
>> +    }
>> +#endif /* defined(__linux__) */
>> +
>> +    return align;
>> +}
>> +
>>   static int file_ram_open(const char *path,
>>                            const char *region_name,
>>                            bool *created,
>> @@ -2275,7 +2311,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
>> size, MemoryRegion *mr,
>>   {
>>       RAMBlock *new_block;
>>       Error *local_err = NULL;
>> -    int64_t file_size;
>> +    int64_t file_size, file_align;
>>         /* Just support these ram flags by now. */
>>       assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
>> @@ -2311,6 +2347,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
>> size, MemoryRegion *mr,
>>           return NULL;
>>       }
>>   +    file_align = get_file_align(fd);
>> +    if (file_align > 0 && mr && file_align > mr->align) {
>> +        error_setg(errp, "backing store align 0x%" PRIx64
>> +                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
>> +                   file_align, mr->align);
>> +        return NULL;
>> +    }
>> +
>>       new_block = g_malloc0(sizeof(*new_block));
>>       new_block->mr = mr;
>>       new_block->used_length = size;
> 


Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Dan Williams 3 years, 11 months ago
On Tue, Apr 7, 2020 at 1:08 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
> On 07/04/20 09:29, Liu, Jingqi wrote:
> > Ping.
> >
> > Any comments are appreciated.
> >
> > Hi Paolo, Richard,
> >
> > Any comments about this ?
>
> I was hoping to get a review from someone else because I have no way to
> test it.  But I've now queued the patch, thanks.

Does qemu run tests in a nested VM? The difficult aspect of testing
devdax is that you need to boot your kernel with a special option or
have existing memory ranges assigned to the device. Although, Joao had
thoughts about allowing dynamic creation of device-dax instance by hot
unplugging memory.


>
> Paolo
>
> >
> > Thanks,
> >
> > Jingqi
> >
> > On 4/1/2020 11:13 AM, Liu, Jingqi wrote:
> >> If the backend file is devdax pmem character device, the alignment
> >> specified by the option 'align=NUM' in the '-object memory-backend-file'
> >> needs to match the alignment requirement of the devdax pmem character
> >> device.
> >>
> >> This patch fetches the devdax pmem file 'align', so that we can compare
> >> it with the NUM of 'align=NUM'.
> >> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
> >>
> >> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
> >> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
> >>
> >> Cc: Dan Williams <dan.j.williams@intel.com>
> >> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
> >> ---
> >>   exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
> >>   1 file changed, 45 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/exec.c b/exec.c
> >> index de9d949902..8221abffec 100644
> >> --- a/exec.c
> >> +++ b/exec.c
> >> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
> >>       return size;
> >>   }
> >>   +static int64_t get_file_align(int fd)
> >> +{
> >> +    int64_t align = -1;
> >> +#if defined(__linux__)
> >> +    struct stat st;
> >> +
> >> +    if (fstat(fd, &st) < 0) {
> >> +        return -errno;
> >> +    }
> >> +
> >> +    /* Special handling for devdax character devices */
> >> +    if (S_ISCHR(st.st_mode)) {
> >> +        g_autofree char *subsystem_path = NULL;
> >> +        g_autofree char *subsystem = NULL;
> >> +
> >> +        subsystem_path =
> >> g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
> >> +                                         major(st.st_rdev),
> >> minor(st.st_rdev));
> >> +        subsystem = g_file_read_link(subsystem_path, NULL);
> >> +
> >> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
> >> +            g_autofree char *align_path = NULL;
> >> +            g_autofree char *align_str = NULL;
> >> +
> >> +            align_path =
> >> g_strdup_printf("/sys/dev/char/%d:%d/device/align",
> >> +                                    major(st.st_rdev),
> >> minor(st.st_rdev));
> >> +
> >> +            if (g_file_get_contents(align_path, &align_str, NULL,
> >> NULL)) {
> >> +                return g_ascii_strtoll(align_str, NULL, 0);
> >> +            }
> >> +        }
> >> +    }
> >> +#endif /* defined(__linux__) */
> >> +
> >> +    return align;
> >> +}
> >> +
> >>   static int file_ram_open(const char *path,
> >>                            const char *region_name,
> >>                            bool *created,
> >> @@ -2275,7 +2311,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
> >> size, MemoryRegion *mr,
> >>   {
> >>       RAMBlock *new_block;
> >>       Error *local_err = NULL;
> >> -    int64_t file_size;
> >> +    int64_t file_size, file_align;
> >>         /* Just support these ram flags by now. */
> >>       assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
> >> @@ -2311,6 +2347,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
> >> size, MemoryRegion *mr,
> >>           return NULL;
> >>       }
> >>   +    file_align = get_file_align(fd);
> >> +    if (file_align > 0 && mr && file_align > mr->align) {
> >> +        error_setg(errp, "backing store align 0x%" PRIx64
> >> +                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
> >> +                   file_align, mr->align);
> >> +        return NULL;

Is there any downside to just making the alignment value be the max of
the device-dax instance align and the command line option? Why force
someone to debug the option unnecessarily?

Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Joao Martins 3 years, 11 months ago
On 4/7/20 9:16 AM, Dan Williams wrote:
> On Tue, Apr 7, 2020 at 1:08 AM Paolo Bonzini <pbonzini@redhat.com> wrote:
>> On 07/04/20 09:29, Liu, Jingqi wrote:
>>> Ping.
>>>
>>> Any comments are appreciated.
>>>
>>> Hi Paolo, Richard,
>>>
>>> Any comments about this ?
>>
>> I was hoping to get a review from someone else because I have no way to
>> test it.  But I've now queued the patch, thanks.
> 

FWIW, I tested it (and didn't work) . Later found something odd wrt to the
device path.

Paolo if it helps your future testing, you can have a device-dax with something
like this:

   efi_fake_mem=4G@16G:0x40000 # creates a dax0.0 device with sz 4G, 2M aligned

But it requires dax_hmem which is v5.5+. Or alternatively use memmap=4G!16G (and
using ndctl create-namespace -r 0 -a <align>) and it creates pmem legacy device.


> Does qemu run tests in a nested VM? The difficult aspect of testing
> devdax is that you need to boot your kernel with a special option or
> have existing memory ranges assigned to the device. Although, Joao had
> thoughts about allowing dynamic creation of device-dax instance by hot
> unplugging memory.
> 

The idea was to get feature parity with hugetlbfs where you can assign a number
of 2M/1G pages at runtime. Thus giving a more flexible manner of assigning
memory to hmem.

This means we would create dax regions -- which can be sub-divided into dax
devices -- dynamically by hotunpluging a memory%u device first and then
reassigning it to dax_hmem driver (and thus marking it as 'soft-reserved').
Which could be given back to system-ram via dax_kmem. Naturally this assumes you
can hot-unplug the memory block before assigning it to dax_hmem, which might be
rather unpredictable. via kernel cmdline still is, though, the most
deterministic manner of assigning memory say at a bigger page granularities
(e.g. 1G).

But this is hotunplug-assign-to-hmem is still on paper, I haven't yet prototyped
this to see where it all falls apart.

>>> On 4/1/2020 11:13 AM, Liu, Jingqi wrote:
>>>> If the backend file is devdax pmem character device, the alignment
>>>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>>>> needs to match the alignment requirement of the devdax pmem character
>>>> device.
>>>>
>>>> This patch fetches the devdax pmem file 'align', so that we can compare
>>>> it with the NUM of 'align=NUM'.
>>>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>>>
>>>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>>>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>>>
>>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>>>> ---
>>>>   exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>>>   1 file changed, 45 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/exec.c b/exec.c
>>>> index de9d949902..8221abffec 100644
>>>> --- a/exec.c
>>>> +++ b/exec.c
>>>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>>>       return size;
>>>>   }
>>>>   +static int64_t get_file_align(int fd)
>>>> +{
>>>> +    int64_t align = -1;
>>>> +#if defined(__linux__)
>>>> +    struct stat st;
>>>> +
>>>> +    if (fstat(fd, &st) < 0) {
>>>> +        return -errno;
>>>> +    }
>>>> +
>>>> +    /* Special handling for devdax character devices */
>>>> +    if (S_ISCHR(st.st_mode)) {
>>>> +        g_autofree char *subsystem_path = NULL;
>>>> +        g_autofree char *subsystem = NULL;
>>>> +
>>>> +        subsystem_path =
>>>> g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>>>> +                                         major(st.st_rdev),
>>>> minor(st.st_rdev));
>>>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>>>> +
>>>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>>>> +            g_autofree char *align_path = NULL;
>>>> +            g_autofree char *align_str = NULL;
>>>> +
>>>> +            align_path =
>>>> g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>>>> +                                    major(st.st_rdev),
>>>> minor(st.st_rdev));
>>>> +
>>>> +            if (g_file_get_contents(align_path, &align_str, NULL,
>>>> NULL)) {
>>>> +                return g_ascii_strtoll(align_str, NULL, 0);
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +#endif /* defined(__linux__) */
>>>> +
>>>> +    return align;
>>>> +}
>>>> +
>>>>   static int file_ram_open(const char *path,
>>>>                            const char *region_name,
>>>>                            bool *created,
>>>> @@ -2275,7 +2311,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
>>>> size, MemoryRegion *mr,
>>>>   {
>>>>       RAMBlock *new_block;
>>>>       Error *local_err = NULL;
>>>> -    int64_t file_size;
>>>> +    int64_t file_size, file_align;
>>>>         /* Just support these ram flags by now. */
>>>>       assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
>>>> @@ -2311,6 +2347,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
>>>> size, MemoryRegion *mr,
>>>>           return NULL;
>>>>       }
>>>>   +    file_align = get_file_align(fd);
>>>> +    if (file_align > 0 && mr && file_align > mr->align) {
>>>> +        error_setg(errp, "backing store align 0x%" PRIx64
>>>> +                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
>>>> +                   file_align, mr->align);
>>>> +        return NULL;
> 
> Is there any downside to just making the alignment value be the max of
> the device-dax instance align and the command line option? Why force
> someone to debug the option unnecessarily?
> 
+1

Perhaps we can auto-detect that @align was not set and then we would set the max
align value. But if user has set a value over command line we would validate it
like Jingqi is doing above. Roughly, something like this just as a suggestion:

@@ -2354,11 +2354,16 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size,
MemoryRegion *mr,
     }

     file_align = get_file_align(fd);
-    if (file_align > 0 && mr && file_align > mr->align) {
-        error_setg(errp, "backing store align 0x%" PRIx64
-                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
-                   file_align, mr->align);
-        return NULL;
+    if (file_align > 0 && mr) {
+        /* auto detect alignment if none is specified */
+        if (!mr->align)
+            mr->align = file_align;
+        if (file_align > mr->align) {
+            error_setg(errp, "backing store align 0x%" PRIx64
+                       " is larger than 'align' option 0x" RAM_ADDR_FMT,
+                       file_align, mr->align);
+            return NULL;
+        }
     }



Re: [PATCH] exec: fetch the alignment of Linux devdax pmem character device nodes
Posted by Liu, Jingqi 3 years, 11 months ago
On 4/7/2020 4:08 PM, Paolo Bonzini wrote:
> On 07/04/20 09:29, Liu, Jingqi wrote:
>> Ping.
>>
>> Any comments are appreciated.
>>
>> Hi Paolo, Richard,
>>
>> Any comments about this ?
> I was hoping to get a review from someone else because I have no way to
> test it.  But I've now queued the patch, thanks.

Hi Paolo,

Thanks for your comments.

Hi Dan,

Can you help review this patch ?

Thanks,

Jingqi

> Paolo
>
>> Thanks,
>>
>> Jingqi
>>
>> On 4/1/2020 11:13 AM, Liu, Jingqi wrote:
>>> If the backend file is devdax pmem character device, the alignment
>>> specified by the option 'align=NUM' in the '-object memory-backend-file'
>>> needs to match the alignment requirement of the devdax pmem character
>>> device.
>>>
>>> This patch fetches the devdax pmem file 'align', so that we can compare
>>> it with the NUM of 'align=NUM'.
>>> The NUM needs to be larger than or equal to the devdax pmem file 'align'.
>>>
>>> It also fixes the problem that mmap() returns failure in qemu_ram_mmap()
>>> when the NUM of 'align=NUM' is less than the devdax pmem file 'align'.
>>>
>>> Cc: Dan Williams <dan.j.williams@intel.com>
>>> Signed-off-by: Jingqi Liu <jingqi.liu@intel.com>
>>> ---
>>>    exec.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>>    1 file changed, 45 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/exec.c b/exec.c
>>> index de9d949902..8221abffec 100644
>>> --- a/exec.c
>>> +++ b/exec.c
>>> @@ -1736,6 +1736,42 @@ static int64_t get_file_size(int fd)
>>>        return size;
>>>    }
>>>    +static int64_t get_file_align(int fd)
>>> +{
>>> +    int64_t align = -1;
>>> +#if defined(__linux__)
>>> +    struct stat st;
>>> +
>>> +    if (fstat(fd, &st) < 0) {
>>> +        return -errno;
>>> +    }
>>> +
>>> +    /* Special handling for devdax character devices */
>>> +    if (S_ISCHR(st.st_mode)) {
>>> +        g_autofree char *subsystem_path = NULL;
>>> +        g_autofree char *subsystem = NULL;
>>> +
>>> +        subsystem_path =
>>> g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
>>> +                                         major(st.st_rdev),
>>> minor(st.st_rdev));
>>> +        subsystem = g_file_read_link(subsystem_path, NULL);
>>> +
>>> +        if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
>>> +            g_autofree char *align_path = NULL;
>>> +            g_autofree char *align_str = NULL;
>>> +
>>> +            align_path =
>>> g_strdup_printf("/sys/dev/char/%d:%d/device/align",
>>> +                                    major(st.st_rdev),
>>> minor(st.st_rdev));
>>> +
>>> +            if (g_file_get_contents(align_path, &align_str, NULL,
>>> NULL)) {
>>> +                return g_ascii_strtoll(align_str, NULL, 0);
>>> +            }
>>> +        }
>>> +    }
>>> +#endif /* defined(__linux__) */
>>> +
>>> +    return align;
>>> +}
>>> +
>>>    static int file_ram_open(const char *path,
>>>                             const char *region_name,
>>>                             bool *created,
>>> @@ -2275,7 +2311,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
>>> size, MemoryRegion *mr,
>>>    {
>>>        RAMBlock *new_block;
>>>        Error *local_err = NULL;
>>> -    int64_t file_size;
>>> +    int64_t file_size, file_align;
>>>          /* Just support these ram flags by now. */
>>>        assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0);
>>> @@ -2311,6 +2347,14 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t
>>> size, MemoryRegion *mr,
>>>            return NULL;
>>>        }
>>>    +    file_align = get_file_align(fd);
>>> +    if (file_align > 0 && mr && file_align > mr->align) {
>>> +        error_setg(errp, "backing store align 0x%" PRIx64
>>> +                   " is larger than 'align' option 0x" RAM_ADDR_FMT,
>>> +                   file_align, mr->align);
>>> +        return NULL;
>>> +    }
>>> +
>>>        new_block = g_malloc0(sizeof(*new_block));
>>>        new_block->mr = mr;
>>>        new_block->used_length = size;