The new fsdev driver provides pages/folios initialized compatibly with
fsdax - normal rather than devdax-style refcounting, and starting out
with order-0 folios.
When fsdev binds to a daxdev, it is usually (always?) switching from the
devdax mode (device.c), which pre-initializes compound folios according
to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the
folios into a fsdax-compatible state.
A side effect of this is that raw mmap doesn't (can't?) work on an fsdev
dax instance. Accordingly, The fsdev driver does not provide raw mmap -
devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw
mmap capability.
In this commit is just the framework, which remaps pages/folios compatibly
with fsdax.
Enabling dax changes:
* bus.h: add DAXDRV_FSDEV_TYPE driver type
* bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs
* dax.h: prototype inode_dax(), which fsdev needs
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Suggested-by: Gregory Price <gourry@gourry.net>
Signed-off-by: John Groves <john@groves.net>
---
MAINTAINERS | 8 ++
drivers/dax/Kconfig | 17 +++
drivers/dax/Makefile | 2 +
drivers/dax/bus.c | 4 +
drivers/dax/bus.h | 1 +
drivers/dax/fsdev.c | 276 +++++++++++++++++++++++++++++++++++++++++++
include/linux/dax.h | 4 +
7 files changed, 312 insertions(+)
create mode 100644 drivers/dax/fsdev.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 765ad2daa218..90429cb06090 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7184,6 +7184,14 @@ L: linux-cxl@vger.kernel.org
S: Supported
F: drivers/dax/
+DEVICE DIRECT ACCESS (DAX) [fsdev_dax]
+M: John Groves <jgroves@micron.com>
+M: John Groves <John@Groves.net>
+L: nvdimm@lists.linux.dev
+L: linux-cxl@vger.kernel.org
+S: Supported
+F: drivers/dax/fsdev.c
+
DEVICE FREQUENCY (DEVFREQ)
M: MyungJoo Ham <myungjoo.ham@samsung.com>
M: Kyungmin Park <kyungmin.park@samsung.com>
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index d656e4c0eb84..491325d914a8 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -78,4 +78,21 @@ config DEV_DAX_KMEM
Say N if unsure.
+config DEV_DAX_FS
+ tristate "FSDEV DAX: fs-dax compatible device driver"
+ depends on DEV_DAX
+ default DEV_DAX
+ help
+ Support a device-dax driver mode that is compatible with fs-dax
+ filesystems. Unlike the standard device-dax driver which
+ pre-initializes compound folios based on device alignment, this
+ driver leaves folios uninitialized (similar to pmem) allowing
+ fs-dax to manage folio lifecycles dynamically.
+
+ This driver uses MEMORY_DEVICE_FS_DAX type and does not set
+ vmemmap_shift, making it compatible with filesystems like famfs
+ that use the iomap-based fs-dax infrastructure.
+
+ Say M if you plan to use fs-dax filesystems on /dev/dax devices.
+ Say N if you only need raw character device access to DAX memory.
endif
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 5ed5c39857c8..77aa3df3285c 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -4,11 +4,13 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o
obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o
+obj-$(CONFIG_DEV_DAX_FS) += fsdev_dax.o
dax-y := super.o
dax-y += bus.o
device_dax-y := device.o
dax_pmem-y := pmem.o
dax_cxl-y := cxl.o
+fsdev_dax-y := fsdev.o
obj-y += hmem/
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index a2f9a3cc30a5..0d7228acb913 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -84,6 +84,10 @@ static int dax_match_type(const struct dax_device_driver *dax_drv, struct device
!IS_ENABLED(CONFIG_DEV_DAX_KMEM))
return 1;
+ /* fsdev driver can also bind to device-type dax devices */
+ if (dax_drv->type == DAXDRV_FSDEV_TYPE && type == DAXDRV_DEVICE_TYPE)
+ return 1;
+
return 0;
}
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index cbbf64443098..880bdf7e72d7 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -31,6 +31,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
enum dax_driver_type {
DAXDRV_KMEM_TYPE,
DAXDRV_DEVICE_TYPE,
+ DAXDRV_FSDEV_TYPE,
};
struct dax_device_driver {
diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c
new file mode 100644
index 000000000000..2a3249d1529c
--- /dev/null
+++ b/drivers/dax/fsdev.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2026 Micron Technology, Inc. */
+#include <linux/memremap.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/slab.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "dax-private.h"
+#include "bus.h"
+
+/*
+ * FS-DAX compatible devdax driver
+ *
+ * Unlike drivers/dax/device.c which pre-initializes compound folios based
+ * on device alignment (via vmemmap_shift), this driver leaves folios
+ * uninitialized similar to pmem. This allows fs-dax filesystems like famfs
+ * to work without needing special handling for pre-initialized folios.
+ *
+ * Key differences from device.c:
+ * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC)
+ * - vmemmap_shift is NOT set (folios remain order-0)
+ * - fs-dax can dynamically create compound folios as needed
+ * - No mmap support - all access is through fs-dax/iomap
+ */
+
+
+static void fsdev_cdev_del(void *cdev)
+{
+ cdev_del(cdev);
+}
+
+static void fsdev_kill(void *dev_dax)
+{
+ kill_dev_dax(dev_dax);
+}
+
+/*
+ * Page map operations for FS-DAX mode
+ * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c
+ *
+ * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX.
+ * The core mm code in free_zone_device_folio() handles the wake_up_var()
+ * directly for this memory type.
+ */
+static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap,
+ unsigned long pfn, unsigned long nr_pages, int mf_flags)
+{
+ struct dev_dax *dev_dax = pgmap->owner;
+ u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start;
+ u64 len = nr_pages << PAGE_SHIFT;
+
+ return dax_holder_notify_failure(dev_dax->dax_dev, offset,
+ len, mf_flags);
+}
+
+static const struct dev_pagemap_ops fsdev_pagemap_ops = {
+ .memory_failure = fsdev_pagemap_memory_failure,
+};
+
+/*
+ * Clear any stale folio state from pages in the given range.
+ * This is necessary because device_dax pre-initializes compound folios
+ * based on vmemmap_shift, and that state may persist after driver unbind.
+ * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
+ * expects to find clean order-0 folios that it can build into compound
+ * folios on demand.
+ *
+ * At probe time, no filesystem should be mounted yet, so all mappings
+ * are stale and must be cleared along with compound state.
+ */
+static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
+{
+ int i;
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+ unsigned long pfn, end_pfn;
+
+ pfn = PHYS_PFN(range->start);
+ end_pfn = PHYS_PFN(range->end) + 1;
+
+ while (pfn < end_pfn) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = (struct folio *)page;
+ struct dev_pagemap *pgmap = page_pgmap(page);
+ int order = folio_order(folio);
+
+ /*
+ * Clear any stale mapping pointer. At probe time,
+ * no filesystem is mounted, so any mapping is stale.
+ */
+ folio->mapping = NULL;
+ folio->share = 0;
+
+ if (order > 0) {
+ int j;
+
+ folio_reset_order(folio);
+ for (j = 0; j < (1UL << order); j++) {
+ struct page *p = page + j;
+
+ ClearPageHead(p);
+ clear_compound_head(p);
+ ((struct folio *)p)->mapping = NULL;
+ ((struct folio *)p)->share = 0;
+ ((struct folio *)p)->pgmap = pgmap;
+ }
+ pfn += (1UL << order);
+ } else {
+ folio->pgmap = pgmap;
+ pfn++;
+ }
+ }
+ }
+}
+
+static int fsdev_open(struct inode *inode, struct file *filp)
+{
+ struct dax_device *dax_dev = inode_dax(inode);
+ struct dev_dax *dev_dax = dax_get_private(dax_dev);
+
+ dev_dbg(&dev_dax->dev, "trace\n");
+ filp->private_data = dev_dax;
+
+ return 0;
+}
+
+static int fsdev_release(struct inode *inode, struct file *filp)
+{
+ struct dev_dax *dev_dax = filp->private_data;
+
+ dev_dbg(&dev_dax->dev, "trace\n");
+ return 0;
+}
+
+static const struct file_operations fsdev_fops = {
+ .llseek = noop_llseek,
+ .owner = THIS_MODULE,
+ .open = fsdev_open,
+ .release = fsdev_release,
+};
+
+static int fsdev_dax_probe(struct dev_dax *dev_dax)
+{
+ struct dax_device *dax_dev = dev_dax->dax_dev;
+ struct device *dev = &dev_dax->dev;
+ struct dev_pagemap *pgmap;
+ u64 data_offset = 0;
+ struct inode *inode;
+ struct cdev *cdev;
+ void *addr;
+ int rc, i;
+
+ if (static_dev_dax(dev_dax)) {
+ if (dev_dax->nr_range > 1) {
+ dev_warn(dev,
+ "static pgmap / multi-range device conflict\n");
+ return -EINVAL;
+ }
+
+ pgmap = dev_dax->pgmap;
+ } else {
+ if (dev_dax->pgmap) {
+ dev_warn(dev,
+ "dynamic-dax with pre-populated page map\n");
+ return -EINVAL;
+ }
+
+ pgmap = devm_kzalloc(dev,
+ struct_size(pgmap, ranges, dev_dax->nr_range - 1),
+ GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->nr_range = dev_dax->nr_range;
+ dev_dax->pgmap = pgmap;
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+
+ pgmap->ranges[i] = *range;
+ }
+ }
+
+ for (i = 0; i < dev_dax->nr_range; i++) {
+ struct range *range = &dev_dax->ranges[i].range;
+
+ if (!devm_request_mem_region(dev, range->start,
+ range_len(range), dev_name(dev))) {
+ dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
+ i, range->start, range->end);
+ return -EBUSY;
+ }
+ }
+
+ /*
+ * FS-DAX compatible mode: Use MEMORY_DEVICE_FS_DAX type and
+ * do NOT set vmemmap_shift. This leaves folios at order-0,
+ * allowing fs-dax to dynamically create compound folios as needed
+ * (similar to pmem behavior).
+ */
+ pgmap->type = MEMORY_DEVICE_FS_DAX;
+ pgmap->ops = &fsdev_pagemap_ops;
+ pgmap->owner = dev_dax;
+
+ /*
+ * CRITICAL DIFFERENCE from device.c:
+ * We do NOT set vmemmap_shift here, even if align > PAGE_SIZE.
+ * This ensures folios remain order-0 and are compatible with
+ * fs-dax's folio management.
+ */
+
+ addr = devm_memremap_pages(dev, pgmap);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+
+ /*
+ * Clear any stale compound folio state left over from a previous
+ * driver (e.g., device_dax with vmemmap_shift).
+ */
+ fsdev_clear_folio_state(dev_dax);
+
+ /* Detect whether the data is at a non-zero offset into the memory */
+ if (pgmap->range.start != dev_dax->ranges[0].range.start) {
+ u64 phys = dev_dax->ranges[0].range.start;
+ u64 pgmap_phys = dev_dax->pgmap[0].range.start;
+
+ if (!WARN_ON(pgmap_phys > phys))
+ data_offset = phys - pgmap_phys;
+
+ pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n",
+ __func__, phys, pgmap_phys, data_offset);
+ }
+
+ inode = dax_inode(dax_dev);
+ cdev = inode->i_cdev;
+ cdev_init(cdev, &fsdev_fops);
+ cdev->owner = dev->driver->owner;
+ cdev_set_parent(cdev, &dev->kobj);
+ rc = cdev_add(cdev, dev->devt, 1);
+ if (rc)
+ return rc;
+
+ rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
+ if (rc)
+ return rc;
+
+ run_dax(dax_dev);
+ return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
+}
+
+static struct dax_device_driver fsdev_dax_driver = {
+ .probe = fsdev_dax_probe,
+ .type = DAXDRV_FSDEV_TYPE,
+};
+
+static int __init dax_init(void)
+{
+ return dax_driver_register(&fsdev_dax_driver);
+}
+
+static void __exit dax_exit(void)
+{
+ dax_driver_unregister(&fsdev_dax_driver);
+}
+
+MODULE_AUTHOR("John Groves");
+MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
+MODULE_LICENSE("GPL");
+module_init(dax_init);
+module_exit(dax_exit);
+MODULE_ALIAS_DAX_DEVICE(0);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9d624f4d9df6..74e098010016 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -51,6 +51,10 @@ struct dax_holder_operations {
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+
+#if IS_ENABLED(CONFIG_DEV_DAX_FS)
+struct dax_device *inode_dax(struct inode *inode);
+#endif
void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
--
2.49.0
On Wed, 7 Jan 2026 09:33:11 -0600
John Groves <John@Groves.net> wrote:
> The new fsdev driver provides pages/folios initialized compatibly with
> fsdax - normal rather than devdax-style refcounting, and starting out
> with order-0 folios.
>
> When fsdev binds to a daxdev, it is usually (always?) switching from the
> devdax mode (device.c), which pre-initializes compound folios according
> to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the
> folios into a fsdax-compatible state.
>
> A side effect of this is that raw mmap doesn't (can't?) work on an fsdev
> dax instance. Accordingly, The fsdev driver does not provide raw mmap -
> devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw
> mmap capability.
>
> In this commit is just the framework, which remaps pages/folios compatibly
> with fsdax.
>
> Enabling dax changes:
>
> * bus.h: add DAXDRV_FSDEV_TYPE driver type
> * bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs
> * dax.h: prototype inode_dax(), which fsdev needs
>
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Suggested-by: Gregory Price <gourry@gourry.net>
> Signed-off-by: John Groves <john@groves.net>
> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> index d656e4c0eb84..491325d914a8 100644
> --- a/drivers/dax/Kconfig
> +++ b/drivers/dax/Kconfig
> @@ -78,4 +78,21 @@ config DEV_DAX_KMEM
>
> Say N if unsure.
>
> +config DEV_DAX_FS
> + tristate "FSDEV DAX: fs-dax compatible device driver"
> + depends on DEV_DAX
> + default DEV_DAX
What's the logic for the default? Generally I'd not expect a
default for something new like this (so default of default == no)
> + help
> + Support a device-dax driver mode that is compatible with fs-dax
...
> struct dax_device_driver {
> diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c
> new file mode 100644
> index 000000000000..2a3249d1529c
> --- /dev/null
> +++ b/drivers/dax/fsdev.c
> @@ -0,0 +1,276 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright(c) 2026 Micron Technology, Inc. */
> +#include <linux/memremap.h>
> +#include <linux/pagemap.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/cdev.h>
> +#include <linux/slab.h>
> +#include <linux/dax.h>
> +#include <linux/fs.h>
> +#include <linux/mm.h>
> +#include "dax-private.h"
> +#include "bus.h"
...
> +static void fsdev_cdev_del(void *cdev)
> +{
> + cdev_del(cdev);
> +}
> +
> +static void fsdev_kill(void *dev_dax)
> +{
> + kill_dev_dax(dev_dax);
> +}
...
> +/*
> + * Clear any stale folio state from pages in the given range.
> + * This is necessary because device_dax pre-initializes compound folios
> + * based on vmemmap_shift, and that state may persist after driver unbind.
What's the argument for not cleaning these out in the unbind path for device_dax?
I can see that it might be an optimization if some other code path blindly
overwrites all this state.
> + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
> + * expects to find clean order-0 folios that it can build into compound
> + * folios on demand.
> + *
> + * At probe time, no filesystem should be mounted yet, so all mappings
> + * are stale and must be cleared along with compound state.
> + */
> +static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
> +{
> + int i;
It's becoming increasingly common to declare loop variables as
for (int i = 0; i <...
and given that saves us a few lines here it seems worth doing.
> +
> + for (i = 0; i < dev_dax->nr_range; i++) {
> + struct range *range = &dev_dax->ranges[i].range;
> + unsigned long pfn, end_pfn;
> +
> + pfn = PHYS_PFN(range->start);
> + end_pfn = PHYS_PFN(range->end) + 1;
Might as well do
unsigned long pfn = PHY_PFN(range->start);
unsigned long end_pfn = PHYS_PFN(range->end) + 1;
> +
> + while (pfn < end_pfn) {
> + struct page *page = pfn_to_page(pfn);
> + struct folio *folio = (struct folio *)page;
> + struct dev_pagemap *pgmap = page_pgmap(page);
> + int order = folio_order(folio);
> +
> + /*
> + * Clear any stale mapping pointer. At probe time,
> + * no filesystem is mounted, so any mapping is stale.
> + */
> + folio->mapping = NULL;
> + folio->share = 0;
> +
> + if (order > 0) {
> + int j;
> +
> + folio_reset_order(folio);
> + for (j = 0; j < (1UL << order); j++) {
> + struct page *p = page + j;
> +
> + ClearPageHead(p);
> + clear_compound_head(p);
> + ((struct folio *)p)->mapping = NULL;
This code block is very similar to a chunk in dax_folio_put() in fs/dax.c
Can we create a helper for both to use?
I note that uses a local struct folio *new_folio to avoid multiple casts.
I'd do similar here even if it's a long line.
If not possible to use a common helper, it is probably still worth
having a helper here for the stuff in the while loop just to reduce indent
and improve readability a little.
> + ((struct folio *)p)->share = 0;
> + ((struct folio *)p)->pgmap = pgmap;
> + }
> + pfn += (1UL << order);
> + } else {
> + folio->pgmap = pgmap;
> + pfn++;
> + }
> + }
> + }
> +}
> +
> +static int fsdev_open(struct inode *inode, struct file *filp)
> +{
> + struct dax_device *dax_dev = inode_dax(inode);
> + struct dev_dax *dev_dax = dax_get_private(dax_dev);
> +
> + dev_dbg(&dev_dax->dev, "trace\n");
Hmm. This is a somewhat odd, but I see dax/device.c does
the same thing and I guess that's because you are using
dynamic debug with function names turned on to provide the
'real' information.
> + filp->private_data = dev_dax;
> +
> + return 0;
> +}
> +static int fsdev_dax_probe(struct dev_dax *dev_dax)
> +{
> + struct dax_device *dax_dev = dev_dax->dax_dev;
> + struct device *dev = &dev_dax->dev;
> + struct dev_pagemap *pgmap;
> + u64 data_offset = 0;
> + struct inode *inode;
> + struct cdev *cdev;
> + void *addr;
> + int rc, i;
> +
A bunch of this is cut and paste from dax/device.c
If it carries on looking like this, can we have a helper module that
both drivers use with the common code in it? That would make the
difference more obvious as well.
> + if (static_dev_dax(dev_dax)) {
> + if (dev_dax->nr_range > 1) {
> + dev_warn(dev,
> + "static pgmap / multi-range device conflict\n");
> + return -EINVAL;
> + }
> +
> + pgmap = dev_dax->pgmap;
> + } else {
> + if (dev_dax->pgmap) {
> + dev_warn(dev,
> + "dynamic-dax with pre-populated page map\n");
Unless dax maintainers are very fussy about 80 chars, I'd go long on these as it's
only just over 80 chars on one line.
Given you are failing probe, not sure why dev_warn() is considered sufficient.
To me dev_err() seems more sensible. What you have matches dax/device.c though
so maybe there is a sound reason.
> + return -EINVAL;
> + }
> +
> + pgmap = devm_kzalloc(dev,
> + struct_size(pgmap, ranges, dev_dax->nr_range - 1),
> + GFP_KERNEL);
Pick an alignment style and stick to it. Either.
pgmap = devm_kzalloc(dev,
struct_size(pgmap, ranges, dev_dax->nr_range - 1),
GFP_KERNEL);
or go long for readability and do
pgmap = devm_kzalloc(dev,
struct_size(pgmap, ranges, dev_dax->nr_range - 1),
GFP_KERNEL);
> + if (!pgmap)
> + return -ENOMEM;
> +
> + pgmap->nr_range = dev_dax->nr_range;
> + dev_dax->pgmap = pgmap;
> +
> + for (i = 0; i < dev_dax->nr_range; i++) {
> + struct range *range = &dev_dax->ranges[i].range;
> +
> + pgmap->ranges[i] = *range;
> + }
> + }
> +
> + for (i = 0; i < dev_dax->nr_range; i++) {
> + struct range *range = &dev_dax->ranges[i].range;
> +
> + if (!devm_request_mem_region(dev, range->start,
> + range_len(range), dev_name(dev))) {
> + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
> + i, range->start, range->end);
> + return -EBUSY;
> + }
> + }
> +
> + /*
> + * FS-DAX compatible mode: Use MEMORY_DEVICE_FS_DAX type and
> + * do NOT set vmemmap_shift. This leaves folios at order-0,
> + * allowing fs-dax to dynamically create compound folios as needed
> + * (similar to pmem behavior).
> + */
> + pgmap->type = MEMORY_DEVICE_FS_DAX;
> + pgmap->ops = &fsdev_pagemap_ops;
> + pgmap->owner = dev_dax;
> +
> + /*
> + * CRITICAL DIFFERENCE from device.c:
> + * We do NOT set vmemmap_shift here, even if align > PAGE_SIZE.
> + * This ensures folios remain order-0 and are compatible with
> + * fs-dax's folio management.
> + */
> +
> + addr = devm_memremap_pages(dev, pgmap);
> + if (IS_ERR(addr))
> + return PTR_ERR(addr);
> +
> + /*
> + * Clear any stale compound folio state left over from a previous
> + * driver (e.g., device_dax with vmemmap_shift).
> + */
> + fsdev_clear_folio_state(dev_dax);
> +
> + /* Detect whether the data is at a non-zero offset into the memory */
> + if (pgmap->range.start != dev_dax->ranges[0].range.start) {
> + u64 phys = dev_dax->ranges[0].range.start;
> + u64 pgmap_phys = dev_dax->pgmap[0].range.start;
> +
> + if (!WARN_ON(pgmap_phys > phys))
> + data_offset = phys - pgmap_phys;
> +
> + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n",
> + __func__, phys, pgmap_phys, data_offset);
> + }
> +
> + inode = dax_inode(dax_dev);
> + cdev = inode->i_cdev;
> + cdev_init(cdev, &fsdev_fops);
> + cdev->owner = dev->driver->owner;
> + cdev_set_parent(cdev, &dev->kobj);
> + rc = cdev_add(cdev, dev->devt, 1);
> + if (rc)
> + return rc;
> +
> + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
> + if (rc)
> + return rc;
> +
> + run_dax(dax_dev);
> + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
> +}
> +
> +static struct dax_device_driver fsdev_dax_driver = {
> + .probe = fsdev_dax_probe,
> + .type = DAXDRV_FSDEV_TYPE,
> +};
> +
> +static int __init dax_init(void)
> +{
> + return dax_driver_register(&fsdev_dax_driver);
> +}
> +
> +static void __exit dax_exit(void)
> +{
> + dax_driver_unregister(&fsdev_dax_driver);
> +}
If these don't get more complex, maybe it's time for a dax specific define
using module_driver()
> +
> +MODULE_AUTHOR("John Groves");
> +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
> +MODULE_LICENSE("GPL");
> +module_init(dax_init);
> +module_exit(dax_exit);
> +MODULE_ALIAS_DAX_DEVICE(0);
Curious macro. Always has same parameter... Maybe ripe for just dropping the parameter?
On 26/01/08 11:31AM, Jonathan Cameron wrote: > On Wed, 7 Jan 2026 09:33:11 -0600 > John Groves <John@Groves.net> wrote: > > > The new fsdev driver provides pages/folios initialized compatibly with > > fsdax - normal rather than devdax-style refcounting, and starting out > > with order-0 folios. > > > > When fsdev binds to a daxdev, it is usually (always?) switching from the > > devdax mode (device.c), which pre-initializes compound folios according > > to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the > > folios into a fsdax-compatible state. > > > > A side effect of this is that raw mmap doesn't (can't?) work on an fsdev > > dax instance. Accordingly, The fsdev driver does not provide raw mmap - > > devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw > > mmap capability. > > > > In this commit is just the framework, which remaps pages/folios compatibly > > with fsdax. > > > > Enabling dax changes: > > > > * bus.h: add DAXDRV_FSDEV_TYPE driver type > > * bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs > > * dax.h: prototype inode_dax(), which fsdev needs > > > > Suggested-by: Dan Williams <dan.j.williams@intel.com> > > Suggested-by: Gregory Price <gourry@gourry.net> > > Signed-off-by: John Groves <john@groves.net> > > > diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig > > index d656e4c0eb84..491325d914a8 100644 > > --- a/drivers/dax/Kconfig > > +++ b/drivers/dax/Kconfig > > @@ -78,4 +78,21 @@ config DEV_DAX_KMEM > > > > Say N if unsure. > > > > +config DEV_DAX_FS > > + tristate "FSDEV DAX: fs-dax compatible device driver" > > + depends on DEV_DAX > > + default DEV_DAX > > What's the logic for the default? Generally I'd not expect a > default for something new like this (so default of default == no) My thinking is that this is harmless unless you use it, but if you need it you need it. So defaulting to include the module seems viable. [ ... ] John
On 26/01/08 09:12AM, John Groves wrote: > On 26/01/08 11:31AM, Jonathan Cameron wrote: > > On Wed, 7 Jan 2026 09:33:11 -0600 > > John Groves <John@Groves.net> wrote: [ ... ] > > > diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig > > > index d656e4c0eb84..491325d914a8 100644 > > > --- a/drivers/dax/Kconfig > > > +++ b/drivers/dax/Kconfig > > > @@ -78,4 +78,21 @@ config DEV_DAX_KMEM > > > > > > Say N if unsure. > > > > > > +config DEV_DAX_FS > > > + tristate "FSDEV DAX: fs-dax compatible device driver" > > > + depends on DEV_DAX > > > + default DEV_DAX > > > > What's the logic for the default? Generally I'd not expect a > > default for something new like this (so default of default == no) > > My thinking is that this is harmless unless you use it, but if you > need it you need it. So defaulting to include the module seems > viable. > > [ ... ] On further deliberation, I think I'd like to get rid of CONFIG_DEV_DAX_FS, and just include the fsdev_dax driver if DEV_DAX and FS_DAX are configured. Then CONFIG_FUSE_FAMFS_DAX (controlling the famfs code in fuse) can just depend on DEV_DAX, FS_DAX and FUSE_FS. That's where I'm leaning for the next rev of the series... John
On Thu, Jan 08, 2026 at 03:15:10PM -0600, John Groves wrote: > On 26/01/08 09:12AM, John Groves wrote: > > On 26/01/08 11:31AM, Jonathan Cameron wrote: > > > On Wed, 7 Jan 2026 09:33:11 -0600 > > > John Groves <John@Groves.net> wrote: > > [ ... ] > > > > > diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig > > > > index d656e4c0eb84..491325d914a8 100644 > > > > --- a/drivers/dax/Kconfig > > > > +++ b/drivers/dax/Kconfig > > > > @@ -78,4 +78,21 @@ config DEV_DAX_KMEM > > > > > > > > Say N if unsure. > > > > > > > > +config DEV_DAX_FS > > > > + tristate "FSDEV DAX: fs-dax compatible device driver" > > > > + depends on DEV_DAX > > > > + default DEV_DAX > > > > > > What's the logic for the default? Generally I'd not expect a > > > default for something new like this (so default of default == no) > > > > My thinking is that this is harmless unless you use it, but if you > > need it you need it. So defaulting to include the module seems > > viable. > > > > [ ... ] > > On further deliberation, I think I'd like to get rid of > CONFIG_DEV_DAX_FS, and just include the fsdev_dax driver if DEV_DAX > and FS_DAX are configured. Then CONFIG_FUSE_FAMFS_DAX (controlling the > famfs code in fuse) can just depend on DEV_DAX, FS_DAX and FUSE_FS. > > That's where I'm leaning for the next rev of the series... > > John > Please do that for CXL_DAX or whatever because it's really annoying to have CXL and DAX configured but not have your dax device show up because CXL_DAX wasn't configured. :P ~Gregory
On 26/01/08 11:31AM, Jonathan Cameron wrote:
> On Wed, 7 Jan 2026 09:33:11 -0600
> John Groves <John@Groves.net> wrote:
>
> > The new fsdev driver provides pages/folios initialized compatibly with
> > fsdax - normal rather than devdax-style refcounting, and starting out
> > with order-0 folios.
> >
> > When fsdev binds to a daxdev, it is usually (always?) switching from the
> > devdax mode (device.c), which pre-initializes compound folios according
> > to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the
> > folios into a fsdax-compatible state.
> >
> > A side effect of this is that raw mmap doesn't (can't?) work on an fsdev
> > dax instance. Accordingly, The fsdev driver does not provide raw mmap -
> > devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw
> > mmap capability.
> >
> > In this commit is just the framework, which remaps pages/folios compatibly
> > with fsdax.
> >
> > Enabling dax changes:
> >
> > * bus.h: add DAXDRV_FSDEV_TYPE driver type
> > * bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs
> > * dax.h: prototype inode_dax(), which fsdev needs
> >
> > Suggested-by: Dan Williams <dan.j.williams@intel.com>
> > Suggested-by: Gregory Price <gourry@gourry.net>
> > Signed-off-by: John Groves <john@groves.net>
>
> > diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
> > index d656e4c0eb84..491325d914a8 100644
> > --- a/drivers/dax/Kconfig
> > +++ b/drivers/dax/Kconfig
> > @@ -78,4 +78,21 @@ config DEV_DAX_KMEM
> >
> > Say N if unsure.
> >
> > +config DEV_DAX_FS
> > + tristate "FSDEV DAX: fs-dax compatible device driver"
> > + depends on DEV_DAX
> > + default DEV_DAX
>
> What's the logic for the default? Generally I'd not expect a
> default for something new like this (so default of default == no)
>
> > + help
> > + Support a device-dax driver mode that is compatible with fs-dax
>
> ...
>
>
>
> > struct dax_device_driver {
> > diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c
> > new file mode 100644
> > index 000000000000..2a3249d1529c
> > --- /dev/null
> > +++ b/drivers/dax/fsdev.c
> > @@ -0,0 +1,276 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright(c) 2026 Micron Technology, Inc. */
> > +#include <linux/memremap.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/cdev.h>
> > +#include <linux/slab.h>
> > +#include <linux/dax.h>
> > +#include <linux/fs.h>
> > +#include <linux/mm.h>
> > +#include "dax-private.h"
> > +#include "bus.h"
>
> ...
>
> > +static void fsdev_cdev_del(void *cdev)
> > +{
> > + cdev_del(cdev);
> > +}
> > +
> > +static void fsdev_kill(void *dev_dax)
> > +{
> > + kill_dev_dax(dev_dax);
> > +}
>
> ...
>
> > +/*
> > + * Clear any stale folio state from pages in the given range.
> > + * This is necessary because device_dax pre-initializes compound folios
> > + * based on vmemmap_shift, and that state may persist after driver unbind.
>
> What's the argument for not cleaning these out in the unbind path for device_dax?
> I can see that it might be an optimization if some other code path blindly
> overwrites all this state.
I prefer this because it doesn't rely on some other module having done the
right thing. Dax maintainers might have thoughts too though.
>
> > + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
> > + * expects to find clean order-0 folios that it can build into compound
> > + * folios on demand.
> > + *
> > + * At probe time, no filesystem should be mounted yet, so all mappings
> > + * are stale and must be cleared along with compound state.
> > + */
> > +static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
> > +{
> > + int i;
>
> It's becoming increasingly common to declare loop variables as
> for (int i = 0; i <...
>
> and given that saves us a few lines here it seems worth doing.
Done thanks
>
> > +
> > + for (i = 0; i < dev_dax->nr_range; i++) {
> > + struct range *range = &dev_dax->ranges[i].range;
> > + unsigned long pfn, end_pfn;
> > +
> > + pfn = PHYS_PFN(range->start);
> > + end_pfn = PHYS_PFN(range->end) + 1;
>
> Might as well do
> unsigned long pfn = PHY_PFN(range->start);
> unsigned long end_pfn = PHYS_PFN(range->end) + 1;
Sounds good, done
> > +
> > + while (pfn < end_pfn) {
> > + struct page *page = pfn_to_page(pfn);
> > + struct folio *folio = (struct folio *)page;
> > + struct dev_pagemap *pgmap = page_pgmap(page);
> > + int order = folio_order(folio);
> > +
> > + /*
> > + * Clear any stale mapping pointer. At probe time,
> > + * no filesystem is mounted, so any mapping is stale.
> > + */
> > + folio->mapping = NULL;
> > + folio->share = 0;
> > +
> > + if (order > 0) {
> > + int j;
> > +
> > + folio_reset_order(folio);
> > + for (j = 0; j < (1UL << order); j++) {
> > + struct page *p = page + j;
> > +
> > + ClearPageHead(p);
> > + clear_compound_head(p);
> > + ((struct folio *)p)->mapping = NULL;
>
> This code block is very similar to a chunk in dax_folio_put() in fs/dax.c
>
> Can we create a helper for both to use?
>
> I note that uses a local struct folio *new_folio to avoid multiple casts.
> I'd do similar here even if it's a long line.
>
> If not possible to use a common helper, it is probably still worth
> having a helper here for the stuff in the while loop just to reduce indent
> and improve readability a little.
Good catch! You shall have a Suggested-by in the next version, which will
inject a commit right before this that factors out dax_folio_reset_order()
from dax_folio_put(). Then fsdev_clear_folio_state() will also call that.
>
> > + ((struct folio *)p)->share = 0;
> > + ((struct folio *)p)->pgmap = pgmap;
> > + }
> > + pfn += (1UL << order);
> > + } else {
> > + folio->pgmap = pgmap;
> > + pfn++;
> > + }
> > + }
> > + }
> > +}
> > +
> > +static int fsdev_open(struct inode *inode, struct file *filp)
> > +{
> > + struct dax_device *dax_dev = inode_dax(inode);
> > + struct dev_dax *dev_dax = dax_get_private(dax_dev);
> > +
> > + dev_dbg(&dev_dax->dev, "trace\n");
>
> Hmm. This is a somewhat odd, but I see dax/device.c does
> the same thing and I guess that's because you are using
> dynamic debug with function names turned on to provide the
> 'real' information.
Actually I just have it from the gut-and-repurpose of device.c.
Dropping from fsdev.c as I'm not using it.
>
>
>
> > + filp->private_data = dev_dax;
> > +
> > + return 0;
> > +}
>
> > +static int fsdev_dax_probe(struct dev_dax *dev_dax)
> > +{
> > + struct dax_device *dax_dev = dev_dax->dax_dev;
> > + struct device *dev = &dev_dax->dev;
> > + struct dev_pagemap *pgmap;
> > + u64 data_offset = 0;
> > + struct inode *inode;
> > + struct cdev *cdev;
> > + void *addr;
> > + int rc, i;
> > +
>
> A bunch of this is cut and paste from dax/device.c
> If it carries on looking like this, can we have a helper module that
> both drivers use with the common code in it? That would make the
> difference more obvious as well.
Makes sense. I'll wait for thoughts from the dax people before
flipping bits on this though.
>
> > + if (static_dev_dax(dev_dax)) {
> > + if (dev_dax->nr_range > 1) {
> > + dev_warn(dev,
> > + "static pgmap / multi-range device conflict\n");
> > + return -EINVAL;
> > + }
> > +
> > + pgmap = dev_dax->pgmap;
> > + } else {
> > + if (dev_dax->pgmap) {
> > + dev_warn(dev,
> > + "dynamic-dax with pre-populated page map\n");
> Unless dax maintainers are very fussy about 80 chars, I'd go long on these as it's
> only just over 80 chars on one line.
>
> Given you are failing probe, not sure why dev_warn() is considered sufficient.
> To me dev_err() seems more sensible. What you have matches dax/device.c though
> so maybe there is a sound reason.
I'm personally a bit fussy about 80 column code, being kinda old and favoring
80 column emacs windows :D - mulling it over.
>
> > + return -EINVAL;
> > + }
> > +
> > + pgmap = devm_kzalloc(dev,
> > + struct_size(pgmap, ranges, dev_dax->nr_range - 1),
> > + GFP_KERNEL);
> Pick an alignment style and stick to it. Either.
> pgmap = devm_kzalloc(dev,
> struct_size(pgmap, ranges, dev_dax->nr_range - 1),
> GFP_KERNEL);
>
> or go long for readability and do
> pgmap = devm_kzalloc(dev,
> struct_size(pgmap, ranges, dev_dax->nr_range - 1),
> GFP_KERNEL);
Will do something cleaner. This is the aforementioned 80 column curmudgeonliness
at work...
>
>
>
> > + if (!pgmap)
> > + return -ENOMEM;
> > +
> > + pgmap->nr_range = dev_dax->nr_range;
> > + dev_dax->pgmap = pgmap;
> > +
> > + for (i = 0; i < dev_dax->nr_range; i++) {
> > + struct range *range = &dev_dax->ranges[i].range;
> > +
> > + pgmap->ranges[i] = *range;
> > + }
> > + }
> > +
> > + for (i = 0; i < dev_dax->nr_range; i++) {
> > + struct range *range = &dev_dax->ranges[i].range;
> > +
> > + if (!devm_request_mem_region(dev, range->start,
> > + range_len(range), dev_name(dev))) {
> > + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
> > + i, range->start, range->end);
> > + return -EBUSY;
> > + }
> > + }
> > +
> > + /*
> > + * FS-DAX compatible mode: Use MEMORY_DEVICE_FS_DAX type and
> > + * do NOT set vmemmap_shift. This leaves folios at order-0,
> > + * allowing fs-dax to dynamically create compound folios as needed
> > + * (similar to pmem behavior).
> > + */
> > + pgmap->type = MEMORY_DEVICE_FS_DAX;
> > + pgmap->ops = &fsdev_pagemap_ops;
> > + pgmap->owner = dev_dax;
> > +
> > + /*
> > + * CRITICAL DIFFERENCE from device.c:
> > + * We do NOT set vmemmap_shift here, even if align > PAGE_SIZE.
> > + * This ensures folios remain order-0 and are compatible with
> > + * fs-dax's folio management.
> > + */
> > +
> > + addr = devm_memremap_pages(dev, pgmap);
> > + if (IS_ERR(addr))
> > + return PTR_ERR(addr);
> > +
> > + /*
> > + * Clear any stale compound folio state left over from a previous
> > + * driver (e.g., device_dax with vmemmap_shift).
> > + */
> > + fsdev_clear_folio_state(dev_dax);
> > +
> > + /* Detect whether the data is at a non-zero offset into the memory */
> > + if (pgmap->range.start != dev_dax->ranges[0].range.start) {
> > + u64 phys = dev_dax->ranges[0].range.start;
> > + u64 pgmap_phys = dev_dax->pgmap[0].range.start;
> > +
> > + if (!WARN_ON(pgmap_phys > phys))
> > + data_offset = phys - pgmap_phys;
> > +
> > + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n",
> > + __func__, phys, pgmap_phys, data_offset);
> > + }
> > +
> > + inode = dax_inode(dax_dev);
> > + cdev = inode->i_cdev;
> > + cdev_init(cdev, &fsdev_fops);
> > + cdev->owner = dev->driver->owner;
> > + cdev_set_parent(cdev, &dev->kobj);
> > + rc = cdev_add(cdev, dev->devt, 1);
> > + if (rc)
> > + return rc;
> > +
> > + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
> > + if (rc)
> > + return rc;
> > +
> > + run_dax(dax_dev);
> > + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
> > +}
> > +
> > +static struct dax_device_driver fsdev_dax_driver = {
> > + .probe = fsdev_dax_probe,
> > + .type = DAXDRV_FSDEV_TYPE,
> > +};
> > +
> > +static int __init dax_init(void)
> > +{
> > + return dax_driver_register(&fsdev_dax_driver);
> > +}
> > +
> > +static void __exit dax_exit(void)
> > +{
> > + dax_driver_unregister(&fsdev_dax_driver);
> > +}
> If these don't get more complex, maybe it's time for a dax specific define
> using module_driver()
I'll defer to the dax folks here
>
> > +
> > +MODULE_AUTHOR("John Groves");
> > +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
> > +MODULE_LICENSE("GPL");
> > +module_init(dax_init);
> > +module_exit(dax_exit);
> > +MODULE_ALIAS_DAX_DEVICE(0);
>
> Curious macro. Always has same parameter... Maybe ripe for just dropping the parameter?
>
>
Thanks!
John
© 2016 - 2026 Red Hat, Inc.