Add support for persistent reservations.
Effectively all that is done here is that a multipath version of pr_ops is
created which calls into the driver version of the callbacks for the
mpath_device selected.
Structure mpath_pr_ops is introduced, which must be set by the driver for
PR callbacks.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
include/linux/multipath.h | 18 +++++
lib/multipath.c | 146 ++++++++++++++++++++++++++++++++++++++
2 files changed, 164 insertions(+)
diff --git a/include/linux/multipath.h b/include/linux/multipath.h
index 9122560f71778..454826c385923 100644
--- a/include/linux/multipath.h
+++ b/include/linux/multipath.h
@@ -5,6 +5,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cdev.h>
+#include <linux/pr.h>
#include <linux/srcu.h>
#include <linux/io_uring/cmd.h>
@@ -48,6 +49,22 @@ struct mpath_device {
int numa_node;
};
+struct mpath_pr_ops {
+ int (*pr_register)(struct mpath_device *mpath_device, u64 old_key,
+ u64 new_key, u32 flags);
+ int (*pr_reserve)(struct mpath_device *mpath_device, u64 key,
+ enum pr_type type, u32 flags);
+ int (*pr_release)(struct mpath_device *mpath_device, u64 key,
+ enum pr_type type);
+ int (*pr_preempt)(struct mpath_device *mpath_device, u64 old_key,
+ u64 new_key, enum pr_type type, bool abort);
+ int (*pr_clear)(struct mpath_device *mpath_device, u64 key);
+ int (*pr_read_keys)(struct mpath_device *mpath_device,
+ struct pr_keys *keys_info);
+ int (*pr_read_reservation)(struct mpath_device *mpath_device,
+ struct pr_held_reservation *rsv);
+};
+
struct mpath_head_template {
bool (*available_path)(struct mpath_device *, bool *);
int (*add_cdev)(struct mpath_head *);
@@ -64,6 +81,7 @@ struct mpath_head_template {
unsigned int poll_flags);
enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
struct bio *(*clone_bio)(struct bio *);
+ const struct mpath_pr_ops *pr_ops;
const struct attribute_group **device_groups;
};
diff --git a/lib/multipath.c b/lib/multipath.c
index c05b4d25ca223..8ee2d12600035 100644
--- a/lib/multipath.c
+++ b/lib/multipath.c
@@ -472,11 +472,157 @@ static void mpath_bdev_release(struct gendisk *disk)
mpath_put_disk(mpath_disk);
}
+static int mpath_pr_register(struct block_device *bdev, u64 old_key,
+ u64 new_key, unsigned int flags)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_register(mpath_device,
+ old_key, new_key, flags);
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static int mpath_pr_reserve(struct block_device *bdev, u64 key,
+ enum pr_type type, unsigned flags)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_reserve(mpath_device, key,
+ type, flags);
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static int mpath_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_release(mpath_device, key,
+ type);
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static int mpath_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+ enum pr_type type, bool abort)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_preempt(mpath_device, old,
+ new, type, abort);
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static int mpath_pr_clear(struct block_device *bdev, u64 key)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_clear(mpath_device, key);
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static int mpath_pr_read_keys(struct block_device *bdev,
+ struct pr_keys *keys_info)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_read_keys(mpath_device,
+ keys_info);
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static int mpath_pr_read_reservation(struct block_device *bdev,
+ struct pr_held_reservation *resv)
+{
+ struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct mpath_device *mpath_device;
+ int srcu_idx, ret = -EWOULDBLOCK;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (mpath_device)
+ ret = mpath_head->mpdt->pr_ops->pr_read_reservation(
+ mpath_device, resv);
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+
+ return ret;
+}
+
+static const struct pr_ops mpath_pr_ops = {
+ .pr_register = mpath_pr_register,
+ .pr_reserve = mpath_pr_reserve,
+ .pr_release = mpath_pr_release,
+ .pr_preempt = mpath_pr_preempt,
+ .pr_clear = mpath_pr_clear,
+ .pr_read_keys = mpath_pr_read_keys,
+ .pr_read_reservation = mpath_pr_read_reservation,
+};
+
const struct block_device_operations mpath_ops = {
.owner = THIS_MODULE,
.open = mpath_bdev_open,
.release = mpath_bdev_release,
.submit_bio = mpath_bdev_submit_bio,
+ .pr_ops = &mpath_pr_ops,
};
EXPORT_SYMBOL_GPL(mpath_ops);
--
2.43.5
On Wed, Feb 25, 2026 at 03:32:21PM +0000, John Garry wrote:
> +static int mpath_pr_register(struct block_device *bdev, u64 old_key,
> + u64 new_key, unsigned int flags)
> +{
> + struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
> + struct mpath_head *mpath_head = mpath_disk->mpath_head;
> + struct mpath_device *mpath_device;
> + int srcu_idx, ret = -EWOULDBLOCK;
> +
> + srcu_idx = srcu_read_lock(&mpath_head->srcu);
> + mpath_device = mpath_find_path(mpath_head);
> + if (mpath_device)
> + ret = mpath_head->mpdt->pr_ops->pr_register(mpath_device,
> + old_key, new_key, flags);
> + srcu_read_unlock(&mpath_head->srcu, srcu_idx);
Instead of having the lower layer define new mp template functions, why
not use the existing pr_ops from mpath_device->disk->fops->pr_ops?
On Wed, Feb 25, 2026 at 08:49:00AM -0700, Keith Busch wrote:
> On Wed, Feb 25, 2026 at 03:32:21PM +0000, John Garry wrote:
> > +static int mpath_pr_register(struct block_device *bdev, u64 old_key,
> > + u64 new_key, unsigned int flags)
> > +{
> > + struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
> > + struct mpath_head *mpath_head = mpath_disk->mpath_head;
> > + struct mpath_device *mpath_device;
> > + int srcu_idx, ret = -EWOULDBLOCK;
> > +
> > + srcu_idx = srcu_read_lock(&mpath_head->srcu);
> > + mpath_device = mpath_find_path(mpath_head);
> > + if (mpath_device)
> > + ret = mpath_head->mpdt->pr_ops->pr_register(mpath_device,
> > + old_key, new_key, flags);
> > + srcu_read_unlock(&mpath_head->srcu, srcu_idx);
>
> Instead of having the lower layer define new mp template functions, why
> not use the existing pr_ops from mpath_device->disk->fops->pr_ops?
I don't think that's the right answer. The regular scsi persistent
reservation functions simply won't work on a multipath device. Even just
a simple reservation fails.
For example (with /dev/sda being multipath device 0):
# echo round-robin > /sys/class/scsi_mpath_device/0/iopolicy
# blkpr -c register -k 0x1 /dev/sda
# blkpr -c reserve -k 0x1 -t exclusive-access-reg-only /dev/sda
# dd if=/dev/sda of=/dev/null iflag=direct count=100
dd: error reading '/dev/sda': Invalid exchange
1+0 records in
1+0 records out
512 bytes copied, 0.00871312 s, 58.8 kB/s
Here are the kernel messages:
[ 3494.660401] sd 7:0:1:0: reservation conflict
[ 3494.661802] sd 7:0:1:0: [sda:1] tag#768 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
[ 3494.664848] sd 7:0:1:0: [sda:1] tag#768 CDB: Read(10) 28 00 00 00 00 01 00 00 01 00
[ 3494.667092] reservation conflict error, dev sda:1, sector 1 op 0x0:(READ) flags 0x2800800 phys_seg 1 prio class 2
If you don't have a multipathed scsi device to try this on, you can run:
targetcli <<EOF
/backstores/ramdisk create mptest 1G
/loopback create naa.5001401111111111
/loopback create naa.5001402222222222
/loopback create naa.5001403333333333
/loopback create naa.5001404444444444
/loopback/naa.5001401111111111/luns create /backstores/ramdisk/mptest
/loopback/naa.5001402222222222/luns create /backstores/ramdisk/mptest
/loopback/naa.5001403333333333/luns create /backstores/ramdisk/mptest
/loopback/naa.5001404444444444/luns create /backstores/ramdisk/mptest
EOF
to create one.
Handling scsi Persistent Reservations on a multipath device is painful.
Here is a non-exhaustive list of the problems with trying to make a
multipath device act like a single scsi device for persistent
reservation purposes:
You need to register the key on all the I_T Nexuses. You can't just pick
a single path. Otherwise, when you set up the reservation, you will only
be able to do IO on one of the paths. That's what happened above.
If an path is down when you do the resevation, you might not be able to
register the key on that path. You certainly can't do it directly.
Using the All Target Ports bit (assuming the device supports it) could
let you extend a reservation from one target port to others, assuming
your path isn't down because of connection issue on the host side. But
in general, you have to be able to handle the case where you can't
register (or unregister) a key on your failed paths. If you don't do
that (un)registration when the path comes up, before it can get seleted
for handling IO, you will fail when accessing a path you should be
allowed allowed to access, or succeed in accessing a path that you are
should not be allowed to access.
The same is true when new paths are discovered. You need to register
them.
Except that a preempt can come and remove your registration at any time.
You can't register the new (or newly active) path if the key has been
preempted, and this preemption can happen at any moment, even after you
check if the other paths are still registered. If this isn't handled
correctly, paths can access storage that they should not be allowed to
access.
Changing the reservation type (for instance from
exclusive-access-reg-only to write-exclusive-reg-only) in scsi devices
is done by preempting the existing reservation. This will remove the
registered keys from every path except the one issuing the command. The
key needs to be reregistered on all the other paths again. If any IO
goes to these paths before they are reregistered, it will fail with a
reservation conflict, so IO needs to be suspended during this time.
The path that is holding the reservation might be down. In this case,
you aren't able to release the reservation from that path. The only way
I figured out to handle this in dm-mpath was for the device to preempt
it's own key, to move the reservation to a working path. This causes the
same issues as preempting key to change the reservation type, where you
need to reregister all the paths with IO suspended.
An actual preemption can come in from another machine while you are
doing this. In that case, you must not reregister the paths, and if you
already started, you must unregister them.
I can probably come up with more issues.
I think the best course of action for now is to just fail persistent
reservations as non-supported for scsi devices. IMHO Making them work
correctly (where mulitpath device IO won't fail when it should succeed,
and succeed when it should fail with a reservation conflict) dwarfs the
amount of work necessary to support ALUA.
dm-mpath previously did a pretty good job handling Persistent
Reservations. But recently it became much better, because it become very
clear that pretty good is not good enough for what people what to do
with Persistent Reservations and multipath devices.
-Ben
On 27/02/2026 18:12, Benjamin Marzinski wrote: >> Instead of having the lower layer define new mp template functions, why >> not use the existing pr_ops from mpath_device->disk->fops->pr_ops? > I don't think that's the right answer. The regular scsi persistent > reservation functions simply won't work on a multipath device. Even just > a simple reservation fails. > > For example (with /dev/sda being multipath device 0): > # echo round-robin > /sys/class/scsi_mpath_device/0/iopolicy > # blkpr -c register -k 0x1 /dev/sda > # blkpr -c reserve -k 0x1 -t exclusive-access-reg-only /dev/sda > # dd if=/dev/sda of=/dev/null iflag=direct count=100 > dd: error reading '/dev/sda': Invalid exchange > 1+0 records in > 1+0 records out > 512 bytes copied, 0.00871312 s, 58.8 kB/s > > Here are the kernel messages: > [ 3494.660401] sd 7:0:1:0: reservation conflict > [ 3494.661802] sd 7:0:1:0: [sda:1] tag#768 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s > [ 3494.664848] sd 7:0:1:0: [sda:1] tag#768 CDB: Read(10) 28 00 00 00 00 01 00 00 01 00 > [ 3494.667092] reservation conflict error, dev sda:1, sector 1 op 0x0:(READ) flags 0x2800800 phys_seg 1 prio class 2 > > If you don't have a multipathed scsi device to try this on, you can run: > > targetcli <<EOF > /backstores/ramdisk create mptest 1G > /loopback create naa.5001401111111111 > /loopback create naa.5001402222222222 > /loopback create naa.5001403333333333 > /loopback create naa.5001404444444444 > /loopback/naa.5001401111111111/luns create /backstores/ramdisk/mptest > /loopback/naa.5001402222222222/luns create /backstores/ramdisk/mptest > /loopback/naa.5001403333333333/luns create /backstores/ramdisk/mptest > /loopback/naa.5001404444444444/luns create /backstores/ramdisk/mptest > EOF > > to create one. > > Handling scsi Persistent Reservations on a multipath device is painful. > Here is a non-exhaustive list of the problems with trying to make a > multipath device act like a single scsi device for persistent > reservation purposes: > > You need to register the key on all the I_T Nexuses. You can't just pick > a single path. Otherwise, when you set up the reservation, you will only > be able to do IO on one of the paths. That's what happened above. ok, thanks for the pointer. This does not sound too difficult to implement, but obviously it will require special handling (vs NVMe) > > If an path is down when you do the resevation, you might not be able to > register the key on that path. You certainly can't do it directly. > Using the All Target Ports bit (assuming the device supports it) could > let you extend a reservation from one target port to others, assuming > your path isn't down because of connection issue on the host side. But > in general, you have to be able to handle the case where you can't > register (or unregister) a key on your failed paths. If you don't do > that (un)registration when the path comes up, before it can get seleted > for handling IO, you will fail when accessing a path you should be > allowed allowed to access, or succeed in accessing a path that you are > should not be allowed to access. Understood > > The same is true when new paths are discovered. You need to register > them. > > Except that a preempt can come and remove your registration at any time. > You can't register the new (or newly active) path if the key has been > preempted, and this preemption can happen at any moment, even after you > check if the other paths are still registered. If this isn't handled > correctly, paths can access storage that they should not be allowed to > access. right > > Changing the reservation type (for instance from > exclusive-access-reg-only to write-exclusive-reg-only) in scsi devices > is done by preempting the existing reservation. This will remove the > registered keys from every path except the one issuing the command. The > key needs to be reregistered on all the other paths again. If any IO > goes to these paths before they are reregistered, it will fail with a > reservation conflict, so IO needs to be suspended during this time. > > The path that is holding the reservation might be down. In this case, > you aren't able to release the reservation from that path. The only way > I figured out to handle this in dm-mpath was for the device to preempt > it's own key, to move the reservation to a working path. This causes the > same issues as preempting key to change the reservation type, where you > need to reregister all the paths with IO suspended. > > An actual preemption can come in from another machine while you are > doing this. In that case, you must not reregister the paths, and if you > already started, you must unregister them. > > I can probably come up with more issues. This all is becoming complicated... :) > > I think the best course of action for now is to just fail persistent > reservations as non-supported for scsi devices. IMHO Making them work > correctly (where mulitpath device IO won't fail when it should succeed, > and succeed when it should fail with a reservation conflict) dwarfs the > amount of work necessary to support ALUA. Yeah, that sounds reasonable, but I want to ensure libmultipath API does not later change here such that it disrupts NVMe support (if indeed NVMe goes on to use libmultipath). > > dm-mpath previously did a pretty good job handling Persistent > Reservations. But recently it became much better, because it become very > clear that pretty good is not good enough for what people what to do > with Persistent Reservations and multipath devices. Thanks for the feedback. I'll check these details further now.
On 25/02/2026 15:49, Keith Busch wrote:
> On Wed, Feb 25, 2026 at 03:32:21PM +0000, John Garry wrote:
>> +static int mpath_pr_register(struct block_device *bdev, u64 old_key,
>> + u64 new_key, unsigned int flags)
>> +{
>> + struct mpath_disk *mpath_disk = dev_get_drvdata(&bdev->bd_device);
>> + struct mpath_head *mpath_head = mpath_disk->mpath_head;
>> + struct mpath_device *mpath_device;
>> + int srcu_idx, ret = -EWOULDBLOCK;
>> +
>> + srcu_idx = srcu_read_lock(&mpath_head->srcu);
>> + mpath_device = mpath_find_path(mpath_head);
>> + if (mpath_device)
>> + ret = mpath_head->mpdt->pr_ops->pr_register(mpath_device,
>> + old_key, new_key, flags);
>> + srcu_read_unlock(&mpath_head->srcu, srcu_idx);
> Instead of having the lower layer define new mp template functions, why
> not use the existing pr_ops from mpath_device->disk->fops->pr_ops?
Yeah, that should be possible and I did use disk->fops elsewhere. We
would just need to find the per-path bdev. I just wasn't sure if that
was a preferred style.
cheers
© 2016 - 2026 Red Hat, Inc.