[PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc

Caleb Sander Mateos posted 19 patches 1 month ago
There is a newer version of this series
[PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc
Posted by Caleb Sander Mateos 1 month ago
Indicate to the ublk server when an incoming request has integrity data
by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.
If the ublk device doesn't support integrity, the request will never
provide integrity data. If the ublk device supports integrity, the
request may omit the integrity buffer only if metadata_size matches the
PI tuple size determined by csum_type. In this case, the ublk server
should internally generate/verify the protection information from the
data and sector offset.
Set the UBLK_IO_F_CHECK_{GUARD,REFTAG,APPTAG} flags based on the
request's BIP_CHECK_{GUARD,REFTAG,APPTAG} flags, indicating whether to
verify the guard, reference, and app tags in the protection information.
The expected reference tag (32 or 48 bits) and app tag (16 bits) are
indicated in ublksrv_io_desc's new struct ublksrv_io_integrity integrity
field. This field is unioned with the addr field to avoid changing the
size of struct ublksrv_io_desc. UBLK_F_INTEGRITY requires
UBLK_F_USER_COPY and the addr field isn't used for UBLK_F_USER_COPY, so
the two fields aren't needed simultaneously.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
---
 drivers/block/ublk_drv.c      | 43 +++++++++++++++++++++++++++++++----
 include/uapi/linux/ublk_cmd.h | 27 ++++++++++++++++++++--
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 2f9316febf83..51469e0627ff 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -316,10 +316,36 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
 static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
 {
 	return ubq->flags & UBLK_F_ZONED;
 }
 
+static void ublk_setup_iod_buf(const struct ublk_queue *ubq,
+			       const struct request *req,
+			       struct ublksrv_io_desc *iod)
+{
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	if (ubq->flags & UBLK_F_INTEGRITY) {
+		struct bio_integrity_payload *bip;
+		sector_t ref_tag_seed;
+
+		if (!blk_integrity_rq(req))
+			return;
+
+		bip = bio_integrity(req->bio);
+		ref_tag_seed = bip_get_seed(bip);
+		iod->integrity.ref_tag_lo = ref_tag_seed;
+		iod->integrity.ref_tag_hi = ref_tag_seed >> 32;
+		iod->integrity.app_tag = bip->app_tag;
+	} else
+#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */
+	{
+		const struct ublk_io *io = &ubq->ios[req->tag];
+
+		iod->addr = io->buf.addr;
+	}
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 
 struct ublk_zoned_report_desc {
 	__u64 sector;
 	__u32 operation;
@@ -498,11 +524,10 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
 
 static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 					 struct request *req)
 {
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
-	struct ublk_io *io = &ubq->ios[req->tag];
 	struct ublk_zoned_report_desc *desc;
 	u32 ublk_op;
 
 	switch (req_op(req)) {
 	case REQ_OP_ZONE_OPEN:
@@ -545,11 +570,11 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
 	}
 
 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
 	iod->nr_sectors = blk_rq_sectors(req);
 	iod->start_sector = blk_rq_pos(req);
-	iod->addr = io->buf.addr;
+	ublk_setup_iod_buf(ubq, req, iod);
 
 	return BLK_STS_OK;
 }
 
 #else
@@ -1120,17 +1145,27 @@ static inline unsigned int ublk_req_build_flags(struct request *req)
 		flags |= UBLK_IO_F_NOUNMAP;
 
 	if (req->cmd_flags & REQ_SWAP)
 		flags |= UBLK_IO_F_SWAP;
 
+	if (blk_integrity_rq(req)) {
+		flags |= UBLK_IO_F_INTEGRITY;
+
+		if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
+			flags |= UBLK_IO_F_CHECK_GUARD;
+		if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG))
+			flags |= UBLK_IO_F_CHECK_REFTAG;
+		if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG))
+			flags |= UBLK_IO_F_CHECK_APPTAG;
+	}
+
 	return flags;
 }
 
 static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 {
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
-	struct ublk_io *io = &ubq->ios[req->tag];
 	u32 ublk_op;
 
 	switch (req_op(req)) {
 	case REQ_OP_READ:
 		ublk_op = UBLK_IO_OP_READ;
@@ -1155,11 +1190,11 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 
 	/* need to translate since kernel may change */
 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
 	iod->nr_sectors = blk_rq_sectors(req);
 	iod->start_sector = blk_rq_pos(req);
-	iod->addr = io->buf.addr;
+	ublk_setup_iod_buf(ubq, req, iod);
 
 	return BLK_STS_OK;
 }
 
 static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index a54c47832fa2..a22de3fc5447 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -412,10 +412,25 @@ struct ublksrv_ctrl_dev_info {
  *
  * ublk server has to check this flag if UBLK_AUTO_BUF_REG_FALLBACK is
  * passed in.
  */
 #define		UBLK_IO_F_NEED_REG_BUF		(1U << 17)
+/* Request has an integrity data buffer */
+#define		UBLK_IO_F_INTEGRITY		(1UL << 18)
+/* Guard tag verification requested */
+#define		UBLK_IO_F_CHECK_GUARD		(1UL << 19)
+/* Reference tag verification requested */
+#define		UBLK_IO_F_CHECK_REFTAG		(1UL << 20)
+/* Application tag verification requested */
+#define		UBLK_IO_F_CHECK_APPTAG		(1UL << 21)
+
+struct ublksrv_io_integrity
+{
+	__u32 ref_tag_lo; /* low 32 bits of reference tag seed */
+	__u16 ref_tag_hi; /* high 16 bits of reference tag seed */
+	__u16 app_tag;
+};
 
 /*
  * io cmd is described by this structure, and stored in share memory, indexed
  * by request tag.
  *
@@ -432,12 +447,20 @@ struct ublksrv_io_desc {
 	};
 
 	/* start sector for this io */
 	__u64		start_sector;
 
-	/* buffer address in ublksrv daemon vm space, from ublk driver */
-	__u64		addr;
+	union {
+		/*
+		 * buffer address in ublksrv daemon vm space, from ublk driver.
+		 * Unused for UBLK_F_SUPPORT_ZERO_COPY, UBLK_F_USER_COPY, or
+		 * UBLK_F_AUTO_BUF_REG.
+		 */
+		__u64		addr;
+		/* Integrity options, for UBLK_F_INTEGRITY */
+		struct ublksrv_io_integrity integrity;
+	};
 };
 
 static inline __u8 ublksrv_get_op(const struct ublksrv_io_desc *iod)
 {
 	return iod->op_flags & 0xff;
-- 
2.45.2
Re: [PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc
Posted by Ming Lei 1 month ago
On Fri, Jan 02, 2026 at 05:45:14PM -0700, Caleb Sander Mateos wrote:
> Indicate to the ublk server when an incoming request has integrity data
> by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.
> If the ublk device doesn't support integrity, the request will never
> provide integrity data. If the ublk device supports integrity, the
> request may omit the integrity buffer only if metadata_size matches the
> PI tuple size determined by csum_type. In this case, the ublk server
> should internally generate/verify the protection information from the
> data and sector offset.
> Set the UBLK_IO_F_CHECK_{GUARD,REFTAG,APPTAG} flags based on the
> request's BIP_CHECK_{GUARD,REFTAG,APPTAG} flags, indicating whether to
> verify the guard, reference, and app tags in the protection information.
> The expected reference tag (32 or 48 bits) and app tag (16 bits) are
> indicated in ublksrv_io_desc's new struct ublksrv_io_integrity integrity
> field. This field is unioned with the addr field to avoid changing the

It might be fine to set per-rq app_tag, but bios in one request might have
different app_tag in case of io merge actually.

Also block layer builds ref_tag for each internal, please see
t10_pi_generate() and ext_pi_crc64_generate().

So looks this way is wrong.

More importantly reusing iod->addr for other purpose not related with IO
buffer is very unfriendly for adding new features, and one lesson is for ZONED support
by reusing ublksrv_io_cmd->addr for zoned's append lba.

For example, there is chance to support dma-buf based zero copy for ublk, and
please see the io-uring dma-buf support[1], and iod->addr might carry IO buffer info
in dma-buf format in future.

[1] https://lore.kernel.org/io-uring/cover.1763725387.git.asml.silence@gmail.com/#t


> size of struct ublksrv_io_desc. UBLK_F_INTEGRITY requires
> UBLK_F_USER_COPY and the addr field isn't used for UBLK_F_USER_COPY, so
> the two fields aren't needed simultaneously.
> 
> Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> ---
>  drivers/block/ublk_drv.c      | 43 +++++++++++++++++++++++++++++++----
>  include/uapi/linux/ublk_cmd.h | 27 ++++++++++++++++++++--
>  2 files changed, 64 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index 2f9316febf83..51469e0627ff 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -316,10 +316,36 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
>  static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
>  {
>  	return ubq->flags & UBLK_F_ZONED;
>  }
>  
> +static void ublk_setup_iod_buf(const struct ublk_queue *ubq,
> +			       const struct request *req,
> +			       struct ublksrv_io_desc *iod)
> +{
> +#ifdef CONFIG_BLK_DEV_INTEGRITY
> +	if (ubq->flags & UBLK_F_INTEGRITY) {
> +		struct bio_integrity_payload *bip;
> +		sector_t ref_tag_seed;
> +
> +		if (!blk_integrity_rq(req))
> +			return;
> +
> +		bip = bio_integrity(req->bio);
> +		ref_tag_seed = bip_get_seed(bip);

As mentioned, t10_pi_generate() and ext_pi_crc64_generate() builds
per-internal ref tag.


> +		iod->integrity.ref_tag_lo = ref_tag_seed;
> +		iod->integrity.ref_tag_hi = ref_tag_seed >> 32;
> +		iod->integrity.app_tag = bip->app_tag;

In case of io merge, each bio may have different ->app_tag.

Given you have to copy meta data via user copy, I suggest to follow the PI
standard and make it per-internal.

Thanks, 
Ming
Re: [PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc
Posted by Caleb Sander Mateos 1 month ago
On Sat, Jan 3, 2026 at 6:14 AM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Fri, Jan 02, 2026 at 05:45:14PM -0700, Caleb Sander Mateos wrote:
> > Indicate to the ublk server when an incoming request has integrity data
> > by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.
> > If the ublk device doesn't support integrity, the request will never
> > provide integrity data. If the ublk device supports integrity, the
> > request may omit the integrity buffer only if metadata_size matches the
> > PI tuple size determined by csum_type. In this case, the ublk server
> > should internally generate/verify the protection information from the
> > data and sector offset.
> > Set the UBLK_IO_F_CHECK_{GUARD,REFTAG,APPTAG} flags based on the
> > request's BIP_CHECK_{GUARD,REFTAG,APPTAG} flags, indicating whether to
> > verify the guard, reference, and app tags in the protection information.
> > The expected reference tag (32 or 48 bits) and app tag (16 bits) are
> > indicated in ublksrv_io_desc's new struct ublksrv_io_integrity integrity
> > field. This field is unioned with the addr field to avoid changing the
>
> It might be fine to set per-rq app_tag, but bios in one request might have
> different app_tag in case of io merge actually.

I based this logic largely on the code under if (ns->head->ms) in
nvme_setup_rw(). That also assumes a single app_tag for the request.
Sounds like an existing bug if bios with different app_tags can be
merged together?

>
> Also block layer builds ref_tag for each internal, please see

What do you mean by "internal"? "interval"?

> t10_pi_generate() and ext_pi_crc64_generate().

Yes, the reftag increases by 1 for each integrity interval. That's why
it suffices for an NVMe command reading multiple blocks to specify
only the expected reftag for the first block; the reftags for
subsequent blocks are incremented accordingly.

Actually, I think we probably don't need to communicate the reftag
seed to the ublk server. NVMe doesn't use the reftag seed (which can
be overridden by struct uio_meta's seed field). Instead,
nvme_set_ref_tag() always uses the offset into the block device
divided by the integrity interval size, as required by all the
existing csum_type formats the kernel supports. So a ublk server could
just use the start_sector field of struct ublksrv_io_desc to compute
the expected reftags. And using start_sector as the reftag also means
merging requests would preserve their expected reftags.

>
> So looks this way is wrong.
>
> More importantly reusing iod->addr for other purpose not related with IO
> buffer is very unfriendly for adding new features, and one lesson is for ZONED support
> by reusing ublksrv_io_cmd->addr for zoned's append lba.

That's a fair point.

>
> For example, there is chance to support dma-buf based zero copy for ublk, and
> please see the io-uring dma-buf support[1], and iod->addr might carry IO buffer info
> in dma-buf format in future.
>
> [1] https://lore.kernel.org/io-uring/cover.1763725387.git.asml.silence@gmail.com/#t
>
>
> > size of struct ublksrv_io_desc. UBLK_F_INTEGRITY requires
> > UBLK_F_USER_COPY and the addr field isn't used for UBLK_F_USER_COPY, so
> > the two fields aren't needed simultaneously.
> >
> > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > ---
> >  drivers/block/ublk_drv.c      | 43 +++++++++++++++++++++++++++++++----
> >  include/uapi/linux/ublk_cmd.h | 27 ++++++++++++++++++++--
> >  2 files changed, 64 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > index 2f9316febf83..51469e0627ff 100644
> > --- a/drivers/block/ublk_drv.c
> > +++ b/drivers/block/ublk_drv.c
> > @@ -316,10 +316,36 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
> >  static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
> >  {
> >       return ubq->flags & UBLK_F_ZONED;
> >  }
> >
> > +static void ublk_setup_iod_buf(const struct ublk_queue *ubq,
> > +                            const struct request *req,
> > +                            struct ublksrv_io_desc *iod)
> > +{
> > +#ifdef CONFIG_BLK_DEV_INTEGRITY
> > +     if (ubq->flags & UBLK_F_INTEGRITY) {
> > +             struct bio_integrity_payload *bip;
> > +             sector_t ref_tag_seed;
> > +
> > +             if (!blk_integrity_rq(req))
> > +                     return;
> > +
> > +             bip = bio_integrity(req->bio);
> > +             ref_tag_seed = bip_get_seed(bip);
>
> As mentioned, t10_pi_generate() and ext_pi_crc64_generate() builds
> per-internal ref tag.

As mentioned, the reftags for subsequent intervals can be computed by
simply incrementing the seed. If the seed is assumed to always be
start_sector >> (interval_exp - SECTOR_SHIFT), then it may not be
necessary to communicate ref_tag_seed at all.

>
>
> > +             iod->integrity.ref_tag_lo = ref_tag_seed;
> > +             iod->integrity.ref_tag_hi = ref_tag_seed >> 32;
> > +             iod->integrity.app_tag = bip->app_tag;
>
> In case of io merge, each bio may have different ->app_tag.

It seems like it would make more sense to prevent merging bios with
different app_tags. In the common case where a request contains a
single bio, which has a single app_tag, it would be much more
efficient to communicate only the 1 app_tag instead of having to pass
a separate app_tag for every logical block/integrity interval.

>
> Given you have to copy meta data via user copy, I suggest to follow the PI
> standard and make it per-internal.

How are you suggesting the ublk server access bip->app_tag and
bip_get_seed(bip) (if overriding the reftag seed is supported)? Would
the ublk server need to make another user copy syscall?

Or would you prefer I drop the BIP_CHECK_* flag support from this
patch set for now?

Thanks,
Caleb
Re: [PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc
Posted by Ming Lei 1 month ago
On Mon, Jan 05, 2026 at 08:44:48AM -0800, Caleb Sander Mateos wrote:
> On Sat, Jan 3, 2026 at 6:14 AM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Fri, Jan 02, 2026 at 05:45:14PM -0700, Caleb Sander Mateos wrote:
> > > Indicate to the ublk server when an incoming request has integrity data
> > > by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.
> > > If the ublk device doesn't support integrity, the request will never
> > > provide integrity data. If the ublk device supports integrity, the
> > > request may omit the integrity buffer only if metadata_size matches the
> > > PI tuple size determined by csum_type. In this case, the ublk server
> > > should internally generate/verify the protection information from the
> > > data and sector offset.
> > > Set the UBLK_IO_F_CHECK_{GUARD,REFTAG,APPTAG} flags based on the
> > > request's BIP_CHECK_{GUARD,REFTAG,APPTAG} flags, indicating whether to
> > > verify the guard, reference, and app tags in the protection information.
> > > The expected reference tag (32 or 48 bits) and app tag (16 bits) are
> > > indicated in ublksrv_io_desc's new struct ublksrv_io_integrity integrity
> > > field. This field is unioned with the addr field to avoid changing the
> >
> > It might be fine to set per-rq app_tag, but bios in one request might have
> > different app_tag in case of io merge actually.
> 
> I based this logic largely on the code under if (ns->head->ms) in
> nvme_setup_rw(). That also assumes a single app_tag for the request.
> Sounds like an existing bug if bios with different app_tags can be
> merged together?

Looks it is true.

> 
> >
> > Also block layer builds ref_tag for each internal, please see
> 
> What do you mean by "internal"? "interval"?
> 
> > t10_pi_generate() and ext_pi_crc64_generate().
> 
> Yes, the reftag increases by 1 for each integrity interval. That's why
> it suffices for an NVMe command reading multiple blocks to specify
> only the expected reftag for the first block; the reftags for
> subsequent blocks are incremented accordingly.
> 
> Actually, I think we probably don't need to communicate the reftag
> seed to the ublk server. NVMe doesn't use the reftag seed (which can
> be overridden by struct uio_meta's seed field). Instead,
> nvme_set_ref_tag() always uses the offset into the block device
> divided by the integrity interval size, as required by all the
> existing csum_type formats the kernel supports. So a ublk server could
> just use the start_sector field of struct ublksrv_io_desc to compute
> the expected reftags. And using start_sector as the reftag also means
> merging requests would preserve their expected reftags.

IMO, this way looks fine from user viewpoint, especially aligning with NVMe. 

> 
> >
> > So looks this way is wrong.
> >
> > More importantly reusing iod->addr for other purpose not related with IO
> > buffer is very unfriendly for adding new features, and one lesson is for ZONED support
> > by reusing ublksrv_io_cmd->addr for zoned's append lba.
> 
> That's a fair point.

One candidate is add per-IO mmaped meta area, which can be flexible to
cover more use cases. 

> 
> >
> > For example, there is chance to support dma-buf based zero copy for ublk, and
> > please see the io-uring dma-buf support[1], and iod->addr might carry IO buffer info
> > in dma-buf format in future.
> >
> > [1] https://lore.kernel.org/io-uring/cover.1763725387.git.asml.silence@gmail.com/#t

BTW, PI data size is often small, and it belongs to kernel, there could be
chance to define PI data as pre-mapped DMA-BUF, then almost all drivers can
benefit from avoiding the runtime dma mapping for meta. But that may be one
bigger thing.

> >
> >
> > > size of struct ublksrv_io_desc. UBLK_F_INTEGRITY requires
> > > UBLK_F_USER_COPY and the addr field isn't used for UBLK_F_USER_COPY, so
> > > the two fields aren't needed simultaneously.
> > >
> > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > ---
> > >  drivers/block/ublk_drv.c      | 43 +++++++++++++++++++++++++++++++----
> > >  include/uapi/linux/ublk_cmd.h | 27 ++++++++++++++++++++--
> > >  2 files changed, 64 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > index 2f9316febf83..51469e0627ff 100644
> > > --- a/drivers/block/ublk_drv.c
> > > +++ b/drivers/block/ublk_drv.c
> > > @@ -316,10 +316,36 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
> > >  static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
> > >  {
> > >       return ubq->flags & UBLK_F_ZONED;
> > >  }
> > >
> > > +static void ublk_setup_iod_buf(const struct ublk_queue *ubq,
> > > +                            const struct request *req,
> > > +                            struct ublksrv_io_desc *iod)
> > > +{
> > > +#ifdef CONFIG_BLK_DEV_INTEGRITY
> > > +     if (ubq->flags & UBLK_F_INTEGRITY) {
> > > +             struct bio_integrity_payload *bip;
> > > +             sector_t ref_tag_seed;
> > > +
> > > +             if (!blk_integrity_rq(req))
> > > +                     return;
> > > +
> > > +             bip = bio_integrity(req->bio);
> > > +             ref_tag_seed = bip_get_seed(bip);
> >
> > As mentioned, t10_pi_generate() and ext_pi_crc64_generate() builds
> > per-internal ref tag.
> 
> As mentioned, the reftags for subsequent intervals can be computed by
> simply incrementing the seed. If the seed is assumed to always be
> start_sector >> (interval_exp - SECTOR_SHIFT), then it may not be
> necessary to communicate ref_tag_seed at all.

Fair enough, but this should be documented in UAPI interface.

> 
> >
> >
> > > +             iod->integrity.ref_tag_lo = ref_tag_seed;
> > > +             iod->integrity.ref_tag_hi = ref_tag_seed >> 32;
> > > +             iod->integrity.app_tag = bip->app_tag;
> >
> > In case of io merge, each bio may have different ->app_tag.
> 
> It seems like it would make more sense to prevent merging bios with
> different app_tags. In the common case where a request contains a
> single bio, which has a single app_tag, it would be much more
> efficient to communicate only the 1 app_tag instead of having to pass
> a separate app_tag for every logical block/integrity interval.

OK.

> 
> >
> > Given you have to copy meta data via user copy, I suggest to follow the PI
> > standard and make it per-internal.
> 
> How are you suggesting the ublk server access bip->app_tag and
> bip_get_seed(bip) (if overriding the reftag seed is supported)? Would
> the ublk server need to make another user copy syscall?
> 
> Or would you prefer I drop the BIP_CHECK_* flag support from this
> patch set for now?

I can understand the motivation, and extra syscall should be avoided for
communicating reftag & apptag only, given you have explained both can be
per-request instead of per-interval.

But iod->addr should be avoided for this purpose, otherwise, new feature
can conflict with this usage easily.

But per-io mmapped area can solve this issue, the meta size can be one parameter
of `ublksrv_ctrl_dev_info` with feature flag of UBLK_F_MMAPED_IO_META, what
do you think of this way?


Thanks,
Ming

Re: [PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc
Posted by Caleb Sander Mateos 1 month ago
On Mon, Jan 5, 2026 at 5:55 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Mon, Jan 05, 2026 at 08:44:48AM -0800, Caleb Sander Mateos wrote:
> > On Sat, Jan 3, 2026 at 6:14 AM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Fri, Jan 02, 2026 at 05:45:14PM -0700, Caleb Sander Mateos wrote:
> > > > Indicate to the ublk server when an incoming request has integrity data
> > > > by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.
> > > > If the ublk device doesn't support integrity, the request will never
> > > > provide integrity data. If the ublk device supports integrity, the
> > > > request may omit the integrity buffer only if metadata_size matches the
> > > > PI tuple size determined by csum_type. In this case, the ublk server
> > > > should internally generate/verify the protection information from the
> > > > data and sector offset.
> > > > Set the UBLK_IO_F_CHECK_{GUARD,REFTAG,APPTAG} flags based on the
> > > > request's BIP_CHECK_{GUARD,REFTAG,APPTAG} flags, indicating whether to
> > > > verify the guard, reference, and app tags in the protection information.
> > > > The expected reference tag (32 or 48 bits) and app tag (16 bits) are
> > > > indicated in ublksrv_io_desc's new struct ublksrv_io_integrity integrity
> > > > field. This field is unioned with the addr field to avoid changing the
> > >
> > > It might be fine to set per-rq app_tag, but bios in one request might have
> > > different app_tag in case of io merge actually.
> >
> > I based this logic largely on the code under if (ns->head->ms) in
> > nvme_setup_rw(). That also assumes a single app_tag for the request.
> > Sounds like an existing bug if bios with different app_tags can be
> > merged together?
>
> Looks it is true.
>
> >
> > >
> > > Also block layer builds ref_tag for each internal, please see
> >
> > What do you mean by "internal"? "interval"?
> >
> > > t10_pi_generate() and ext_pi_crc64_generate().
> >
> > Yes, the reftag increases by 1 for each integrity interval. That's why
> > it suffices for an NVMe command reading multiple blocks to specify
> > only the expected reftag for the first block; the reftags for
> > subsequent blocks are incremented accordingly.
> >
> > Actually, I think we probably don't need to communicate the reftag
> > seed to the ublk server. NVMe doesn't use the reftag seed (which can
> > be overridden by struct uio_meta's seed field). Instead,
> > nvme_set_ref_tag() always uses the offset into the block device
> > divided by the integrity interval size, as required by all the
> > existing csum_type formats the kernel supports. So a ublk server could
> > just use the start_sector field of struct ublksrv_io_desc to compute
> > the expected reftags. And using start_sector as the reftag also means
> > merging requests would preserve their expected reftags.
>
> IMO, this way looks fine from user viewpoint, especially aligning with NVMe.
>
> >
> > >
> > > So looks this way is wrong.
> > >
> > > More importantly reusing iod->addr for other purpose not related with IO
> > > buffer is very unfriendly for adding new features, and one lesson is for ZONED support
> > > by reusing ublksrv_io_cmd->addr for zoned's append lba.
> >
> > That's a fair point.
>
> One candidate is add per-IO mmaped meta area, which can be flexible to
> cover more use cases.
>
> >
> > >
> > > For example, there is chance to support dma-buf based zero copy for ublk, and
> > > please see the io-uring dma-buf support[1], and iod->addr might carry IO buffer info
> > > in dma-buf format in future.
> > >
> > > [1] https://lore.kernel.org/io-uring/cover.1763725387.git.asml.silence@gmail.com/#t
>
> BTW, PI data size is often small, and it belongs to kernel, there could be

The integrity data buffer isn't necessarily kernel memory. With
IORING_RW_ATTR_FLAG_PI, userspace can use a buffer from its address
space for the integrity data.

> chance to define PI data as pre-mapped DMA-BUF, then almost all drivers can
> benefit from avoiding the runtime dma mapping for meta. But that may be one
> bigger thing.

Yes, I think it would be a significant amount of work to use
pre-mapped buffers for integrity data. I'd like to avoid that in this
patch set and focus on just the ublk integrity data support.

>
> > >
> > >
> > > > size of struct ublksrv_io_desc. UBLK_F_INTEGRITY requires
> > > > UBLK_F_USER_COPY and the addr field isn't used for UBLK_F_USER_COPY, so
> > > > the two fields aren't needed simultaneously.
> > > >
> > > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > > ---
> > > >  drivers/block/ublk_drv.c      | 43 +++++++++++++++++++++++++++++++----
> > > >  include/uapi/linux/ublk_cmd.h | 27 ++++++++++++++++++++--
> > > >  2 files changed, 64 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > index 2f9316febf83..51469e0627ff 100644
> > > > --- a/drivers/block/ublk_drv.c
> > > > +++ b/drivers/block/ublk_drv.c
> > > > @@ -316,10 +316,36 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
> > > >  static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
> > > >  {
> > > >       return ubq->flags & UBLK_F_ZONED;
> > > >  }
> > > >
> > > > +static void ublk_setup_iod_buf(const struct ublk_queue *ubq,
> > > > +                            const struct request *req,
> > > > +                            struct ublksrv_io_desc *iod)
> > > > +{
> > > > +#ifdef CONFIG_BLK_DEV_INTEGRITY
> > > > +     if (ubq->flags & UBLK_F_INTEGRITY) {
> > > > +             struct bio_integrity_payload *bip;
> > > > +             sector_t ref_tag_seed;
> > > > +
> > > > +             if (!blk_integrity_rq(req))
> > > > +                     return;
> > > > +
> > > > +             bip = bio_integrity(req->bio);
> > > > +             ref_tag_seed = bip_get_seed(bip);
> > >
> > > As mentioned, t10_pi_generate() and ext_pi_crc64_generate() builds
> > > per-internal ref tag.
> >
> > As mentioned, the reftags for subsequent intervals can be computed by
> > simply incrementing the seed. If the seed is assumed to always be
> > start_sector >> (interval_exp - SECTOR_SHIFT), then it may not be
> > necessary to communicate ref_tag_seed at all.
>
> Fair enough, but this should be documented in UAPI interface.
>
> >
> > >
> > >
> > > > +             iod->integrity.ref_tag_lo = ref_tag_seed;
> > > > +             iod->integrity.ref_tag_hi = ref_tag_seed >> 32;
> > > > +             iod->integrity.app_tag = bip->app_tag;
> > >
> > > In case of io merge, each bio may have different ->app_tag.
> >
> > It seems like it would make more sense to prevent merging bios with
> > different app_tags. In the common case where a request contains a
> > single bio, which has a single app_tag, it would be much more
> > efficient to communicate only the 1 app_tag instead of having to pass
> > a separate app_tag for every logical block/integrity interval.
>
> OK.
>
> >
> > >
> > > Given you have to copy meta data via user copy, I suggest to follow the PI
> > > standard and make it per-internal.
> >
> > How are you suggesting the ublk server access bip->app_tag and
> > bip_get_seed(bip) (if overriding the reftag seed is supported)? Would
> > the ublk server need to make another user copy syscall?
> >
> > Or would you prefer I drop the BIP_CHECK_* flag support from this
> > patch set for now?
>
> I can understand the motivation, and extra syscall should be avoided for
> communicating reftag & apptag only, given you have explained both can be
> per-request instead of per-interval.
>
> But iod->addr should be avoided for this purpose, otherwise, new feature
> can conflict with this usage easily.
>
> But per-io mmapped area can solve this issue, the meta size can be one parameter
> of `ublksrv_ctrl_dev_info` with feature flag of UBLK_F_MMAPED_IO_META, what
> do you think of this way?

Are you thinking this userspace-mapped region would consist of one u16
app_tag per ublk I/O descriptor? It seems a bit complex to introduce
an additional userspace-mapped region just to communicate the app_tag.
What do you think about making the size of the I/O descriptor
information dependent on the feature flags? Basically extend
ublksrv_io_desc with an app_tag field if UBLK_F_INTEGRITY is set. This
mechanism could be extended to communicate other per-request
information in the future.

Thanks,
Caleb
Re: [PATCH v2 04/19] ublk: set request integrity params in ublksrv_io_desc
Posted by Ming Lei 1 month ago
On Wed, Jan 07, 2026 at 09:45:08AM -0800, Caleb Sander Mateos wrote:
> On Mon, Jan 5, 2026 at 5:55 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Mon, Jan 05, 2026 at 08:44:48AM -0800, Caleb Sander Mateos wrote:
> > > On Sat, Jan 3, 2026 at 6:14 AM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Fri, Jan 02, 2026 at 05:45:14PM -0700, Caleb Sander Mateos wrote:
> > > > > Indicate to the ublk server when an incoming request has integrity data
> > > > > by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field.
> > > > > If the ublk device doesn't support integrity, the request will never
> > > > > provide integrity data. If the ublk device supports integrity, the
> > > > > request may omit the integrity buffer only if metadata_size matches the
> > > > > PI tuple size determined by csum_type. In this case, the ublk server
> > > > > should internally generate/verify the protection information from the
> > > > > data and sector offset.
> > > > > Set the UBLK_IO_F_CHECK_{GUARD,REFTAG,APPTAG} flags based on the
> > > > > request's BIP_CHECK_{GUARD,REFTAG,APPTAG} flags, indicating whether to
> > > > > verify the guard, reference, and app tags in the protection information.
> > > > > The expected reference tag (32 or 48 bits) and app tag (16 bits) are
> > > > > indicated in ublksrv_io_desc's new struct ublksrv_io_integrity integrity
> > > > > field. This field is unioned with the addr field to avoid changing the
> > > >
> > > > It might be fine to set per-rq app_tag, but bios in one request might have
> > > > different app_tag in case of io merge actually.
> > >
> > > I based this logic largely on the code under if (ns->head->ms) in
> > > nvme_setup_rw(). That also assumes a single app_tag for the request.
> > > Sounds like an existing bug if bios with different app_tags can be
> > > merged together?
> >
> > Looks it is true.
> >
> > >
> > > >
> > > > Also block layer builds ref_tag for each internal, please see
> > >
> > > What do you mean by "internal"? "interval"?
> > >
> > > > t10_pi_generate() and ext_pi_crc64_generate().
> > >
> > > Yes, the reftag increases by 1 for each integrity interval. That's why
> > > it suffices for an NVMe command reading multiple blocks to specify
> > > only the expected reftag for the first block; the reftags for
> > > subsequent blocks are incremented accordingly.
> > >
> > > Actually, I think we probably don't need to communicate the reftag
> > > seed to the ublk server. NVMe doesn't use the reftag seed (which can
> > > be overridden by struct uio_meta's seed field). Instead,
> > > nvme_set_ref_tag() always uses the offset into the block device
> > > divided by the integrity interval size, as required by all the
> > > existing csum_type formats the kernel supports. So a ublk server could
> > > just use the start_sector field of struct ublksrv_io_desc to compute
> > > the expected reftags. And using start_sector as the reftag also means
> > > merging requests would preserve their expected reftags.
> >
> > IMO, this way looks fine from user viewpoint, especially aligning with NVMe.
> >
> > >
> > > >
> > > > So looks this way is wrong.
> > > >
> > > > More importantly reusing iod->addr for other purpose not related with IO
> > > > buffer is very unfriendly for adding new features, and one lesson is for ZONED support
> > > > by reusing ublksrv_io_cmd->addr for zoned's append lba.
> > >
> > > That's a fair point.
> >
> > One candidate is add per-IO mmaped meta area, which can be flexible to
> > cover more use cases.
> >
> > >
> > > >
> > > > For example, there is chance to support dma-buf based zero copy for ublk, and
> > > > please see the io-uring dma-buf support[1], and iod->addr might carry IO buffer info
> > > > in dma-buf format in future.
> > > >
> > > > [1] https://lore.kernel.org/io-uring/cover.1763725387.git.asml.silence@gmail.com/#t
> >
> > BTW, PI data size is often small, and it belongs to kernel, there could be
> 
> The integrity data buffer isn't necessarily kernel memory. With
> IORING_RW_ATTR_FLAG_PI, userspace can use a buffer from its address
> space for the integrity data.

You are right, that is something I missed, but it is from io_uring
interface.

> 
> > chance to define PI data as pre-mapped DMA-BUF, then almost all drivers can
> > benefit from avoiding the runtime dma mapping for meta. But that may be one
> > bigger thing.
> 
> Yes, I think it would be a significant amount of work to use
> pre-mapped buffers for integrity data. I'd like to avoid that in this
> patch set and focus on just the ublk integrity data support.

Agree.

> 
> >
> > > >
> > > >
> > > > > size of struct ublksrv_io_desc. UBLK_F_INTEGRITY requires
> > > > > UBLK_F_USER_COPY and the addr field isn't used for UBLK_F_USER_COPY, so
> > > > > the two fields aren't needed simultaneously.
> > > > >
> > > > > Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
> > > > > ---
> > > > >  drivers/block/ublk_drv.c      | 43 +++++++++++++++++++++++++++++++----
> > > > >  include/uapi/linux/ublk_cmd.h | 27 ++++++++++++++++++++--
> > > > >  2 files changed, 64 insertions(+), 6 deletions(-)
> > > > >
> > > > > diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> > > > > index 2f9316febf83..51469e0627ff 100644
> > > > > --- a/drivers/block/ublk_drv.c
> > > > > +++ b/drivers/block/ublk_drv.c
> > > > > @@ -316,10 +316,36 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
> > > > >  static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
> > > > >  {
> > > > >       return ubq->flags & UBLK_F_ZONED;
> > > > >  }
> > > > >
> > > > > +static void ublk_setup_iod_buf(const struct ublk_queue *ubq,
> > > > > +                            const struct request *req,
> > > > > +                            struct ublksrv_io_desc *iod)
> > > > > +{
> > > > > +#ifdef CONFIG_BLK_DEV_INTEGRITY
> > > > > +     if (ubq->flags & UBLK_F_INTEGRITY) {
> > > > > +             struct bio_integrity_payload *bip;
> > > > > +             sector_t ref_tag_seed;
> > > > > +
> > > > > +             if (!blk_integrity_rq(req))
> > > > > +                     return;
> > > > > +
> > > > > +             bip = bio_integrity(req->bio);
> > > > > +             ref_tag_seed = bip_get_seed(bip);
> > > >
> > > > As mentioned, t10_pi_generate() and ext_pi_crc64_generate() builds
> > > > per-internal ref tag.
> > >
> > > As mentioned, the reftags for subsequent intervals can be computed by
> > > simply incrementing the seed. If the seed is assumed to always be
> > > start_sector >> (interval_exp - SECTOR_SHIFT), then it may not be
> > > necessary to communicate ref_tag_seed at all.
> >
> > Fair enough, but this should be documented in UAPI interface.
> >
> > >
> > > >
> > > >
> > > > > +             iod->integrity.ref_tag_lo = ref_tag_seed;
> > > > > +             iod->integrity.ref_tag_hi = ref_tag_seed >> 32;
> > > > > +             iod->integrity.app_tag = bip->app_tag;
> > > >
> > > > In case of io merge, each bio may have different ->app_tag.
> > >
> > > It seems like it would make more sense to prevent merging bios with
> > > different app_tags. In the common case where a request contains a
> > > single bio, which has a single app_tag, it would be much more
> > > efficient to communicate only the 1 app_tag instead of having to pass
> > > a separate app_tag for every logical block/integrity interval.
> >
> > OK.
> >
> > >
> > > >
> > > > Given you have to copy meta data via user copy, I suggest to follow the PI
> > > > standard and make it per-internal.
> > >
> > > How are you suggesting the ublk server access bip->app_tag and
> > > bip_get_seed(bip) (if overriding the reftag seed is supported)? Would
> > > the ublk server need to make another user copy syscall?
> > >
> > > Or would you prefer I drop the BIP_CHECK_* flag support from this
> > > patch set for now?
> >
> > I can understand the motivation, and extra syscall should be avoided for
> > communicating reftag & apptag only, given you have explained both can be
> > per-request instead of per-interval.
> >
> > But iod->addr should be avoided for this purpose, otherwise, new feature
> > can conflict with this usage easily.
> >
> > But per-io mmapped area can solve this issue, the meta size can be one parameter
> > of `ublksrv_ctrl_dev_info` with feature flag of UBLK_F_MMAPED_IO_META, what
> > do you think of this way?
> 
> Are you thinking this userspace-mapped region would consist of one u16
> app_tag per ublk I/O descriptor? It seems a bit complex to introduce
> an additional userspace-mapped region just to communicate the app_tag.
> What do you think about making the size of the I/O descriptor
> information dependent on the feature flags? Basically extend
> ublksrv_io_desc with an app_tag field if UBLK_F_INTEGRITY is set. This
> mechanism could be extended to communicate other per-request
> information in the future.

Right, that is basically what I suggested:

- add new feature flag UBLK_F_IO_DESC_EXT or UBLK_F_IO_META, meantime add
  .io_desc_ext_size or .io_meta_size field in `ublksrv_ctrl_dev_info`

- in case of this feature enabled, per-IO mmaped data size is increased to
  
  	sizeof(ublksrv_io_desc) + 
		.io_desc_ext_size or .io_meta_size(should be 8 bytes aligned)

This way should be easy to implement, and it is flexible for many extentions
in future, such as, you can set proper size for avoiding extra PI user copy
for small IO.


Thanks,
Ming