[PATCH RFC] io_uring: Pass whole sqe to commands

Breno Leitao posted 1 patch 1 year, 1 month ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/multipath-tcp/mptcp_net-next tags/patchew/20230406165705.3161734-1-leitao@debian.org
Maintainers: Ming Lei <ming.lei@redhat.com>, Jens Axboe <axboe@kernel.dk>, Keith Busch <kbusch@kernel.org>, Christoph Hellwig <hch@lst.de>, Sagi Grimberg <sagi@grimberg.me>, Pavel Begunkov <asml.silence@gmail.com>
drivers/block/ublk_drv.c  | 24 ++++++++++++------------
drivers/nvme/host/ioctl.c |  2 +-
include/linux/io_uring.h  |  2 +-
io_uring/opdef.c          |  2 +-
io_uring/uring_cmd.c      | 11 ++++++-----
5 files changed, 21 insertions(+), 20 deletions(-)
[PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Breno Leitao 1 year, 1 month ago
Currently uring CMD operation relies on having large SQEs, but future
operations might want to use normal SQE.

The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
but, for commands that use normal SQE size, it might be necessary to
access the initial SQE fields outside of the payload/cmd block.  So,
saves the whole SQE other than just the pdu.

This changes slighlty how the io_uring_cmd works, since the cmd
structures and callbacks are not opaque to io_uring anymore. I.e, the
callbacks can look at the SQE entries, not only, in the cmd structure.

The main advantage is that we don't need to create custom structures for
simple commands.

Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/block/ublk_drv.c  | 24 ++++++++++++------------
 drivers/nvme/host/ioctl.c |  2 +-
 include/linux/io_uring.h  |  2 +-
 io_uring/opdef.c          |  2 +-
 io_uring/uring_cmd.c      | 11 ++++++-----
 5 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index d1d1c8d606c8..0e35d82eb070 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1258,7 +1258,7 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
 
 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
-	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
+	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->sqe->cmd;
 	struct ublk_device *ub = cmd->file->private_data;
 	struct ublk_queue *ubq;
 	struct ublk_io *io;
@@ -1562,7 +1562,7 @@ static struct ublk_device *ublk_get_device_from_id(int idx)
 
 static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	int ublksrv_pid = (int)header->data[0];
 	struct gendisk *disk;
 	int ret = -EINVAL;
@@ -1624,7 +1624,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	cpumask_var_t cpumask;
 	unsigned long queue;
@@ -1675,7 +1675,7 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
 
 static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublksrv_ctrl_dev_info info;
 	struct ublk_device *ub;
@@ -1838,7 +1838,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
 
 static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 
 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
 			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
@@ -1857,7 +1857,7 @@ static int ublk_ctrl_stop_dev(struct ublk_device *ub)
 static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 
 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
@@ -1888,7 +1888,7 @@ static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
 static int ublk_ctrl_get_params(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublk_params_header ph;
 	int ret;
@@ -1919,7 +1919,7 @@ static int ublk_ctrl_get_params(struct ublk_device *ub,
 static int ublk_ctrl_set_params(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	struct ublk_params_header ph;
 	int ret = -EFAULT;
@@ -1977,7 +1977,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	int ret = -EINVAL;
 	int i;
 
@@ -2019,7 +2019,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
 static int ublk_ctrl_end_recovery(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	int ublksrv_pid = (int)header->data[0];
 	int ret = -EINVAL;
 
@@ -2086,7 +2086,7 @@ static int ublk_char_dev_permission(struct ublk_device *ub,
 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 		struct io_uring_cmd *cmd)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
 	void __user *argp = (void __user *)(unsigned long)header->addr;
 	char *dev_path = NULL;
@@ -2165,7 +2165,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
-	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->sqe->cmd;
 	struct ublk_device *ub = NULL;
 	int ret = -EINVAL;
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 723e7d5b778f..304da8f3200b 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -550,7 +550,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
 {
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
-	const struct nvme_uring_cmd *cmd = ioucmd->cmd;
+	const struct nvme_uring_cmd *cmd = (struct nvme_uring_cmd *)ioucmd->sqe->cmd;
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_uring_data d;
 	struct nvme_command c;
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 934e5dd4ccc0..650e6f12cc18 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -24,7 +24,7 @@ enum io_uring_cmd_flags {
 
 struct io_uring_cmd {
 	struct file	*file;
-	const void	*cmd;
+	const struct io_uring_sqe *sqe;
 	union {
 		/* callback to defer completions to task context */
 		void (*task_work_cb)(struct io_uring_cmd *cmd);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index cca7c5b55208..3b9c6489b8b6 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -627,7 +627,7 @@ const struct io_cold_def io_cold_defs[] = {
 	},
 	[IORING_OP_URING_CMD] = {
 		.name			= "URING_CMD",
-		.async_size		= uring_cmd_pdu_size(1),
+		.async_size		= 2 * sizeof(struct io_uring_sqe),
 		.prep_async		= io_uring_cmd_prep_async,
 	},
 	[IORING_OP_SEND_ZC] = {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 2e4c483075d3..9648134ccae1 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 int io_uring_cmd_prep_async(struct io_kiocb *req)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	size_t cmd_size;
+	size_t size = sizeof(struct io_uring_sqe);
 
 	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
 	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
 
-	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
+	if (req->ctx->flags & IORING_SETUP_SQE128)
+		size <<= 1;
 
-	memcpy(req->async_data, ioucmd->cmd, cmd_size);
+	memcpy(req->async_data, ioucmd->sqe, size);
 	return 0;
 }
 
@@ -96,7 +97,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->imu = ctx->user_bufs[index];
 		io_req_set_rsrc_node(req, ctx, 0);
 	}
-	ioucmd->cmd = sqe->cmd;
+	ioucmd->sqe = sqe;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 	return 0;
 }
@@ -128,7 +129,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	if (req_has_async_data(req))
-		ioucmd->cmd = req->async_data;
+		ioucmd->sqe = req->async_data;
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
 	if (ret == -EAGAIN) {
-- 
2.34.1
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Ming Lei 1 year, 1 month ago
On Thu, Apr 06, 2023 at 09:57:05AM -0700, Breno Leitao wrote:
> Currently uring CMD operation relies on having large SQEs, but future
> operations might want to use normal SQE.
> 
> The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
> but, for commands that use normal SQE size, it might be necessary to
> access the initial SQE fields outside of the payload/cmd block.  So,
> saves the whole SQE other than just the pdu.
> 
> This changes slighlty how the io_uring_cmd works, since the cmd
> structures and callbacks are not opaque to io_uring anymore. I.e, the
> callbacks can look at the SQE entries, not only, in the cmd structure.
> 
> The main advantage is that we don't need to create custom structures for
> simple commands.
> 
> Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---

...

> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> index 2e4c483075d3..9648134ccae1 100644
> --- a/io_uring/uring_cmd.c
> +++ b/io_uring/uring_cmd.c
> @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
>  int io_uring_cmd_prep_async(struct io_kiocb *req)
>  {
>  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> -	size_t cmd_size;
> +	size_t size = sizeof(struct io_uring_sqe);
>  
>  	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
>  	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
>  
> -	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
> +	if (req->ctx->flags & IORING_SETUP_SQE128)
> +		size <<= 1;
>  
> -	memcpy(req->async_data, ioucmd->cmd, cmd_size);
> +	memcpy(req->async_data, ioucmd->sqe, size);

The copy will make some fields of sqe become READ TWICE, and driver may see
different sqe field value compared with the one observed in io_init_req().

Can this kind of inconsistency cause trouble to driver?

If it isn't one problem, this patch looks fine.

But I guess any access on cmd->sqe in driver may have to be careful for dealing
with potential post-sqe-update.

Thanks,
Ming
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Breno Leitao 1 year, 1 month ago
Hello Ming,

On Thu, Apr 13, 2023 at 10:56:49AM +0800, Ming Lei wrote:
> On Thu, Apr 06, 2023 at 09:57:05AM -0700, Breno Leitao wrote:
> > Currently uring CMD operation relies on having large SQEs, but future
> > operations might want to use normal SQE.
> > 
> > The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
> > but, for commands that use normal SQE size, it might be necessary to
> > access the initial SQE fields outside of the payload/cmd block.  So,
> > saves the whole SQE other than just the pdu.
> > 
> > This changes slighlty how the io_uring_cmd works, since the cmd
> > structures and callbacks are not opaque to io_uring anymore. I.e, the
> > callbacks can look at the SQE entries, not only, in the cmd structure.
> > 
> > The main advantage is that we don't need to create custom structures for
> > simple commands.
> > 
> > Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
> > Signed-off-by: Breno Leitao <leitao@debian.org>
> > ---
> 
> ...
> 
> > diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> > index 2e4c483075d3..9648134ccae1 100644
> > --- a/io_uring/uring_cmd.c
> > +++ b/io_uring/uring_cmd.c
> > @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
> >  int io_uring_cmd_prep_async(struct io_kiocb *req)
> >  {
> >  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> > -	size_t cmd_size;
> > +	size_t size = sizeof(struct io_uring_sqe);
> >  
> >  	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
> >  	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
> >  
> > -	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
> > +	if (req->ctx->flags & IORING_SETUP_SQE128)
> > +		size <<= 1;
> >  
> > -	memcpy(req->async_data, ioucmd->cmd, cmd_size);
> > +	memcpy(req->async_data, ioucmd->sqe, size);
> 
> The copy will make some fields of sqe become READ TWICE, and driver may see
> different sqe field value compared with the one observed in io_init_req().

This copy only happens if the operation goes to the async path
(calling io_uring_cmd_prep_async()).  This only happens if
f_op->uring_cmd() returns -EAGAIN.

          ret = file->f_op->uring_cmd(ioucmd, issue_flags);
          if (ret == -EAGAIN) {
                  if (!req_has_async_data(req)) {
                          if (io_alloc_async_data(req))
                                  return -ENOMEM;
                          io_uring_cmd_prep_async(req);
                  }
                  return -EAGAIN;
          }

Are you saying that after this copy, the operation is still reading from
sqe instead of req->async_data?

If you have an example of the two copes flow, that would be great.

Thanks for the review,
Breno
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Ming Lei 1 year, 1 month ago
On Thu, Apr 13, 2023 at 09:47:56AM -0700, Breno Leitao wrote:
> Hello Ming,
> 
> On Thu, Apr 13, 2023 at 10:56:49AM +0800, Ming Lei wrote:
> > On Thu, Apr 06, 2023 at 09:57:05AM -0700, Breno Leitao wrote:
> > > Currently uring CMD operation relies on having large SQEs, but future
> > > operations might want to use normal SQE.
> > > 
> > > The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
> > > but, for commands that use normal SQE size, it might be necessary to
> > > access the initial SQE fields outside of the payload/cmd block.  So,
> > > saves the whole SQE other than just the pdu.
> > > 
> > > This changes slighlty how the io_uring_cmd works, since the cmd
> > > structures and callbacks are not opaque to io_uring anymore. I.e, the
> > > callbacks can look at the SQE entries, not only, in the cmd structure.
> > > 
> > > The main advantage is that we don't need to create custom structures for
> > > simple commands.
> > > 
> > > Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
> > > Signed-off-by: Breno Leitao <leitao@debian.org>
> > > ---
> > 
> > ...
> > 
> > > diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> > > index 2e4c483075d3..9648134ccae1 100644
> > > --- a/io_uring/uring_cmd.c
> > > +++ b/io_uring/uring_cmd.c
> > > @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
> > >  int io_uring_cmd_prep_async(struct io_kiocb *req)
> > >  {
> > >  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> > > -	size_t cmd_size;
> > > +	size_t size = sizeof(struct io_uring_sqe);
> > >  
> > >  	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
> > >  	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
> > >  
> > > -	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
> > > +	if (req->ctx->flags & IORING_SETUP_SQE128)
> > > +		size <<= 1;
> > >  
> > > -	memcpy(req->async_data, ioucmd->cmd, cmd_size);
> > > +	memcpy(req->async_data, ioucmd->sqe, size);
> > 
> > The copy will make some fields of sqe become READ TWICE, and driver may see
> > different sqe field value compared with the one observed in io_init_req().
> 
> This copy only happens if the operation goes to the async path
> (calling io_uring_cmd_prep_async()).  This only happens if
> f_op->uring_cmd() returns -EAGAIN.
> 
>           ret = file->f_op->uring_cmd(ioucmd, issue_flags);
>           if (ret == -EAGAIN) {
>                   if (!req_has_async_data(req)) {
>                           if (io_alloc_async_data(req))
>                                   return -ENOMEM;
>                           io_uring_cmd_prep_async(req);
>                   }
>                   return -EAGAIN;
>           }
> 
> Are you saying that after this copy, the operation is still reading from
> sqe instead of req->async_data?

I meant that the 2nd read is on the sqe copy(req->aync_data), but same
fields can become different between the two READs(first is done on original
SQE during io_init_req(), and second is done on sqe copy in driver).

Will this kind of inconsistency cause trouble for driver? Cause READ
TWICE becomes possible with this patch.

> 
> If you have an example of the two copes flow, that would be great.

Not any example yet, but also not see any access on cmd->sqe(except for cmd_op)
in your patches too.


Thanks,
Ming
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Pavel Begunkov 1 year, 1 month ago
On 4/14/23 03:12, Ming Lei wrote:
> On Thu, Apr 13, 2023 at 09:47:56AM -0700, Breno Leitao wrote:
>> Hello Ming,
>>
>> On Thu, Apr 13, 2023 at 10:56:49AM +0800, Ming Lei wrote:
>>> On Thu, Apr 06, 2023 at 09:57:05AM -0700, Breno Leitao wrote:
>>>> Currently uring CMD operation relies on having large SQEs, but future
>>>> operations might want to use normal SQE.
>>>>
>>>> The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
>>>> but, for commands that use normal SQE size, it might be necessary to
>>>> access the initial SQE fields outside of the payload/cmd block.  So,
>>>> saves the whole SQE other than just the pdu.
>>>>
>>>> This changes slighlty how the io_uring_cmd works, since the cmd
>>>> structures and callbacks are not opaque to io_uring anymore. I.e, the
>>>> callbacks can look at the SQE entries, not only, in the cmd structure.
>>>>
>>>> The main advantage is that we don't need to create custom structures for
>>>> simple commands.
>>>>
>>>> Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
>>>> Signed-off-by: Breno Leitao <leitao@debian.org>
>>>> ---
>>>
>>> ...
>>>
>>>> diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
>>>> index 2e4c483075d3..9648134ccae1 100644
>>>> --- a/io_uring/uring_cmd.c
>>>> +++ b/io_uring/uring_cmd.c
>>>> @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
>>>>   int io_uring_cmd_prep_async(struct io_kiocb *req)
>>>>   {
>>>>   	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
>>>> -	size_t cmd_size;
>>>> +	size_t size = sizeof(struct io_uring_sqe);
>>>>   
>>>>   	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
>>>>   	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
>>>>   
>>>> -	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
>>>> +	if (req->ctx->flags & IORING_SETUP_SQE128)
>>>> +		size <<= 1;
>>>>   
>>>> -	memcpy(req->async_data, ioucmd->cmd, cmd_size);
>>>> +	memcpy(req->async_data, ioucmd->sqe, size);
>>>
>>> The copy will make some fields of sqe become READ TWICE, and driver may see
>>> different sqe field value compared with the one observed in io_init_req().
>>
>> This copy only happens if the operation goes to the async path
>> (calling io_uring_cmd_prep_async()).  This only happens if
>> f_op->uring_cmd() returns -EAGAIN.
>>
>>            ret = file->f_op->uring_cmd(ioucmd, issue_flags);
>>            if (ret == -EAGAIN) {
>>                    if (!req_has_async_data(req)) {
>>                            if (io_alloc_async_data(req))
>>                                    return -ENOMEM;
>>                            io_uring_cmd_prep_async(req);
>>                    }
>>                    return -EAGAIN;
>>            }
>>
>> Are you saying that after this copy, the operation is still reading from
>> sqe instead of req->async_data?
> 
> I meant that the 2nd read is on the sqe copy(req->aync_data), but same
> fields can become different between the two READs(first is done on original
> SQE during io_init_req(), and second is done on sqe copy in driver).
> 
> Will this kind of inconsistency cause trouble for driver? Cause READ
> TWICE becomes possible with this patch.

Right it might happen, and I was keeping that in mind, but it's not
specific to this patch. It won't reload core io_uring bits, and all
fields cmds use already have this problem.

Unless there is a better option, the direction we'll be moving in is
adding a preparation step that should read and stash parts of SQE
it cares about, which should also make full SQE copy not
needed / optional.

>> If you have an example of the two copes flow, that would be great.
> 
> Not any example yet, but also not see any access on cmd->sqe(except for cmd_op)
> in your patches too.

-- 
Pavel Begunkov
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Ming Lei 1 year, 1 month ago
On Fri, Apr 14, 2023 at 02:12:10PM +0100, Pavel Begunkov wrote:
> On 4/14/23 03:12, Ming Lei wrote:
> > On Thu, Apr 13, 2023 at 09:47:56AM -0700, Breno Leitao wrote:
> > > Hello Ming,
> > > 
> > > On Thu, Apr 13, 2023 at 10:56:49AM +0800, Ming Lei wrote:
> > > > On Thu, Apr 06, 2023 at 09:57:05AM -0700, Breno Leitao wrote:
> > > > > Currently uring CMD operation relies on having large SQEs, but future
> > > > > operations might want to use normal SQE.
> > > > > 
> > > > > The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
> > > > > but, for commands that use normal SQE size, it might be necessary to
> > > > > access the initial SQE fields outside of the payload/cmd block.  So,
> > > > > saves the whole SQE other than just the pdu.
> > > > > 
> > > > > This changes slighlty how the io_uring_cmd works, since the cmd
> > > > > structures and callbacks are not opaque to io_uring anymore. I.e, the
> > > > > callbacks can look at the SQE entries, not only, in the cmd structure.
> > > > > 
> > > > > The main advantage is that we don't need to create custom structures for
> > > > > simple commands.
> > > > > 
> > > > > Suggested-by: Pavel Begunkov <asml.silence@gmail.com>
> > > > > Signed-off-by: Breno Leitao <leitao@debian.org>
> > > > > ---
> > > > 
> > > > ...
> > > > 
> > > > > diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
> > > > > index 2e4c483075d3..9648134ccae1 100644
> > > > > --- a/io_uring/uring_cmd.c
> > > > > +++ b/io_uring/uring_cmd.c
> > > > > @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
> > > > >   int io_uring_cmd_prep_async(struct io_kiocb *req)
> > > > >   {
> > > > >   	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> > > > > -	size_t cmd_size;
> > > > > +	size_t size = sizeof(struct io_uring_sqe);
> > > > >   	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
> > > > >   	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
> > > > > -	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
> > > > > +	if (req->ctx->flags & IORING_SETUP_SQE128)
> > > > > +		size <<= 1;
> > > > > -	memcpy(req->async_data, ioucmd->cmd, cmd_size);
> > > > > +	memcpy(req->async_data, ioucmd->sqe, size);
> > > > 
> > > > The copy will make some fields of sqe become READ TWICE, and driver may see
> > > > different sqe field value compared with the one observed in io_init_req().
> > > 
> > > This copy only happens if the operation goes to the async path
> > > (calling io_uring_cmd_prep_async()).  This only happens if
> > > f_op->uring_cmd() returns -EAGAIN.
> > > 
> > >            ret = file->f_op->uring_cmd(ioucmd, issue_flags);
> > >            if (ret == -EAGAIN) {
> > >                    if (!req_has_async_data(req)) {
> > >                            if (io_alloc_async_data(req))
> > >                                    return -ENOMEM;
> > >                            io_uring_cmd_prep_async(req);
> > >                    }
> > >                    return -EAGAIN;
> > >            }
> > > 
> > > Are you saying that after this copy, the operation is still reading from
> > > sqe instead of req->async_data?
> > 
> > I meant that the 2nd read is on the sqe copy(req->aync_data), but same
> > fields can become different between the two READs(first is done on original
> > SQE during io_init_req(), and second is done on sqe copy in driver).
> > 
> > Will this kind of inconsistency cause trouble for driver? Cause READ
> > TWICE becomes possible with this patch.
> 
> Right it might happen, and I was keeping that in mind, but it's not
> specific to this patch. It won't reload core io_uring bits, and all

It depends if driver reloads core bits or not, anyway the patch exports
all fields and opens the window.

> fields cmds use already have this problem.

driver is supposed to load cmds field just once too, right?

> 
> Unless there is a better option, the direction we'll be moving in is
> adding a preparation step that should read and stash parts of SQE
> it cares about, which should also make full SQE copy not
> needed / optional.

Sounds good.


Thanks,
Ming
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Pavel Begunkov 1 year, 1 month ago
On 4/14/23 14:59, Ming Lei wrote:
[...]
>>> Will this kind of inconsistency cause trouble for driver? Cause READ
>>> TWICE becomes possible with this patch.
>>
>> Right it might happen, and I was keeping that in mind, but it's not
>> specific to this patch. It won't reload core io_uring bits, and all
> 
> It depends if driver reloads core bits or not, anyway the patch exports
> all fields and opens the window.

If a driver tries to reload core bits and even worse modify io_uring
request without proper helpers, it should be rooted out and thrown
into a bin. In any case cmds are expected to exercise cautiousness
while working with SQEs as they may change. I'd even argue that
hiding it as void *cmd makes it much less obvious.

>> fields cmds use already have this problem.
> 
> driver is supposed to load cmds field just once too, right?

Ideally they shouldn't, but it's fine to reload as long as
the cmd can handle it. And it should always be READ_ONCE()
and so.

>> Unless there is a better option, the direction we'll be moving in is
>> adding a preparation step that should read and stash parts of SQE
>> it cares about, which should also make full SQE copy not
>> needed / optional.
> 
> Sounds good.

-- 
Pavel Begunkov
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Ming Lei 1 year, 1 month ago
On Fri, Apr 14, 2023 at 03:56:47PM +0100, Pavel Begunkov wrote:
> On 4/14/23 14:59, Ming Lei wrote:
> [...]
> > > > Will this kind of inconsistency cause trouble for driver? Cause READ
> > > > TWICE becomes possible with this patch.
> > > 
> > > Right it might happen, and I was keeping that in mind, but it's not
> > > specific to this patch. It won't reload core io_uring bits, and all
> > 
> > It depends if driver reloads core bits or not, anyway the patch exports
> > all fields and opens the window.
> 
> If a driver tries to reload core bits and even worse modify io_uring
> request without proper helpers, it should be rooted out and thrown
> into a bin. In any case cmds are expected to exercise cautiousness
> while working with SQEs as they may change. I'd even argue that
> hiding it as void *cmd makes it much less obvious.

Fair enough, if it is well documented, then people will know these
problems and any change in this area can get careful review.


Thanks, 
Ming
Re: io_uring: Pass whole sqe to commands: Tests Results
Posted by MPTCP CI 1 year, 1 month ago
Hi Breno,

Thank you for your modifications, that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal (except selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/6539795959119872
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/6539795959119872/summary/summary.txt

- KVM Validation: debug (only selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/5273158563921920
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5273158563921920/summary/summary.txt

- KVM Validation: normal (only selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/4710208610500608
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/4710208610500608/summary/summary.txt

- KVM Validation: debug (except selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/5836108517343232
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5836108517343232/summary/summary.txt

Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/9999e6372112


If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:

    $ cd [kernel source code]
    $ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
        --pull always mptcp/mptcp-upstream-virtme-docker:latest \
        auto-debug

For more details:

    https://github.com/multipath-tcp/mptcp-upstream-virtme-docker


Please note that despite all the efforts that have been already done to have a
stable tests suite when executed on a public CI like here, it is possible some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)

Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (Tessares)
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Keith Busch 1 year, 1 month ago
On Thu, Apr 06, 2023 at 09:57:05AM -0700, Breno Leitao wrote:
> Currently uring CMD operation relies on having large SQEs, but future
> operations might want to use normal SQE.
> 
> The io_uring_cmd currently only saves the payload (cmd) part of the SQE,
> but, for commands that use normal SQE size, it might be necessary to
> access the initial SQE fields outside of the payload/cmd block.  So,
> saves the whole SQE other than just the pdu.
> 
> This changes slighlty how the io_uring_cmd works, since the cmd
> structures and callbacks are not opaque to io_uring anymore. I.e, the
> callbacks can look at the SQE entries, not only, in the cmd structure.
> 
> The main advantage is that we don't need to create custom structures for
> simple commands.

This looks good to me. The only disadvantage I can see is that the async
fallback allocates just a tiny bit more data than before, but no biggie.

Reviewed-by: Keith Busch <kbusch@kernel.org>
 
> @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
>  int io_uring_cmd_prep_async(struct io_kiocb *req)
>  {
>  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> -	size_t cmd_size;
> +	size_t size = sizeof(struct io_uring_sqe);
>  
>  	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
>  	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);

One minor suggestion. The above is the only user of uring_cmd_pdu_size() now,
which is kind of a convoluted way to enfoce the offset of the 'cmd' field. It
may be more clear to replace these with:

	BUILD_BUG_ON(offsetof(struct io_uring_sqe, cmd) == 48);
  
> -	cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
> +	if (req->ctx->flags & IORING_SETUP_SQE128)
> +		size <<= 1;
>  
> -	memcpy(req->async_data, ioucmd->cmd, cmd_size);
> +	memcpy(req->async_data, ioucmd->sqe, size);
>  	return 0;
>  }
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Breno Leitao 1 year, 1 month ago
On Fri, Apr 07, 2023 at 12:51:44PM -0600, Keith Busch wrote:
> > @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
> >  int io_uring_cmd_prep_async(struct io_kiocb *req)
> >  {
> >  	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
> > -	size_t cmd_size;
> > +	size_t size = sizeof(struct io_uring_sqe);
> >  
> >  	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
> >  	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
> 
> One minor suggestion. The above is the only user of uring_cmd_pdu_size() now,
> which is kind of a convoluted way to enfoce the offset of the 'cmd' field. It
> may be more clear to replace these with:

I agree with you here. Basically it is a bug if the payload (pdu) size is
is different than 16 for single SQE or != 80 for extended SQE.

So, basically it is checking for two things:
   * the cmd offset is 48
   * the io_uring_sqe struct is 64

Since this is a uapi, I am not confidence that they will change at all.
I can replace the code with your suggestion.

> 	BUILD_BUG_ON(offsetof(struct io_uring_sqe, cmd) == 48);

It should be "offset(struct io_uring_sqe, cmd) != 48)", right?

Thanks for the review!
Re: [PATCH RFC] io_uring: Pass whole sqe to commands
Posted by Pavel Begunkov 1 year, 1 month ago
On 4/11/23 13:22, Breno Leitao wrote:
> On Fri, Apr 07, 2023 at 12:51:44PM -0600, Keith Busch wrote:
>>> @@ -63,14 +63,15 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done);
>>>   int io_uring_cmd_prep_async(struct io_kiocb *req)
>>>   {
>>>   	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
>>> -	size_t cmd_size;
>>> +	size_t size = sizeof(struct io_uring_sqe);
>>>   
>>>   	BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
>>>   	BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
>>
>> One minor suggestion. The above is the only user of uring_cmd_pdu_size() now,
>> which is kind of a convoluted way to enfoce the offset of the 'cmd' field. It
>> may be more clear to replace these with:
> 
> I agree with you here. Basically it is a bug if the payload (pdu) size is
> is different than 16 for single SQE or != 80 for extended SQE.
> 
> So, basically it is checking for two things:
>     * the cmd offset is 48
>     * the io_uring_sqe struct is 64
> 
> Since this is a uapi, I am not confidence that they will change at all.
> I can replace the code with your suggestion.
> 
>> 	BUILD_BUG_ON(offsetof(struct io_uring_sqe, cmd) == 48);
> 
> It should be "offset(struct io_uring_sqe, cmd) != 48)", right?

Which is already checked, see io_uring_init()

-- 
Pavel Begunkov
Re: io_uring: Pass whole sqe to commands: Tests Results
Posted by MPTCP CI 1 year, 1 month ago
Hi Breno,

Thank you for your modifications, that's great!

Our CI did some validations and here is its report:

- KVM Validation: normal (except selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/6625829522767872
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/6625829522767872/summary/summary.txt

- KVM Validation: debug (except selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/5711035848458240
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5711035848458240/summary/summary.txt

- KVM Validation: normal (only selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/4585135941615616
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/4585135941615616/summary/summary.txt

- KVM Validation: debug (only selftest_mptcp_join):
  - Success! ✅:
  - Task: https://cirrus-ci.com/task/5148085895036928
  - Summary: https://api.cirrus-ci.com/v1/artifact/task/5148085895036928/summary/summary.txt

Initiator: Patchew Applier
Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/035a67e200f0


If there are some issues, you can reproduce them using the same environment as
the one used by the CI thanks to a docker image, e.g.:

    $ cd [kernel source code]
    $ docker run -v "${PWD}:${PWD}:rw" -w "${PWD}" --privileged --rm -it \
        --pull always mptcp/mptcp-upstream-virtme-docker:latest \
        auto-debug

For more details:

    https://github.com/multipath-tcp/mptcp-upstream-virtme-docker


Please note that despite all the efforts that have been already done to have a
stable tests suite when executed on a public CI like here, it is possible some
reported issues are not due to your modifications. Still, do not hesitate to
help us improve that ;-)

Cheers,
MPTCP GH Action bot
Bot operated by Matthieu Baerts (Tessares)