[Qemu-devel] [PATCH v4] block/nvme: add support for discard

Posted by Maxim Levitsky 4 years, 10 months ago
Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
---
 block/nvme.c       | 81 ++++++++++++++++++++++++++++++++++++++++++++++
 block/trace-events |  2 ++
 2 files changed, 83 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index 02e0846643..96a715dcc1 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -111,6 +111,7 @@ typedef struct {
     bool plugged;
 
     bool supports_write_zeros;
+    bool supports_discard;
 
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
@@ -460,6 +461,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
                           s->page_size / sizeof(uint64_t) * s->page_size);
 
     s->supports_write_zeros = (idctrl->oncs & NVME_ONCS_WRITE_ZEROS) != 0;
+    s->supports_discard = (idctrl->oncs & NVME_ONCS_DSM) != 0;
 
     memset(resp, 0, 4096);
 
@@ -1149,6 +1151,84 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
 }
 
 
+static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int bytes)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    NvmeDsmRange *buf;
+    QEMUIOVector local_qiov;
+    int r;
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_DSM,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = 0, /*number of ranges - 0 based*/
+        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_discard) {
+        return -ENOTSUP;
+    }
+
+    assert(s->nr_queues > 1);
+
+    buf = qemu_try_blockalign0(bs, 4096);
+    if (!buf) {
+            return -ENOMEM;
+    }
+
+    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
+    buf->slba = cpu_to_le64(offset >> s->blkshift);
+    buf->cattr = 0;
+
+    qemu_iovec_init(&local_qiov, 1);
+    qemu_iovec_add(&local_qiov, buf, 4096);
+
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    r = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (r) {
+        req->busy = false;
+        return r;
+    }
+
+    trace_nvme_dsm(s, offset, bytes);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    r = nvme_cmd_unmap_qiov(bs, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+    if (r) {
+        return r;
+    }
+
+    trace_nvme_dsm_done(s, offset, bytes, data.ret);
+
+    qemu_iovec_destroy(&local_qiov);
+    qemu_vfree(buf);
+    return data.ret;
+
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1363,6 +1443,7 @@ static BlockDriver bdrv_nvme = {
     .bdrv_co_pwritev          = nvme_co_pwritev,
 
     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+    .bdrv_co_pdiscard         = nvme_co_pdiscard,
 
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
diff --git a/block/trace-events b/block/trace-events
index 12f363bb44..f763f79d99 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -152,6 +152,8 @@ nvme_write_zeros(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offs
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
-- 
2.17.2


Re: [Qemu-devel] [PATCH v4] block/nvme: add support for discard
Posted by Max Reitz 4 years, 10 months ago
On 03.07.19 18:07, Maxim Levitsky wrote:
> Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> ---
>  block/nvme.c       | 81 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/trace-events |  2 ++
>  2 files changed, 83 insertions(+)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 02e0846643..96a715dcc1 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c

[...]

> @@ -460,6 +461,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
>                            s->page_size / sizeof(uint64_t) * s->page_size);
>  
>      s->supports_write_zeros = (idctrl->oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> +    s->supports_discard = (idctrl->oncs & NVME_ONCS_DSM) != 0;

Shouldn’t this be le16_to_cpu(idctrl->oncs)?  Same in the previous
patch, now that I think about it.
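
For concreteness, a minimal sketch of the byte-swapped variant (just a
sketch; oncs is a little-endian 16-bit field in the identify data):

    uint16_t oncs = le16_to_cpu(idctrl->oncs);

    s->supports_write_zeros = (oncs & NVME_ONCS_WRITE_ZEROS) != 0;
    s->supports_discard     = (oncs & NVME_ONCS_DSM) != 0;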

>  
>      memset(resp, 0, 4096);
>  
> @@ -1149,6 +1151,84 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
>  }
>  
>  
> +static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
> +                                         int64_t offset,
> +                                         int bytes)
> +{
> +    BDRVNVMeState *s = bs->opaque;
> +    NVMeQueuePair *ioq = s->queues[1];
> +    NVMeRequest *req;
> +    NvmeDsmRange *buf;
> +    QEMUIOVector local_qiov;
> +    int r;
> +
> +    NvmeCmd cmd = {
> +        .opcode = NVME_CMD_DSM,
> +        .nsid = cpu_to_le32(s->nsid),
> +        .cdw10 = 0, /*number of ranges - 0 based*/

I’d make this cpu_to_le32(0).  Sure, there is no effect for 0, but in
theory this is a variable value, so...
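
That is (a sketch of the command with the swap made explicit):

    NvmeCmd cmd = {
        .opcode = NVME_CMD_DSM,
        .nsid   = cpu_to_le32(s->nsid),
        .cdw10  = cpu_to_le32(0),      /* number of ranges - 0 based */
        .cdw11  = cpu_to_le32(1 << 2), /* deallocate bit */
    };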

> +        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
> +    };
> +
> +    NVMeCoData data = {
> +        .ctx = bdrv_get_aio_context(bs),
> +        .ret = -EINPROGRESS,
> +    };
> +
> +    if (!s->supports_discard) {
> +        return -ENOTSUP;
> +    }
> +
> +    assert(s->nr_queues > 1);
> +
> +    buf = qemu_try_blockalign0(bs, 4096);

I’m not sure whether this needs to be 4096 or whether 16 would suffice,
 but I suppose this gets us the least trouble.

> +    if (!buf) {
> +            return -ENOMEM;

Indentation is off.

> +    }
> +
> +    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
> +    buf->slba = cpu_to_le64(offset >> s->blkshift);
> +    buf->cattr = 0;
> +
> +    qemu_iovec_init(&local_qiov, 1);
> +    qemu_iovec_add(&local_qiov, buf, 4096);
> +
> +    req = nvme_get_free_req(ioq);
> +    assert(req);
> +
> +    qemu_co_mutex_lock(&s->dma_map_lock);
> +    r = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
> +    qemu_co_mutex_unlock(&s->dma_map_lock);
> +
> +    if (r) {
> +        req->busy = false;
> +        return r;

Leaking buf and local_qiov here.
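
One way to plug this (and the identical leak on the unmap failure path
below) would be a single cleanup label - a sketch only, the actual
respin may differ:

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        req->busy = false;
        goto out;                /* release buf and local_qiov */
    }

    /* ... submit the command and yield until data.ret != -EINPROGRESS ... */

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_unmap_qiov(bs, &local_qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);

out:
    qemu_iovec_destroy(&local_qiov);
    qemu_vfree(buf);
    return r ? r : data.ret;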

> +    }
> +
> +    trace_nvme_dsm(s, offset, bytes);
> +
> +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> +
> +    data.co = qemu_coroutine_self();
> +    while (data.ret == -EINPROGRESS) {
> +        qemu_coroutine_yield();
> +    }
> +
> +    qemu_co_mutex_lock(&s->dma_map_lock);
> +    r = nvme_cmd_unmap_qiov(bs, &local_qiov);
> +    qemu_co_mutex_unlock(&s->dma_map_lock);
> +    if (r) {
> +        return r;

Leaking buf and local_qiov here, too.

Max

> +    }
> +
> +    trace_nvme_dsm_done(s, offset, bytes, data.ret);
> +
> +    qemu_iovec_destroy(&local_qiov);
> +    qemu_vfree(buf);
> +    return data.ret;
> +
> +}
> +
> +
>  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
>                                 BlockReopenQueue *queue, Error **errp)
>  {

Re: [Qemu-devel] [PATCH v4] block/nvme: add support for discard
Posted by Maxim Levitsky 4 years, 10 months ago
On Fri, 2019-07-05 at 15:50 +0200, Max Reitz wrote:
> On 03.07.19 18:07, Maxim Levitsky wrote:
> > Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
> > ---
> >  block/nvme.c       | 81 ++++++++++++++++++++++++++++++++++++++++++++++
> >  block/trace-events |  2 ++
> >  2 files changed, 83 insertions(+)
> > 
> > diff --git a/block/nvme.c b/block/nvme.c
> > index 02e0846643..96a715dcc1 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> 
> [...]
> 
> > @@ -460,6 +461,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> >                            s->page_size / sizeof(uint64_t) * s->page_size);
> >  
> >      s->supports_write_zeros = (idctrl->oncs & NVME_ONCS_WRITE_ZEROS) != 0;
> > +    s->supports_discard = (idctrl->oncs & NVME_ONCS_DSM) != 0;
> 
> Shouldn’t this be le16_to_cpu(idctrl->oncs)?  Same in the previous
> patch, now that I think about it.

This reminds me of how I basically scrubbed through nvme-mdev looking for endianness bugs,
manually searching for every reference to a hardware-controlled structure.
Thank you very much!!


> 
> >  
> >      memset(resp, 0, 4096);
> >  
> > @@ -1149,6 +1151,84 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
> >  }
> >  
> >  
> > +static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
> > +                                         int64_t offset,
> > +                                         int bytes)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    NVMeQueuePair *ioq = s->queues[1];
> > +    NVMeRequest *req;
> > +    NvmeDsmRange *buf;
> > +    QEMUIOVector local_qiov;
> > +    int r;
> > +
> > +    NvmeCmd cmd = {
> > +        .opcode = NVME_CMD_DSM,
> > +        .nsid = cpu_to_le32(s->nsid),
> > +        .cdw10 = 0, /*number of ranges - 0 based*/
> 
> I’d make this cpu_to_le32(0).  Sure, there is no effect for 0, but in
> theory this is a variable value, so...
Let it be.

> 
> > +        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
> > +    };
> > +
> > +    NVMeCoData data = {
> > +        .ctx = bdrv_get_aio_context(bs),
> > +        .ret = -EINPROGRESS,
> > +    };
> > +
> > +    if (!s->supports_discard) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    assert(s->nr_queues > 1);
> > +
> > +    buf = qemu_try_blockalign0(bs, 4096);
> 
> I’m not sure whether this needs to be 4096 or whether 16 would suffice,
>  but I suppose this gets us the least trouble.
Exactly. Even better, now that I think about it, would be to use 's->page_size', the device page size.
It is at least 4K (the spec minimum).
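
I.e. the allocation and iovec setup would become something like (a
sketch, not the respun patch):

    buf = qemu_try_blockalign0(bs, s->page_size);
    if (!buf) {
        return -ENOMEM;
    }

    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
    buf->slba = cpu_to_le64(offset >> s->blkshift);
    buf->cattr = 0;

    qemu_iovec_init(&local_qiov, 1);
    qemu_iovec_add(&local_qiov, buf, s->page_size);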

Speaking of which, there is a theoretical bug there - the device can in theory indicate that its minimum page size is larger than 4K.
The kernel currently rejects such devices, but here the driver just forces a 4K page size in the CC register.
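
A rough sketch of the kind of init-time guard that would need (the
NVME_CAP_MPSMIN macro and the surrounding context are assumptions here,
not part of this patch):

    /* Hypothetical check in nvme_init(): MPSMIN == 0 means the
     * controller accepts 4K pages; anything larger cannot be honoured
     * by the 4K page size this driver programs into CC. */
    if (NVME_CAP_MPSMIN(cap) > 0) {
        error_setg(errp, "Controller minimum page size larger than 4K "
                   "is not supported");
        /* ...and fail initialization here instead of forcing 4K */
    }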

> 
> > +    if (!buf) {
> > +            return -ENOMEM;
> 
> Indentation is off.
True!

> 
> > +    }
> > +
> > +    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
> > +    buf->slba = cpu_to_le64(offset >> s->blkshift);
> > +    buf->cattr = 0;
> > +
> > +    qemu_iovec_init(&local_qiov, 1);
> > +    qemu_iovec_add(&local_qiov, buf, 4096);
> > +
> > +    req = nvme_get_free_req(ioq);
> > +    assert(req);
> > +
> > +    qemu_co_mutex_lock(&s->dma_map_lock);
> > +    r = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
> > +    qemu_co_mutex_unlock(&s->dma_map_lock);
> > +
> > +    if (r) {
> > +        req->busy = false;
> > +        return r;
> 
> Leaking buf and local_qiov here.
True, fixed.

> 
> > +    }
> > +
> > +    trace_nvme_dsm(s, offset, bytes);
> > +
> > +    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
> > +
> > +    data.co = qemu_coroutine_self();
> > +    while (data.ret == -EINPROGRESS) {
> > +        qemu_coroutine_yield();
> > +    }
> > +
> > +    qemu_co_mutex_lock(&s->dma_map_lock);
> > +    r = nvme_cmd_unmap_qiov(bs, &local_qiov);
> > +    qemu_co_mutex_unlock(&s->dma_map_lock);
> > +    if (r) {
> > +        return r;
> 
> Leaking buf and local_qiov here, too.
True, fixed - next time I will check the error paths better.

> 
> Max
> 
> > +    }
> > +
> > +    trace_nvme_dsm_done(s, offset, bytes, data.ret);
> > +
> > +    qemu_iovec_destroy(&local_qiov);
> > +    qemu_vfree(buf);
> > +    return data.ret;
> > +
> > +}
> > +
> > +
> >  static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
> >                                 BlockReopenQueue *queue, Error **errp)
> >  {
> 
> 

Thanks for the review,
	Best regards,
		Maxim Levitsky