From: Dominique Martinet <asmadeus@codewreck.org>
This simplifies the code quite a bit and should fix issues with
blowing up when iov_iter points at kmalloc data
RFC - Not really tested yet!!
This brings two major changes to how we've always done things with
virtio 9p though:
- We no longer fill in "chan->sg" with user data, but instead allocate a
  scatterlist; this should be neither a problem nor a slowdown, as the
  previous code allocated a page list instead; the main difference is
  that this might eventually lead to lifting the 512KB msize limit, if
  compatible with virtio?
- We no longer keep track of how many pages have been pinned, so we no
  longer block once p9_max_pages pages are pinned -- the limit was
  `nr_free_buffer_pages()/4` at driver probe time, which is rather
  arbitrary; it might not really be required? A user could try to
  lock memory by issuing IO faster than virtio can handle...
We could just assume all requests come from users and count everything
if that turned out to be useful...
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
Changes in v2:
- Full rewrite to use extract_iter_to_sg(), marked as RFC as not really
tested. This appears to work under basic IO, but I really didn't test
anything that might stress the limits at all...
Please have a look and yell if I did anything that looks too unholy
here!
- Link to v1: https://lore.kernel.org/r/20251210-virtio_trans_iter-v1-1-92eee6d8b6db@codewreck.org
---
net/9p/trans_virtio.c | 211 +++++++++++++-------------------------------------
1 file changed, 52 insertions(+), 159 deletions(-)
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 10c2dd48643818907f4370243eb971fceba4d40b..37b5cf30f8b6b4347ca47b9e840945c838234ecd 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -39,8 +39,6 @@
/* a single mutex to manage channel initialization and attachment */
static DEFINE_MUTEX(virtio_9p_lock);
-static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
-static atomic_t vp_pinned = ATOMIC_INIT(0);
/**
* struct virtio_chan - per-instance transport information
@@ -51,7 +49,6 @@ static atomic_t vp_pinned = ATOMIC_INIT(0);
* @vq: virtio queue associated with this channel
* @ring_bufs_avail: flag to indicate there is some available in the ring buf
* @vc_wq: wait queue for waiting for thing to be added to ring buf
- * @p9_max_pages: maximum number of pinned pages
* @sg: scatter gather list which is used to pack a request (protected?)
* @chan_list: linked list of channels
*
@@ -71,10 +68,6 @@ struct virtio_chan {
struct virtqueue *vq;
int ring_bufs_avail;
wait_queue_head_t *vc_wq;
- /* This is global limit. Since we don't have a global structure,
- * will be placing it in each channel.
- */
- unsigned long p9_max_pages;
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[VIRTQUEUE_NUM];
/**
@@ -202,48 +195,6 @@ static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
return 0;
}
-/**
- * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer,
- * this takes a list of pages.
- * @sg: scatter/gather list to pack into
- * @start: which segment of the sg_list to start at
- * @limit: maximum number of pages in sg list.
- * @pdata: a list of pages to add into sg.
- * @nr_pages: number of pages to pack into the scatter/gather list
- * @offs: amount of data in the beginning of first page _not_ to pack
- * @count: amount of data to pack into the scatter/gather list
- */
-static int
-pack_sg_list_p(struct scatterlist *sg, int start, int limit,
- struct page **pdata, int nr_pages, size_t offs, int count)
-{
- int i = 0, s;
- int data_off = offs;
- int index = start;
-
- BUG_ON(nr_pages > (limit - start));
- /*
- * if the first page doesn't start at
- * page boundary find the offset
- */
- while (nr_pages) {
- s = PAGE_SIZE - data_off;
- if (s > count)
- s = count;
- BUG_ON(index >= limit);
- /* Make sure we don't terminate early. */
- sg_unmark_end(&sg[index]);
- sg_set_page(&sg[index++], pdata[i++], s, data_off);
- data_off = 0;
- count -= s;
- nr_pages--;
- }
-
- if (index-start)
- sg_mark_end(&sg[index - 1]);
- return index - start;
-}
-
/**
* p9_virtio_request - issue a request
* @client: client instance issuing the request
@@ -305,86 +256,15 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
return 0;
}
-static int p9_get_mapped_pages(struct virtio_chan *chan,
- struct page ***pages,
- struct iov_iter *data,
- int count,
- size_t *offs,
- int *need_drop)
-{
- int nr_pages;
- int err;
-
- if (!iov_iter_count(data))
- return 0;
-
- if (!iov_iter_is_kvec(data)) {
- int n;
- /*
- * We allow only p9_max_pages pinned. We wait for the
- * Other zc request to finish here
- */
- if (atomic_read(&vp_pinned) >= chan->p9_max_pages) {
- err = wait_event_killable(vp_wq,
- (atomic_read(&vp_pinned) < chan->p9_max_pages));
- if (err == -ERESTARTSYS)
- return err;
- }
- n = iov_iter_get_pages_alloc2(data, pages, count, offs);
- if (n < 0)
- return n;
- *need_drop = 1;
- nr_pages = DIV_ROUND_UP(n + *offs, PAGE_SIZE);
- atomic_add(nr_pages, &vp_pinned);
- return n;
- } else {
- /* kernel buffer, no need to pin pages */
- int index;
- size_t len;
- void *p;
-
- /* we'd already checked that it's non-empty */
- while (1) {
- len = iov_iter_single_seg_count(data);
- if (likely(len)) {
- p = data->kvec->iov_base + data->iov_offset;
- break;
- }
- iov_iter_advance(data, 0);
- }
- if (len > count)
- len = count;
-
- nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) -
- (unsigned long)p / PAGE_SIZE;
-
- *pages = kmalloc_array(nr_pages, sizeof(struct page *),
- GFP_NOFS);
- if (!*pages)
- return -ENOMEM;
-
- *need_drop = 0;
- p -= (*offs = offset_in_page(p));
- for (index = 0; index < nr_pages; index++) {
- if (is_vmalloc_addr(p))
- (*pages)[index] = vmalloc_to_page(p);
- else
- (*pages)[index] = kmap_to_page(p);
- p += PAGE_SIZE;
- }
- iov_iter_advance(data, len);
- return len;
- }
-}
-
static void handle_rerror(struct p9_req_t *req, int in_hdr_len,
- size_t offs, struct page **pages)
+ size_t offs, struct sg_table *sg_table)
{
unsigned size, n;
void *to = req->rc.sdata + in_hdr_len;
+ struct scatterlist *sg = sg_table->sgl;
// Fits entirely into the static data? Nothing to do.
- if (req->rc.size < in_hdr_len || !pages)
+ if (req->rc.size < in_hdr_len)
return;
// Really long error message? Tough, truncate the reply. Might get
@@ -398,12 +278,39 @@ static void handle_rerror(struct p9_req_t *req, int in_hdr_len,
size = req->rc.size - in_hdr_len;
n = PAGE_SIZE - offs;
if (size > n) {
- memcpy_from_page(to, *pages++, offs, n);
+ memcpy_from_page(to, sg_page(sg), offs, n);
offs = 0;
to += n;
size -= n;
+ if (sg_table->nents < 2)
+ return;
+ sg++;
}
- memcpy_from_page(to, *pages, offs, size);
+ memcpy_from_page(to, sg_page(sg), offs, size);
+}
+
+/**
+ * p9_get_mapped_sg - compute sg_max from @len, allocate @sg_table->sgl
+ * and run extract_iter_to_sg()
+ */
+static ssize_t p9_get_mapped_sg(struct iov_iter *iter, int len,
+ struct sg_table *sg_table)
+{
+ size_t sg_max;
+ ssize_t n;
+
+ sg_max = DIV_ROUND_UP(len, PAGE_SIZE);
+ sg_table->sgl = kcalloc(sg_max, sizeof(*sg_table->sgl), GFP_NOFS);
+ if (!sg_table->sgl)
+ return -ENOMEM;
+
+ n = extract_iter_to_sg(iter, len, sg_table, sg_max, 0);
+
+ WARN_ON(n < 0);
+ if (sg_table->nents > 0)
+ sg_mark_end(&sg_table->sgl[sg_table->nents - 1]);
+
+ return n;
}
/**
@@ -424,10 +331,9 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
{
int in, out, err, out_sgs, in_sgs;
unsigned long flags;
- int in_nr_pages = 0, out_nr_pages = 0;
- struct page **in_pages = NULL, **out_pages = NULL;
struct virtio_chan *chan = client->trans;
struct scatterlist *sgs[4];
+ struct sg_table sg_table = { 0 };
size_t offs = 0;
int need_drop = 0;
int kicked = 0;
@@ -435,14 +341,15 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
p9_debug(P9_DEBUG_TRANS, "virtio request\n");
if (uodata) {
+ ssize_t n;
__le32 sz;
- int n = p9_get_mapped_pages(chan, &out_pages, uodata,
- outlen, &offs, &need_drop);
+
+ n = p9_get_mapped_sg(uodata, outlen, &sg_table);
if (n < 0) {
err = n;
goto err_out;
}
- out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+ need_drop = n > 0 && iov_iter_extract_will_pin(uodata);
if (n != outlen) {
__le32 v = cpu_to_le32(n);
memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
@@ -455,13 +362,14 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
sz = cpu_to_le32(req->tc.size + outlen);
memcpy(&req->tc.sdata[0], &sz, sizeof(sz));
} else if (uidata) {
- int n = p9_get_mapped_pages(chan, &in_pages, uidata,
- inlen, &offs, &need_drop);
+ ssize_t n;
+
+ n = p9_get_mapped_sg(uidata, inlen, &sg_table);
if (n < 0) {
err = n;
goto err_out;
}
- in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
+ need_drop = n > 0 && iov_iter_extract_will_pin(uidata);
if (n != inlen) {
__le32 v = cpu_to_le32(n);
memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
@@ -481,11 +389,8 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
if (out)
sgs[out_sgs++] = chan->sg;
- if (out_pages) {
- sgs[out_sgs++] = chan->sg + out;
- out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
- out_pages, out_nr_pages, offs, outlen);
- }
+ if (uodata && outlen > 0)
+ sgs[out_sgs++] = sg_table.sgl;
/*
* Take care of in data
@@ -499,11 +404,8 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
- if (in_pages) {
- sgs[out_sgs + in_sgs++] = chan->sg + out + in;
- pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
- in_pages, in_nr_pages, offs, inlen);
- }
+ if (uidata && inlen > 0)
+ sgs[out_sgs + in_sgs++] = sg_table.sgl;
BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
@@ -535,27 +437,20 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
READ_ONCE(req->status) >= REQ_STATUS_RCVD);
// RERROR needs reply (== error string) in static data
if (READ_ONCE(req->status) == REQ_STATUS_RCVD &&
- unlikely(req->rc.sdata[4] == P9_RERROR))
- handle_rerror(req, in_hdr_len, offs, in_pages);
+ unlikely(req->rc.sdata[4] == P9_RERROR) &&
+ uidata && inlen > 0)
+ handle_rerror(req, in_hdr_len, offs, &sg_table);
/*
* Non kernel buffers are pinned, unpin them
*/
err_out:
if (need_drop) {
- if (in_pages) {
- p9_release_pages(in_pages, in_nr_pages);
- atomic_sub(in_nr_pages, &vp_pinned);
- }
- if (out_pages) {
- p9_release_pages(out_pages, out_nr_pages);
- atomic_sub(out_nr_pages, &vp_pinned);
- }
- /* wakeup anybody waiting for slots to pin pages */
- wake_up(&vp_wq);
+ /* from extract_user_to_sg() error cleanup code */
+ while (sg_table.nents > 0)
+ unpin_user_page(sg_page(&sg_table.sgl[--sg_table.nents]));
}
- kvfree(in_pages);
- kvfree(out_pages);
+ kfree(sg_table.sgl);
if (!kicked) {
/* reply won't come */
p9_req_put(client, req);
@@ -649,8 +544,6 @@ static int p9_virtio_probe(struct virtio_device *vdev)
}
init_waitqueue_head(chan->vc_wq);
chan->ring_bufs_avail = 1;
- /* Ceiling limit to avoid denial of service attacks */
- chan->p9_max_pages = nr_free_buffer_pages()/4;
virtio_device_ready(vdev);
---
base-commit: 3e281113f871d7f9c69ca55a4d806a72180b7e8a
change-id: 20251210-virtio_trans_iter-5973892db2e3
Best regards,
--
Dominique Martinet <asmadeus@codewreck.org>

On Saturday, 13 December 2025 16:07:40 CET Dominique Martinet via B4 Relay wrote:
> From: Dominique Martinet <asmadeus@codewreck.org>
>
> This simplifies the code quite a bit and should fix issues with
> blowing up when iov_iter points at kmalloc data
>
> RFC - Not really tested yet!!

TBH, that bothers me.

Also considering the huge amount of changes: again, what was actually wrong
with the previously suggested simple patch v1 [1]? All I can see is a
discussion about the term "folio" being misused in the commit log message,
but nothing about the patch being wrong per se.

[1] https://lore.kernel.org/all/20251210-virtio_trans_iter-v1-1-92eee6d8b6db@codewreck.org/

> This brings two major changes to how we've always done things with
> virtio 9p though:
> - We no longer fill in "chan->sg" with user data, but instead allocate a
>   scatterlist; this should be neither a problem nor a slowdown, as the
>   previous code allocated a page list instead; the main difference is
>   that this might eventually lead to lifting the 512KB msize limit, if
>   compatible with virtio?

Remember that I already had a patch set for lifting the msize limit [2],
which was *heavily* tested and will of course break with these changes BTW,
and the reason why I used a custom struct virtqueue_sg instead of the
shared sg_table API was that the latter could not be resized (see the
commit log of patch 3).

[2] https://lore.kernel.org/all/cover.1657920926.git.linux_oss@crudebyte.com/

/Christian

(Chris, sorry, I had dropped you from Ccs as you were Cc'd from the patch
itself and not the header... See [1] for the start of the thread if you're
interested)

[1] https://lore.kernel.org/r/20251214-virtio_trans_iter-v2-1-f7f7072e8c15@codewreck.org

Christian Schoenebeck wrote on Sat, Dec 13, 2025 at 07:02:00PM +0100:
> On Saturday, 13 December 2025 16:07:40 CET Dominique Martinet via B4 Relay wrote:
> > This simplifies the code quite a bit and should fix issues with
> > blowing up when iov_iter points at kmalloc data
> >
> > RFC - Not really tested yet!!
>
> TBH, that bothers me.

Well, that just means "don't bother spending time testing much (or
debugging if you do test and run into bugs)" and it likely won't be merged
as is -- this is enough to start discussions without wasting more time if
this gets refused.

> Also considering the huge amount of changes: again, what was actually
> wrong with the previously suggested simple patch v1 [1]? All I can see
> is a discussion about the term "folio" being misused in the commit log
> message, but nothing about the patch being wrong per se.
>
> [1] https://lore.kernel.org/all/20251210-virtio_trans_iter-v1-1-92eee6d8b6db@codewreck.org/

I agree we're close to a "perfect is the enemy of good" case here, but
while it didn't show up in the discussion I did bring it up in the patch
comments themselves.

My main problem is that I'm pretty sure v1 would break any non-user,
non-kvec iov_iter; at the very least, if we go that route, we should guard
the else branch with iov_iter_is_kvec(), figure out what type of iov Chris
gets and handle that properly -- likely bvec? I still couldn't figure out
how to reproduce :/ -- and error out cleanly in the other cases.
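Something like this at the top of p9_get_mapped_pages() would probably do
-- untested, and assuming user_backed_iter() covers everything
iov_iter_get_pages_alloc2() is willing to pin:

	/* Untested sketch: only user-backed (pinned) and kvec (kernel
	 * buffer) iterators are handled below; error out cleanly on
	 * bvec/xarray/... instead of mis-parsing them as kvecs.
	 */
	if (!user_backed_iter(data) && !iov_iter_is_kvec(data)) {
		WARN_ONCE(1, "9p: unsupported iov_iter type in zc request\n");
		return -EINVAL;
	}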
That's enough work that I figured switching boats wouldn't be much
different, and if nothing else I've learned -a lot- about the kernel
scatterlist, iov_iter and virtio APIs, so we can always say that time
wasn't wasted even if this patch ends up dropped.

The second problem, reading between the lines of the replies, is that
iov_iter_get_pages_alloc2() is more or less broken/not supported for what
we're trying to use it for, and the faster we stop using it the fewer bugs
we'll get.

(It's also really not such a huge patch: the bulk of the change is removed
code we no longer use plus massaging of the cleanup path, and
extract_iter_to_sg() does exactly what we were doing -- looking up pages
and calling sg_set_page() from the iov_iter -- only better. For some
reason, when I added traces, iov_iter_get_pages_alloc2() always stopped at
one page for me?! I thought it was a cache thing but it also happens with
cache=none; I didn't spend time looking as this code will likely go away
one way or another.)

> > This brings two major changes to how we've always done things with
> > virtio 9p though:
> > - We no longer fill in "chan->sg" with user data, but instead allocate
> >   a scatterlist; this should be neither a problem nor a slowdown, as
> >   the previous code allocated a page list instead; the main difference
> >   is that this might eventually lead to lifting the 512KB msize limit,
> >   if compatible with virtio?
>
> Remember that I already had a patch set for lifting the msize limit [2],
> which was *heavily* tested and will of course break with these changes
> BTW, and the reason why I used a custom struct virtqueue_sg instead of
> the shared sg_table API was that the latter could not be resized (see
> the commit log of patch 3).
>
> [2] https://lore.kernel.org/all/cover.1657920926.git.linux_oss@crudebyte.com/

Ugh, I'm sorry, it had completely slipped out of my mind...

And it was waiting for me to debug the RDMA issues, wasn't it :/
FWIW I never heard back from my former employer, and my lack-of-hardware
situation hasn't changed, so we can probably mark RDMA as deprecated, see
who complains, and get them to look into it if they care...

(I'm really sorry about having forgotten, but while I never have much time
for 9p at any given moment, if you don't hear back from me on some subject
you want to push, please do send a reminder after a month or so... It
doesn't excuse my behavior, and if we had any other active maintainer that
might improve the situation, but at least it might prevent such useful
patches from being forgotten while waiting on me)

(... actually, now that I'm done re-reading the patches: we've already
applied patch 10/11 that could impact RDMA, and the rest is specific to
virtio, so there's nothing else that could go wrong with it as far as this
is concerned?...)

OTOH I've learnt a lot about the scatterlist API meanwhile, and I don't
necessarily agree that you had to basically reimplement sg_table just to
chain lists (what you described in patch 3 of that set:

> A custom struct virtqueue_sg is defined instead of using
> shared API struct sg_table, because the latter would not
> allow to resize the table after allocation. sg_append_table
> API OTOH would not fit either, because it requires a list
> of pages beforehand upon allocation. And both APIs only
> support all-or-nothing allocation.

)

Having looked at sg_append_table I agree it doesn't look appropriate, but
I think sg_table would work just fine -- you can call sg_chain() on the
last element of the table. It'll still require some of the helpers you
introduced, but the virtqueue_sg type can go away, and we'd just need to
loop over extract_iter_to_sg() for each of the sg_table "segments".
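Rough, untested sketch of that loop -- SEG_NENTS and p9_extract_chained()
are made-up names, and error unwinding (freeing segments, unpinning pages)
is left out. Note that extract_iter_to_sg() indexes its sg_table linearly
rather than following chain links, so each segment gets its own temporary
table here:

#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/uio.h>

#define SEG_NENTS 128	/* slots per segment; last one reserved for chaining */

static ssize_t p9_extract_chained(struct iov_iter *iter, size_t len,
				  struct scatterlist **sglp)
{
	struct scatterlist *seg, *prev = NULL, *last = NULL;
	ssize_t n, total = 0;

	*sglp = NULL;
	while (len) {
		struct sg_table tmp = {};

		seg = kmalloc_array(SEG_NENTS, sizeof(*seg), GFP_NOFS);
		if (!seg)
			return -ENOMEM;	/* NB: should unwind the chain */
		sg_init_table(seg, SEG_NENTS);
		tmp.sgl = seg;

		/* fill at most SEG_NENTS - 1 entries so the last slot
		 * stays free to become a chain link
		 */
		n = extract_iter_to_sg(iter, len, &tmp, SEG_NENTS - 1, 0);
		if (n <= 0) {
			kfree(seg);	/* nothing extracted into this one */
			if (n < 0)
				return n;	/* ditto: unwind first */
			break;
		}

		if (prev)
			sg_chain(prev, SEG_NENTS, seg);	/* link segments */
		else
			*sglp = seg;
		prev = seg;

		last = &seg[tmp.nents - 1];
		total += n;
		len -= n;
		/* a partial fill means the iterator ran dry */
		if (tmp.nents < SEG_NENTS - 1)
			break;
	}
	if (last)
		sg_mark_end(last);
	return total;
}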
If I'm not mistaken here, then the patches aren't that incompatible, and
it's something worth doing in one order or the other.

Anyway, what now? I'm happy to delay the switch to extract_iter_to_sg()
until after your virtio msize rework if you want to send it again, but I
don't think it's as simple as picking up the v1.

I've honestly spent too long on this for this weekend already, so I'll log
off for now, but if you have any suggestion I'm all ears.

Thanks,
-- 
Dominique Martinet | Asmadeus