If RWF_UNCACHED is set for a write, mark new folios being written with
uncached. This is done by passing in the fact that it's an uncached write
through the folio pointer. We can only get there when IOCB_UNCACHED was
allowed, which can only happen if the file system opts in. Opting in means
they need to check for the LSB in the folio pointer to know if it's an
uncached write or not. If it is, then FGP_UNCACHED should be used if
creating new folios is necessary.
Uncached writes will drop any folios they create upon writeback
completion, but leave folios that may exist in that range alone. Since
->write_begin() doesn't currently take any flags, and to avoid needing
to change the callback kernel wide, use the foliop being passed in to
->write_begin() to signal if this is an uncached write or not. File
systems can then use that to mark newly created folios as uncached.
This provides similar benefits to using RWF_UNCACHED with reads. Testing
buffered writes on 32 files:
writing bs 65536, uncached 0
1s: 196035MB/sec
2s: 132308MB/sec
3s: 132438MB/sec
4s: 116528MB/sec
5s: 103898MB/sec
6s: 108893MB/sec
7s: 99678MB/sec
8s: 106545MB/sec
9s: 106826MB/sec
10s: 101544MB/sec
11s: 111044MB/sec
12s: 124257MB/sec
13s: 116031MB/sec
14s: 114540MB/sec
15s: 115011MB/sec
16s: 115260MB/sec
17s: 116068MB/sec
18s: 116096MB/sec
where it's quite obvious where the page cache filled, and performance
dropped from to about half of where it started, settling in at around
115GB/sec. Meanwhile, 32 kswapds were running full steam trying to
reclaim pages.
Running the same test with uncached buffered writes:
writing bs 65536, uncached 1
1s: 198974MB/sec
2s: 189618MB/sec
3s: 193601MB/sec
4s: 188582MB/sec
5s: 193487MB/sec
6s: 188341MB/sec
7s: 194325MB/sec
8s: 188114MB/sec
9s: 192740MB/sec
10s: 189206MB/sec
11s: 193442MB/sec
12s: 189659MB/sec
13s: 191732MB/sec
14s: 190701MB/sec
15s: 191789MB/sec
16s: 191259MB/sec
17s: 190613MB/sec
18s: 191951MB/sec
and the behavior is fully predictable, performing the same throughout
even after the page cache would otherwise have fully filled with dirty
data. It's also about 65% faster, and using half the CPU of the system
compared to the normal buffered write.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
include/linux/fs.h | 5 +++++
include/linux/pagemap.h | 9 +++++++++
mm/filemap.c | 12 +++++++++++-
3 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 45510d0b8de0..122ae821989f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2877,6 +2877,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
if (ret)
return ret;
+ } else if (iocb->ki_flags & IOCB_UNCACHED) {
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+ filemap_fdatawrite_range_kick(mapping, iocb->ki_pos,
+ iocb->ki_pos + count);
}
return count;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d55bf995bd9e..cc02518d338d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -14,6 +14,7 @@
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
+#include <linux/writeback.h>
#include <linux/hugetlb_inline.h>
struct folio_batch;
@@ -70,6 +71,14 @@ static inline int filemap_write_and_wait(struct address_space *mapping)
return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
}
+/*
+ * Value passed in to ->write_begin() if IOCB_UNCACHED is set for the write,
+ * and the ->write_begin() handler on a file system supporting FOP_UNCACHED
+ * must check for this and pass FGP_UNCACHED for folio creation.
+ */
+#define foliop_uncached ((struct folio *) 0xfee1c001)
+#define foliop_is_uncached(foliop) (*(foliop) == foliop_uncached)
+
/**
* filemap_set_wb_err - set a writeback error on an address_space
* @mapping: mapping in which to set writeback error
diff --git a/mm/filemap.c b/mm/filemap.c
index 13815194ed8a..297cb53332ff 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4076,7 +4076,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
ssize_t written = 0;
do {
- struct folio *folio;
+ struct folio *folio = NULL;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
@@ -4104,6 +4104,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
break;
}
+ /*
+ * If IOCB_UNCACHED is set here, we now the file system
+ * supports it. And hence it'll know to check folip for being
+ * set to this magic value. If so, it's an uncached write.
+ * Whenever ->write_begin() changes prototypes again, this
+ * can go away and just pass iocb or iocb flags.
+ */
+ if (iocb->ki_flags & IOCB_UNCACHED)
+ folio = foliop_uncached;
+
status = a_ops->write_begin(file, mapping, pos, bytes,
&folio, &fsdata);
if (unlikely(status < 0))
--
2.45.2
On 2024/11/14 23:25, Jens Axboe wrote: > If RWF_UNCACHED is set for a write, mark new folios being written with > uncached. This is done by passing in the fact that it's an uncached write > through the folio pointer. We can only get there when IOCB_UNCACHED was > allowed, which can only happen if the file system opts in. Opting in means > they need to check for the LSB in the folio pointer to know if it's an > uncached write or not. If it is, then FGP_UNCACHED should be used if > creating new folios is necessary. > > Uncached writes will drop any folios they create upon writeback > completion, but leave folios that may exist in that range alone. Since > ->write_begin() doesn't currently take any flags, and to avoid needing > to change the callback kernel wide, use the foliop being passed in to > ->write_begin() to signal if this is an uncached write or not. File > systems can then use that to mark newly created folios as uncached. > > This provides similar benefits to using RWF_UNCACHED with reads. Testing > buffered writes on 32 files: > > writing bs 65536, uncached 0 > 1s: 196035MB/sec > 2s: 132308MB/sec > 3s: 132438MB/sec > 4s: 116528MB/sec > 5s: 103898MB/sec > 6s: 108893MB/sec > 7s: 99678MB/sec > 8s: 106545MB/sec > 9s: 106826MB/sec > 10s: 101544MB/sec > 11s: 111044MB/sec > 12s: 124257MB/sec > 13s: 116031MB/sec > 14s: 114540MB/sec > 15s: 115011MB/sec > 16s: 115260MB/sec > 17s: 116068MB/sec > 18s: 116096MB/sec > > where it's quite obvious where the page cache filled, and performance > dropped from to about half of where it started, settling in at around > 115GB/sec. Meanwhile, 32 kswapds were running full steam trying to > reclaim pages. > > Running the same test with uncached buffered writes: > > writing bs 65536, uncached 1 > 1s: 198974MB/sec > 2s: 189618MB/sec > 3s: 193601MB/sec > 4s: 188582MB/sec > 5s: 193487MB/sec > 6s: 188341MB/sec > 7s: 194325MB/sec > 8s: 188114MB/sec > 9s: 192740MB/sec > 10s: 189206MB/sec > 11s: 193442MB/sec > 12s: 189659MB/sec > 13s: 191732MB/sec > 14s: 190701MB/sec > 15s: 191789MB/sec > 16s: 191259MB/sec > 17s: 190613MB/sec > 18s: 191951MB/sec > > and the behavior is fully predictable, performing the same throughout > even after the page cache would otherwise have fully filled with dirty > data. It's also about 65% faster, and using half the CPU of the system > compared to the normal buffered write. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > --- > include/linux/fs.h | 5 +++++ > include/linux/pagemap.h | 9 +++++++++ > mm/filemap.c | 12 +++++++++++- > 3 files changed, 25 insertions(+), 1 deletion(-) > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 45510d0b8de0..122ae821989f 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2877,6 +2877,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) > (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); > if (ret) > return ret; > + } else if (iocb->ki_flags & IOCB_UNCACHED) { > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + > + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, > + iocb->ki_pos + count); > } > Hi Jens, The filemap_fdatawrite_range_kick() helper function is not added until the next patch, so you should swap the order of patch 10 and patch 11. Regards, Baokun > return count; > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h > index d55bf995bd9e..cc02518d338d 100644 > --- a/include/linux/pagemap.h > +++ b/include/linux/pagemap.h > @@ -14,6 +14,7 @@ > #include <linux/gfp.h> > #include <linux/bitops.h> > #include <linux/hardirq.h> /* for in_interrupt() */ > +#include <linux/writeback.h> > #include <linux/hugetlb_inline.h> > > struct folio_batch; > @@ -70,6 +71,14 @@ static inline int filemap_write_and_wait(struct address_space *mapping) > return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); > } > > +/* > + * Value passed in to ->write_begin() if IOCB_UNCACHED is set for the write, > + * and the ->write_begin() handler on a file system supporting FOP_UNCACHED > + * must check for this and pass FGP_UNCACHED for folio creation. > + */ > +#define foliop_uncached ((struct folio *) 0xfee1c001) > +#define foliop_is_uncached(foliop) (*(foliop) == foliop_uncached) > + > /** > * filemap_set_wb_err - set a writeback error on an address_space > * @mapping: mapping in which to set writeback error > diff --git a/mm/filemap.c b/mm/filemap.c > index 13815194ed8a..297cb53332ff 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -4076,7 +4076,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) > ssize_t written = 0; > > do { > - struct folio *folio; > + struct folio *folio = NULL; > size_t offset; /* Offset into folio */ > size_t bytes; /* Bytes to write to folio */ > size_t copied; /* Bytes copied from user */ > @@ -4104,6 +4104,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) > break; > } > > + /* > + * If IOCB_UNCACHED is set here, we now the file system > + * supports it. And hence it'll know to check folip for being > + * set to this magic value. If so, it's an uncached write. > + * Whenever ->write_begin() changes prototypes again, this > + * can go away and just pass iocb or iocb flags. > + */ > + if (iocb->ki_flags & IOCB_UNCACHED) > + folio = foliop_uncached; > + > status = a_ops->write_begin(file, mapping, pos, bytes, > &folio, &fsdata); > if (unlikely(status < 0))
On 11/18/24 1:42 AM, Baokun Li wrote: >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index 45510d0b8de0..122ae821989f 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -2877,6 +2877,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) >> (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); >> if (ret) >> return ret; >> + } else if (iocb->ki_flags & IOCB_UNCACHED) { >> + struct address_space *mapping = iocb->ki_filp->f_mapping; >> + >> + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, >> + iocb->ki_pos + count); >> } >> > > Hi Jens, > > The filemap_fdatawrite_range_kick() helper function is not added until > the next patch, so you should swap the order of patch 10 and patch 11. Ah thanks, not sure how I missed that. I'll swap them for the next posting, and also do a basic bisection test just to ensure I did't do more of those... -- Jens Axboe
© 2016 - 2024 Red Hat, Inc.