From: Daniel Gomez <da.gomez@samsung.com>
Add large folio support for the shmem write and fallocate paths, matching
the high-order preference mechanism that the iomap buffered IO path uses,
as implemented in __filemap_get_folio().
Add shmem_mapping_size_order() to get a hint for the folio order based on
the file size, taking the mapping's requirements into account.
If the top-level huge page setting (controlled by '/sys/kernel/mm/transparent_hugepage/shmem_enabled')
is enabled, we only allow PMD-sized THP, to keep the interface backward
compatible.
Co-developed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
mm/shmem.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 48 insertions(+), 3 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 0613421e09e7..6dece90ff421 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1672,6 +1672,36 @@ bool shmem_hpage_pmd_enabled(void)
return false;
}
+/**
+ * shmem_mapping_size_order - Get a folio order hint from a size and an index.
+ * @mapping: Target address_space.
+ * @index: The page index.
+ * @size: The suggested size of the folio to create, in bytes.
+ *
+ * The starting order is get_order() of @size, with @size first raised to at
+ * least PAGE_SIZE. If @index is not aligned to that order, the order becomes
+ * __ffs(@index) instead. The result is capped at MAX_PAGECACHE_ORDER.
+ *
+ * NOTE(review): get_order() reportedly rounds up, unlike the ilog2()-based
+ * fgf_set_order() this is modelled on - confirm the intended rounding.
+ *
+ * Return: 0 if @mapping does not support large folios, otherwise the order.
+ */
+static inline unsigned int
+shmem_mapping_size_order(struct address_space *mapping, pgoff_t index, size_t size)
+{
+ unsigned int order = get_order(max_t(size_t, size, PAGE_SIZE));
+
+ if (!mapping_large_folio_support(mapping))
+ return 0;
+
+ /* index is not aligned to 'order': use a smaller, index-derived order */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
+
+ return min_t(size_t, order, MAX_PAGECACHE_ORDER);
+}
+
unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
loff_t write_end, bool shmem_huge_force)
@@ -1694,11 +1724,26 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
global_huge = shmem_huge_global_enabled(inode, index, write_end,
shmem_huge_force, vma, vm_flags);
if (!vma || !vma_is_anon_shmem(vma)) {
+ size_t len;
+
+ /*
+ * For tmpfs, if top level huge page is enabled, we just allow
+ * PMD sized THP to keep interface backward compatibility.
+ */
+ if (global_huge)
+ return BIT(HPAGE_PMD_ORDER);
+
+ if (!write_end)
+ return 0;
+
/*
- * For tmpfs, we now only support PMD sized THP if huge page
- * is enabled, otherwise fallback to order 0.
+ * Otherwise, get a highest order hint based on the size of
+ * write and fallocate paths, then will try each allowable
+ * huge orders.
*/
- return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
+ len = write_end - (index << PAGE_SHIFT);
+ order = shmem_mapping_size_order(inode->i_mapping, index, len);
+ return order > 0 ? BIT(order + 1) - 1 : 0;
}
/*
--
2.39.3
On Thu, Sep 26, 2024 at 04:27:26PM +0800, Baolin Wang wrote: > +static inline unsigned int > +shmem_mapping_size_order(struct address_space *mapping, pgoff_t index, size_t size) > +{ > + unsigned int order = get_order(max_t(size_t, size, PAGE_SIZE)); Why introduce the max_t() call here? Did nobody read the documentation or implementation for get_order() before writing this patch? Besides, get_order() is wrong (at least relative to other filesystems). get_order() rounds up instead of down, so what should we do for a write() of size 512 * 1024 + 1 byte? Other filesystems allocate an order-8 folio plus an order-0 folio. This code would have us allocate an order-9 folio. I think that's a bad idea.
On 9/26/2024 2:16 PM, Matthew Wilcox wrote: > On Thu, Sep 26, 2024 at 04:27:26PM +0800, Baolin Wang wrote: >> +static inline unsigned int >> +shmem_mapping_size_order(struct address_space *mapping, pgoff_t index, size_t size) >> +{ >> + unsigned int order = get_order(max_t(size_t, size, PAGE_SIZE)); > > Why introduce the max_t() call here? Did nobody read the documentation > or implementation for get_order() before writing this patch? get_order() result is undefined if the size is 0. I've used max_t() here to avoid that case. Perhaps we should prevent that case before getting here? > > Besides, get_order() is wrong (at least relative to other filesystems). > get_order() rounds up instead of down, so what should we do for a write() > of size 512 * 1024 + 1 byte? Other filesystems allocate an order-8 folio > plus an order-0 folio. This code would have us allocate an order-9 folio. > I think that's a bad idea. > I think one of my earlier attempts was to use fgf_set_order + FGF_GET_ORDER() as in iomap. But the solution taken there was to share code between shmem and filemap and that wasn't considered a good idea. Shall we just replicate iomap_get_folio()? Or else, what do you suggest here? Daniel
On Thu, Sep 26, 2024 at 02:58:31PM +0200, Daniel Gomez wrote: > On 9/26/2024 2:16 PM, Matthew Wilcox wrote: > > On Thu, Sep 26, 2024 at 04:27:26PM +0800, Baolin Wang wrote: > > > +static inline unsigned int > > > +shmem_mapping_size_order(struct address_space *mapping, pgoff_t index, size_t size) > > > +{ > > > + unsigned int order = get_order(max_t(size_t, size, PAGE_SIZE)); > > > > Why introduce the max_t() call here? Did nobody read the documentation > > or implementation for get_order() before writing this patch? > > get_order() result is undefined if the size is 0. I've used max_t() here to > avoid that case. Perhaps should we prevent that case before getting here? Surely we've handled a length-0 write before we get here? > I think one of my earlier attemps was to use fgf_set_order + FGF_GET_ORDER() > as in iomap. But the solution taken there was to share code between shmem > and filemap and that wasn't considered a good idea. Shall we just replicate > iomap_get_folio()? Or else, what do you suggest here? We could move three of the four lines from fgf_set_order() into a new function and call it from both fgf_set_order() and shmem? >
On 2024/9/26 21:40, Matthew Wilcox wrote: > On Thu, Sep 26, 2024 at 02:58:31PM +0200, Daniel Gomez wrote: >> On 9/26/2024 2:16 PM, Matthew Wilcox wrote: >>> On Thu, Sep 26, 2024 at 04:27:26PM +0800, Baolin Wang wrote: >>>> +static inline unsigned int >>>> +shmem_mapping_size_order(struct address_space *mapping, pgoff_t index, size_t size) >>>> +{ >>>> + unsigned int order = get_order(max_t(size_t, size, PAGE_SIZE)); >>> >>> Why introduce the max_t() call here? Did nobody read the documentation >>> or implementation for get_order() before writing this patch? >> >> get_order() result is undefined if the size is 0. I've used max_t() here to >> avoid that case. Perhaps should we prevent that case before getting here? > > Surely we've handled a length-0 write before we get here? > >> I think one of my earlier attemps was to use fgf_set_order + FGF_GET_ORDER() >> as in iomap. But the solution taken there was to share code between shmem >> and filemap and that wasn't considered a good idea. Shall we just replicate >> iomap_get_folio()? Or else, what do you suggest here? > > We could move three of the four lines from fgf_set_order() into a > new function and call it from both fgf_set_order() and shmem? Sounds good. How about the following changes? Do you have a preferred name for the new helper? Thanks. diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9c7edb6422b..ce418acd2737 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -629,6 +629,16 @@ typedef unsigned int __bitwise fgf_t; #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) +static inline unsigned int filemap_get_order(size_t size) +{ + unsigned int shift = ilog2(size); + + if (shift <= PAGE_SHIFT) + return 0; + + return shift - PAGE_SHIFT; +} + /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. 
@@ -642,11 +652,11 @@ typedef unsigned int __bitwise fgf_t; */ static inline fgf_t fgf_set_order(size_t size) { - unsigned int shift = ilog2(size); + unsigned int order = filemap_get_order(size); - if (shift <= PAGE_SHIFT) + if (!order) return 0; - return (__force fgf_t)((shift - PAGE_SHIFT) << 26); + return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
© 2016 - 2024 Red Hat, Inc.