This series adds support for high order folios in the shmem write path. It is
a continuation of the shmem work from Luis here [1], following Matthew
Wilcox's suggestion [2] regarding the path to take for the folio allocation
order calculation.

[1] RFC v2 add support for blocksize > PAGE_SIZE
https://lore.kernel.org/all/ZHBowMEDfyrAAOWH@bombadil.infradead.org/T/#md3e93ab46ce2ad9254e1eb54ffe71211988b5632
[2] https://lore.kernel.org/all/ZHD9zmIeNXICDaRJ@casper.infradead.org/

The patches have been tested on and sent from next-230911. They also apply
cleanly to the latest next-230914.

fsx and fstests have been run on tmpfs with noswap, with the following
results:
- fsx: 2d test, 21,5B
- fstests: Same result as baseline for next-230911 [3][4][5]

[3] Baseline next-230911 failures are: generic/080 generic/126 generic/193
generic/633 generic/689
[4] fstests logs baseline: https://gitlab.com/-/snippets/3598621
[5] fstests logs patches: https://gitlab.com/-/snippets/3598628

There are at least two cases/topics to handle that I'd appreciate feedback on:
1. With the new strategy, you might end up with a folio order matching
   HPAGE_PMD_ORDER. However, we won't respect the 'huge' flag anymore if
   THP is enabled.
2. When (1.) occurs, the code skips the huge path, so xa_find with hindex
   is skipped.

Daniel

Daniel Gomez (5):
  filemap: make the folio order calculation shareable
  shmem: drop BLOCKS_PER_PAGE macro
  shmem: add order parameter support to shmem_alloc_folio
  shmem: add file length in shmem_get_folio path
  shmem: add large folios support to the write path

Luis Chamberlain (1):
  shmem: account for large order folios

 fs/iomap/buffered-io.c   |  6 ++-
 include/linux/pagemap.h  | 42 ++++++++++++++++---
 include/linux/shmem_fs.h |  2 +-
 mm/filemap.c             |  8 ----
 mm/khugepaged.c          |  2 +-
 mm/shmem.c               | 91 +++++++++++++++++++++++++---------------
 6 files changed, 100 insertions(+), 51 deletions(-)

--
2.39.2
To make the code that clamps the folio order in __filemap_get_folio()
reusable elsewhere, move and merge it into a new subroutine,
mapping_size_order(), called from fgf_set_order(). This way, when encoding
the size for a given index, the calculated order is already valid and ready
to be used once it is retrieved from fgp_flags with FGF_GET_ORDER.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 fs/iomap/buffered-io.c  |  6 ++++--
 include/linux/pagemap.h | 42 ++++++++++++++++++++++++++++++++++++-----
 mm/filemap.c            |  8 --------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index XXXXXXX..XXXXXXX 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -XXX,XX +XXX,XX @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
 {
         fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
+        pgoff_t index = pos >> PAGE_SHIFT;
+        struct address_space *mapping = iter->inode->i_mapping;

         if (iter->flags & IOMAP_NOWAIT)
                 fgp |= FGP_NOWAIT;
-        fgp |= fgf_set_order(len);
+        fgp |= fgf_set_order(mapping, index, len);

-        return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+        return __filemap_get_folio(mapping, index,
                         fgp, mapping_gfp_mask(iter->inode->i_mapping));
 }
 EXPORT_SYMBOL_GPL(iomap_get_folio);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index XXXXXXX..XXXXXXX 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -XXX,XX +XXX,XX @@ typedef unsigned int __bitwise fgf_t;

 #define FGP_WRITEBEGIN        (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

+/**
+ * mapping_size_order - Get maximum folio order for the given file size.
+ * @mapping: Target address_space.
+ * @index: The page index.
+ * @size: The suggested size of the folio to create.
+ *
+ * This returns a high order for folios (when supported) based on the file size
+ * which the mapping currently allows at the given index. The index is relevant
+ * due to alignment considerations the mapping might have. The returned order
+ * may be less than the size passed.
+ *
+ * Return: The order.
+ */
+static inline unsigned int mapping_size_order(struct address_space *mapping,
+                                              pgoff_t index, size_t size)
+{
+        unsigned int order = ilog2(size);
+
+        if ((order <= PAGE_SHIFT) ||
+            (!mapping_large_folio_support(mapping)))
+                return 0;
+        else
+                order = order - PAGE_SHIFT;
+
+        /* If we're not aligned, allocate a smaller folio */
+        if (index & ((1UL << order) - 1))
+                order = __ffs(index);
+
+        order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
+
+        /* Order-1 not supported due to THP dependency */
+        return (order == 1) ? 0 : order;
+}
+
 /**
  * fgf_set_order - Encode a length in the fgf_t flags.
  * @size: The suggested size of the folio to create.
@@ -XXX,XX +XXX,XX @@ typedef unsigned int __bitwise fgf_t;
  * due to alignment constraints, memory pressure, or the presence of
  * other folios at nearby indices.
  */
-static inline fgf_t fgf_set_order(size_t size)
+static inline fgf_t fgf_set_order(struct address_space *mapping, pgoff_t index,
+                                  size_t size)
 {
-        unsigned int shift = ilog2(size);
+        unsigned int order = mapping_size_order(mapping, index, size);

-        if (shift <= PAGE_SHIFT)
-                return 0;
-        return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
+        return (__force fgf_t)(order << 26);
 }

 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
diff --git a/mm/filemap.c b/mm/filemap.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -XXX,XX +XXX,XX @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                         fgp_flags |= FGP_LOCK;

-                if (!mapping_large_folio_support(mapping))
-                        order = 0;
-                if (order > MAX_PAGECACHE_ORDER)
-                        order = MAX_PAGECACHE_ORDER;
-                /* If we're not aligned, allocate a smaller folio */
-                if (index & ((1UL << order) - 1))
-                        order = __ffs(index);
-
                 do {
                         gfp_t alloc_gfp = gfp;

--
2.39.2
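[Editor's note] For intuition on the order calculation above, here is a
minimal standalone sketch of the same clamping logic. This is illustrative
userspace code, not kernel API: it assumes 4 KiB pages (PAGE_SHIFT == 12),
MAX_PAGECACHE_ORDER == 8, and a mapping with large folio support enabled;
size_order() is a hypothetical mirror of mapping_size_order().

        #include <stdio.h>

        #define PAGE_SHIFT          12 /* assumption: 4 KiB pages */
        #define MAX_PAGECACHE_ORDER 8  /* assumption: typical value */

        /* Hypothetical userspace mirror of mapping_size_order(). */
        static unsigned int size_order(unsigned long index, size_t size)
        {
                unsigned int order = 0;

                while ((1UL << (order + 1)) <= size) /* ilog2(size) */
                        order++;
                if (order <= PAGE_SHIFT)
                        return 0;
                order -= PAGE_SHIFT;

                /* If we're not aligned, allocate a smaller folio */
                if (index & ((1UL << order) - 1))
                        order = __builtin_ctzl(index); /* __ffs(index) */

                if (order > MAX_PAGECACHE_ORDER)
                        order = MAX_PAGECACHE_ORDER;

                /* Order-1 not supported due to THP dependency */
                return (order == 1) ? 0 : order;
        }

        int main(void)
        {
                /* 64 KiB at index 16 (order-4 aligned): prints 4. */
                printf("%u\n", size_order(16, 1UL << 16));
                /* 64 KiB at index 18 (only order-1 aligned): __ffs(18)
                 * is 1, and order 1 folds to 0, so this prints 0. */
                printf("%u\n", size_order(18, 1UL << 16));
                return 0;
        }

Note how misalignment of the index, not just a small size, can drop the
order all the way to 0.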
Commit [1] replaced all uses of BLOCKS_PER_PAGE in favor of the generic
PAGE_SECTORS, but the definition was not removed. Drop the now unused macro.

[1] e09764cff44b5 ("shmem: quota support")

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
---
 mm/shmem.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct vfsmount *shm_mnt;

 #include "internal.h"

-#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

 /* Pretend that each entry is of this size in directory's i_size */
--
2.39.2
From: Luis Chamberlain <mcgrof@kernel.org>

shmem uses the shmem_inode_info fields alloced and swapped to account for
allocated pages and swapped pages. In preparation for large order folios,
adjust the accounting to use folio_nr_pages(). This should produce no
functional changes yet, as larger order folios are not yet used or
supported in shmem.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                 pgoff_t start, pgoff_t end)
 {
         XA_STATE(xas, &mapping->i_pages, start);
-        struct page *page;
+        struct folio *folio;
         unsigned long swapped = 0;
         unsigned long max = end - 1;

         rcu_read_lock();
-        xas_for_each(&xas, page, max) {
-                if (xas_retry(&xas, page))
+        xas_for_each(&xas, folio, max) {
+                if (xas_retry(&xas, folio))
                         continue;
-                if (xa_is_value(page))
-                        swapped++;
+                if (xa_is_value(folio))
+                        swapped += (folio_nr_pages(folio));
                 if (xas.xa_index == max)
                         break;
                 if (need_resched()) {
@@ -XXX,XX +XXX,XX @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                         folio = fbatch.folios[i];

                         if (xa_is_value(folio)) {
+                                long swaps_freed;
                                 if (unfalloc)
                                         continue;
-                                nr_swaps_freed += !shmem_free_swap(mapping,
-                                                        indices[i], folio);
+                                swaps_freed = folio_nr_pages(folio);
+                                if (!shmem_free_swap(mapping, indices[i], folio))
+                                        nr_swaps_freed += swaps_freed;
                                 continue;
                         }

@@ -XXX,XX +XXX,XX @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                         folio = fbatch.folios[i];

                         if (xa_is_value(folio)) {
+                                long swaps_freed;
                                 if (unfalloc)
                                         continue;
+                                swaps_freed = folio_nr_pages(folio);
                                 if (shmem_free_swap(mapping, indices[i], folio)) {
                                         /* Swap was replaced by page: retry */
                                         index = indices[i];
                                         break;
                                 }
-                                nr_swaps_freed++;
+                                nr_swaps_freed += swaps_freed;
                                 continue;
                         }

@@ -XXX,XX +XXX,XX @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
         if (add_to_swap_cache(folio, swap,
                         __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
                         NULL) == 0) {
-                shmem_recalc_inode(inode, 0, 1);
+                shmem_recalc_inode(inode, 0, folio_nr_pages(folio));
                 swap_shmem_alloc(swap);
                 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));

@@ -XXX,XX +XXX,XX @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
         struct address_space *mapping = inode->i_mapping;
         swp_entry_t swapin_error;
         void *old;
+        long num_swap_pages;

         swapin_error = make_poisoned_swp_entry();
         old = xa_cmpxchg_irq(&mapping->i_pages, index,
@@ -XXX,XX +XXX,XX @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
                 return;

         folio_wait_writeback(folio);
+        num_swap_pages = folio_nr_pages(folio);
         delete_from_swap_cache(folio);
         /*
          * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
          * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
          * in shmem_evict_inode().
          */
-        shmem_recalc_inode(inode, -1, -1);
+        shmem_recalc_inode(inode, num_swap_pages, num_swap_pages);
         swap_free(swap);
 }

@@ -XXX,XX +XXX,XX @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
         if (error)
                 goto failed;

-        shmem_recalc_inode(inode, 0, -1);
+        shmem_recalc_inode(inode, 0, folio_nr_pages(folio));

         if (sgp == SGP_WRITE)
                 folio_mark_accessed(folio);
@@ -XXX,XX +XXX,XX @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
         if (ret)
                 goto out_delete_from_cache;

-        shmem_recalc_inode(inode, 1, 0);
+        shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
         folio_unlock(folio);
         return 0;
 out_delete_from_cache:
--
2.39.2
In preparation for high order folio support in the write path, add an order
parameter when allocating a folio. This matters on the write path when huge
support is not enabled, or when it is but the huge page allocation fails:
the fallback will take advantage of the order parameter too. Use order 0
for the non-write paths, such as reads or swap-in, as these currently lack
high order folio support.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
 }

 static struct folio *shmem_alloc_folio(gfp_t gfp,
-                struct shmem_inode_info *info, pgoff_t index)
+                struct shmem_inode_info *info, pgoff_t index,
+                unsigned int order)
 {
         struct vm_area_struct pvma;
         struct folio *folio;

         shmem_pseudo_vma_init(&pvma, info, index);
-        folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
+        folio = vma_alloc_folio(gfp, order, &pvma, 0, false);
         shmem_pseudo_vma_destroy(&pvma);

         return folio;
 }

 static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
-                pgoff_t index, bool huge)
+                pgoff_t index, bool huge, unsigned int *order)
 {
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct folio *folio;
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                 huge = false;
-        nr = huge ? HPAGE_PMD_NR : 1;
+        nr = huge ? HPAGE_PMD_NR : 1U << *order;

         err = shmem_inode_acct_block(inode, nr);
         if (err)
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
         if (huge)
                 folio = shmem_alloc_hugefolio(gfp, info, index);
         else
-                folio = shmem_alloc_folio(gfp, info, index);
+                folio = shmem_alloc_folio(gfp, info, index, *order);
         if (folio) {
                 __folio_set_locked(folio);
                 __folio_set_swapbacked(folio);
@@ -XXX,XX +XXX,XX @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
          */
         gfp &= ~GFP_CONSTRAINT_MASK;
         VM_BUG_ON_FOLIO(folio_test_large(old), old);
-        new = shmem_alloc_folio(gfp, info, index);
+        new = shmem_alloc_folio(gfp, info, index, 0);
         if (!new)
                 return -ENOMEM;

@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         int error;
         int once = 0;
         int alloced = 0;
+        unsigned int order = 0;

         if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                 return -EFBIG;
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         huge_gfp = vma_thp_gfp_mask(vma);
         huge_gfp = limit_gfp_mask(huge_gfp, gfp);
-        folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
+        folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true,
+                                           &order);
         if (IS_ERR(folio)) {
 alloc_nohuge:
-                folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
+                folio = shmem_alloc_and_acct_folio(gfp, inode, index, false,
+                                                   &order);
         }
         if (IS_ERR(folio)) {
                 int retry = 5;
@@ -XXX,XX +XXX,XX @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,

         if (!*foliop) {
                 ret = -ENOMEM;
-                folio = shmem_alloc_folio(gfp, info, pgoff);
+                folio = shmem_alloc_folio(gfp, info, pgoff, 0);
                 if (!folio)
                         goto out_unacct_blocks;

--
2.39.2
Pass the file length through the shmem_get_folio() path so the folio order
can be calculated from it when allocation occurs on the write path. Use a
length of 0 for the non-write paths.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/shmem_fs.h |  2 +-
 mm/khugepaged.c          |  2 +-
 mm/shmem.c               | 28 ++++++++++++++++------------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -XXX,XX +XXX,XX @@ enum sgp_type {
 };

 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-                enum sgp_type sgp);
+                enum sgp_type sgp, size_t len);
 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                 pgoff_t index, gfp_t gfp);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -XXX,XX +XXX,XX @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                                 xas_unlock_irq(&xas);
                                 /* swap in or instantiate fallocated page */
                                 if (shmem_get_folio(mapping->host, index,
-                                                &folio, SGP_NOALLOC)) {
+                                                &folio, SGP_NOALLOC, 0)) {
                                         result = SCAN_FAIL;
                                         goto xa_unlocked;
                                 }
diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
          * (although in some cases this is just a waste of time).
          */
         folio = NULL;
-        shmem_get_folio(inode, index, &folio, SGP_READ);
+        shmem_get_folio(inode, index, &folio, SGP_READ, 0);
         return folio;
 }

@@ -XXX,XX +XXX,XX @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                 struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
                 struct vm_area_struct *vma, struct vm_fault *vmf,
-                vm_fault_t *fault_type)
+                vm_fault_t *fault_type, size_t len)
 {
         struct address_space *mapping = inode->i_mapping;
         struct shmem_inode_info *info = SHMEM_I(inode);
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 }

 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-                enum sgp_type sgp)
+                enum sgp_type sgp, size_t len)
 {
         return shmem_get_folio_gfp(inode, index, foliop, sgp,
-                        mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+                        mapping_gfp_mask(inode->i_mapping),
+                        NULL, NULL, NULL, len);
 }

 /*
@@ -XXX,XX +XXX,XX @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
         }

         err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
-                                  gfp, vma, vmf, &ret);
+                                  gfp, vma, vmf, &ret, i_size_read(inode));
         if (err)
                 return vmf_error(err);
         if (folio)
@@ -XXX,XX +XXX,XX @@ shmem_write_begin(struct file *file, struct address_space *mapping,
         struct folio *folio;
         int ret = 0;

+        if (!mapping_large_folio_support(mapping))
+                len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+
         /* i_rwsem is held by caller */
         if (unlikely(info->seals & (F_SEAL_GROW |
                                    F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
@@ -XXX,XX +XXX,XX @@ shmem_write_begin(struct file *file, struct address_space *mapping,
                 return -EPERM;
         }

-        ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+        ret = shmem_get_folio(inode, index, &folio, SGP_WRITE, len);
         if (ret)
                 return ret;

@@ -XXX,XX +XXX,XX @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
                         break;
                 }

-                error = shmem_get_folio(inode, index, &folio, SGP_READ);
+                error = shmem_get_folio(inode, index, &folio, SGP_READ, 0);
                 if (error) {
                         if (error == -EINVAL)
                                 error = 0;
@@ -XXX,XX +XXX,XX @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
                         break;

                 error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
-                                        SGP_READ);
+                                        SGP_READ, 0);
                 if (error) {
                         if (error == -EINVAL)
                                 error = 0;
@@ -XXX,XX +XXX,XX @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                         error = -ENOMEM;
                 else
                         error = shmem_get_folio(inode, index, &folio,
-                                                SGP_FALLOC);
+                                                SGP_FALLOC, 0);
                 if (error) {
                         info->fallocend = undo_fallocend;
                         /* Remove the !uptodate folios we added */
@@ -XXX,XX +XXX,XX @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
                 inode->i_op = &shmem_short_symlink_operations;
         } else {
                 inode_nohighmem(inode);
-                error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
+                error = shmem_get_folio(inode, 0, &folio, SGP_WRITE, 0);
                 if (error)
                         goto out_remove_offset;
                 inode->i_mapping->a_ops = &shmem_aops;
@@ -XXX,XX +XXX,XX @@ static const char *shmem_get_link(struct dentry *dentry,
                         return ERR_PTR(-ECHILD);
                 }
         } else {
-                error = shmem_get_folio(inode, 0, &folio, SGP_READ);
+                error = shmem_get_folio(inode, 0, &folio, SGP_READ, 0);
                 if (error)
                         return ERR_PTR(error);
                 if (!folio)
@@ -XXX,XX +XXX,XX @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,

         BUG_ON(!shmem_mapping(mapping));
         error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
-                        gfp, NULL, NULL, NULL);
+                        gfp, NULL, NULL, NULL, i_size_read(inode));
         if (error)
                 return ERR_PTR(error);

--
2.39.2
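[Editor's note] The write_begin clamp added above deserves a worked example.
This is a small standalone sketch of what len = min_t(size_t, len,
PAGE_SIZE - offset_in_page(pos)) does when the mapping cannot use large
folios; it is my own illustrative code, assuming 4 KiB pages, not shmem's.

        #include <stdio.h>

        #define PAGE_SIZE 4096UL        /* assumption: 4 KiB pages */

        /* Hypothetical mirror of the shmem_write_begin() clamp for
         * mappings without large folio support. */
        static size_t clamp_len(unsigned long long pos, size_t len)
        {
                size_t space = PAGE_SIZE - (pos & (PAGE_SIZE - 1));

                return len < space ? len : space;
        }

        int main(void)
        {
                /* A 10000-byte write at pos 4196 is clamped to the
                 * 3996 bytes left in that page; the write loop comes
                 * back for the remainder, one page at a time. */
                printf("%zu\n", clamp_len(4196, 10000));
                return 0;
        }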
Add large folio support to the shmem write path, matching the same high
order preference mechanism used by the iomap buffered IO path in
__filemap_get_folio(). Use mapping_size_order() to get a hint for the order
of the folio based on the file size; it takes care of the mapping
requirements. Swap does not support high order folios for now, so force
order 0 when swap is enabled.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
 }

 static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
-                pgoff_t index, bool huge, unsigned int *order)
+                pgoff_t index, bool huge, unsigned int *order,
+                struct shmem_sb_info *sbinfo)
 {
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct folio *folio;
         int nr;
         int err;

+        if (!sbinfo->noswap)
+                *order = 0;
+        else
+                *order = (*order == 1) ? 0 : *order;
+
         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                 huge = false;
         nr = huge ? HPAGE_PMD_NR : 1U << *order;
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                 return 0;
         }

+        order = mapping_size_order(inode->i_mapping, index, len);
+
         if (!shmem_is_huge(inode, index, false,
                            vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
                 goto alloc_nohuge;
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         huge_gfp = vma_thp_gfp_mask(vma);
         huge_gfp = limit_gfp_mask(huge_gfp, gfp);
         folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true,
-                                           &order);
+                                           &order, sbinfo);
         if (IS_ERR(folio)) {
 alloc_nohuge:
                 folio = shmem_alloc_and_acct_folio(gfp, inode, index, false,
-                                                   &order);
+                                                   &order, sbinfo);
         }
         if (IS_ERR(folio)) {
                 int retry = 5;
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         if (folio_test_large(folio)) {
                 folio_unlock(folio);
                 folio_put(folio);
+                if (order > 0)
+                        order--;
                 goto alloc_nohuge;
         }
 unlock:
--
2.39.2
This series adds support for high order folios in the shmem write path when
swap is disabled (the noswap option). This is part of the Large Block Size
(LBS) effort [1][2] and a continuation of the shmem work from Luis here
[3], following Matthew Wilcox's suggestion [4] regarding the path to take
for the folio allocation order calculation.

[1] https://kernelnewbies.org/KernelProjects/large-block-size
[2] https://docs.google.com/spreadsheets/d/e/2PACX-1vS7sQfw90S00l2rfOKm83Jlg0px8KxMQE4HHp_DKRGbAGcAV-xu6LITHBEc4xzVh9wLH6WM2lR0cZS8/pubhtml#
[3] RFC v2 add support for blocksize > PAGE_SIZE
https://lore.kernel.org/all/ZHBowMEDfyrAAOWH@bombadil.infradead.org/T/#md3e93ab46ce2ad9254e1eb54ffe71211988b5632
[4] https://lore.kernel.org/all/ZHD9zmIeNXICDaRJ@casper.infradead.org/

fsx and fstests have been run on tmpfs with noswap, with the following
results:

V2:
- fsx: 4,9B
- fstests: Same result as baseline for next-230918.

V1:
- fsx: 2d test, 21,5B
- fstests: Same result as baseline for next-230911 [5][6][7]

The patches have been tested on and sent from next-230918.

[5] Baseline next-230911 failures are: generic/080 generic/126 generic/193
generic/633 generic/689
[6] fstests logs baseline: https://gitlab.com/-/snippets/3598621
[7] fstests logs patches: https://gitlab.com/-/snippets/3598628

Note: because of a next-230918 regression in rmap, patch [8] was applied.

[8] 20230918151729.5A1F4C32796@smtp.kernel.org

Daniel

Changes since v1
* Simplified the order handling code in shmem_get_folio_gfp after Matthew
  Wilcox's review.
* Dropped patch 1/6 [9] and merged the mapping_size_order code directly
  into shmem.
* Added MAX_SHMEM_ORDER to make it explicit that we don't have the same max
  order as in the page cache (MAX_PAGECACHE_ORDER).
* Used HPAGE_PMD_ORDER-1 as MAX_SHMEM_ORDER to respect the huge mount
  option.
* Updated the cover letter: dropped the huge strategy question and added
  more context regarding the LBS project. Added an fsx and fstests summary
  with the new baseline.
* Added the fixes found by Matthew in patch 3/6 [10].
* Fixed the length passed to shmem_get_folio_gfp in shmem_fault and
  shmem_read_folio_gfp (i_size_read -> PAGE_SIZE).
* Added a patch, as suggested by Matthew, to return the number of pages
  freed in shmem_free_swap (instead of an errno). When no pages are freed,
  return 0 (pages). Note: as an alternative, we could embed -ENOENT and
  make use of IS_ERR_VALUE. That approach was discarded because it added
  little value. If that method is preferred, let's discuss it.

[9] filemap: make the folio order calculation shareable
[10] shmem: account for large order folios

Daniel Gomez (5):
  shmem: drop BLOCKS_PER_PAGE macro
  shmem: return freed pages in shmem_free_swap
  shmem: add order parameter support to shmem_alloc_folio
  shmem: add file length in shmem_get_folio path
  shmem: add large folios support to the write path

Luis Chamberlain (1):
  shmem: account for large order folios

 include/linux/shmem_fs.h |   2 +-
 mm/khugepaged.c          |   2 +-
 mm/shmem.c               | 141 ++++++++++++++++++++++++++-------------
 3 files changed, 97 insertions(+), 48 deletions(-)

--
2.39.2
Commit [1] replaced all uses of BLOCKS_PER_PAGE in favor of the generic
PAGE_SECTORS, but the definition was not removed. Drop the now unused macro.

[1] e09764cff44b5 ("shmem: quota support")

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
---
 mm/shmem.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct vfsmount *shm_mnt;

 #include "internal.h"

-#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
 #define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

 /* Pretend that each entry is of this size in directory's i_size */
--
2.39.2
Both shmem_free_swap() callers need the number of pages in the folio after
calling shmem_free_swap(). Make shmem_free_swap() return that value
directly, returning 0 when no pages are freed, to avoid error handling in
the callers' accounting.

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
 /*
  * Remove swap entry from page cache, free the swap and its page cache.
  */
-static int shmem_free_swap(struct address_space *mapping,
+static long shmem_free_swap(struct address_space *mapping,
                            pgoff_t index, void *radswap)
 {
         void *old;

         old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
         if (old != radswap)
-                return -ENOENT;
+                return 0;
+
         free_swap_and_cache(radix_to_swp_entry(radswap));
-        return 0;
+
+        return folio_nr_pages((struct folio *)radswap);
 }

 /*
@@ -XXX,XX +XXX,XX @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                         if (xa_is_value(folio)) {
                                 if (unfalloc)
                                         continue;
-                                nr_swaps_freed += !shmem_free_swap(mapping,
+                                nr_swaps_freed += shmem_free_swap(mapping,
                                                         indices[i], folio);
                                 continue;
                         }

@@ -XXX,XX +XXX,XX @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                         if (xa_is_value(folio)) {
                                 if (unfalloc)
                                         continue;
-                                if (shmem_free_swap(mapping, indices[i], folio)) {
+                                nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio);
+                                if (!nr_swaps_freed) {
                                         /* Swap was replaced by page: retry */
                                         index = indices[i];
                                         break;
                                 }
-                                nr_swaps_freed++;
                                 continue;
                         }

--
2.39.2
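[Editor's note] A word on the calling convention this patch adopts:
returning the freed page count, with 0 standing in for "entry already
gone", lets callers accumulate the result directly instead of testing an
errno. A minimal sketch of that convention, with hypothetical stand-in
names rather than the kernel function:

        #include <stdio.h>

        /* Stand-in for shmem_free_swap(): returns pages freed, or 0
         * when the entry was already replaced. */
        static long free_swap_entry(int still_present, long nr_pages)
        {
                return still_present ? nr_pages : 0;
        }

        int main(void)
        {
                long nr_swaps_freed = 0;
                long freed = free_swap_entry(1, 16); /* order-4 entry */

                if (!freed)
                        puts("swap was replaced by page: retry");
                else
                        nr_swaps_freed += freed;

                printf("freed %ld pages\n", nr_swaps_freed);
                return 0;
        }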
From: Luis Chamberlain <mcgrof@kernel.org>

shmem uses the shmem_inode_info fields alloced and swapped to account for
allocated pages and swapped pages. In preparation for large order folios,
adjust the accounting to use folio_nr_pages(). This should produce no
functional changes yet, as larger order folios are not yet used or
supported in shmem.

Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                 pgoff_t start, pgoff_t end)
 {
         XA_STATE(xas, &mapping->i_pages, start);
-        struct page *page;
+        struct folio *folio;
         unsigned long swapped = 0;
         unsigned long max = end - 1;

         rcu_read_lock();
-        xas_for_each(&xas, page, max) {
-                if (xas_retry(&xas, page))
+        xas_for_each(&xas, folio, max) {
+                if (xas_retry(&xas, folio))
                         continue;
-                if (xa_is_value(page))
-                        swapped++;
+                if (xa_is_value(folio))
+                        swapped += folio_nr_pages(folio);
                 if (xas.xa_index == max)
                         break;
                 if (need_resched()) {
@@ -XXX,XX +XXX,XX @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
         if (add_to_swap_cache(folio, swap,
                         __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
                         NULL) == 0) {
-                shmem_recalc_inode(inode, 0, 1);
+                shmem_recalc_inode(inode, 0, folio_nr_pages(folio));
                 swap_shmem_alloc(swap);
                 shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));

@@ -XXX,XX +XXX,XX @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
         struct address_space *mapping = inode->i_mapping;
         swp_entry_t swapin_error;
         void *old;
+        long num_swap_pages;

         swapin_error = make_poisoned_swp_entry();
         old = xa_cmpxchg_irq(&mapping->i_pages, index,
@@ -XXX,XX +XXX,XX @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
                 return;

         folio_wait_writeback(folio);
+        num_swap_pages = folio_nr_pages(folio);
         delete_from_swap_cache(folio);
         /*
          * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
          * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
          * in shmem_evict_inode().
          */
-        shmem_recalc_inode(inode, -1, -1);
+        shmem_recalc_inode(inode, -num_swap_pages, -num_swap_pages);
         swap_free(swap);
 }

@@ -XXX,XX +XXX,XX @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
         if (error)
                 goto failed;

-        shmem_recalc_inode(inode, 0, -1);
+        shmem_recalc_inode(inode, 0, -folio_nr_pages(folio));

         if (sgp == SGP_WRITE)
                 folio_mark_accessed(folio);
@@ -XXX,XX +XXX,XX @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
         if (ret)
                 goto out_delete_from_cache;

-        shmem_recalc_inode(inode, 1, 0);
+        shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
         folio_unlock(folio);
         return 0;
 out_delete_from_cache:
--
2.39.2
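[Editor's note] To see why the signs and magnitudes matter in this v2 of
the accounting patch, here is a tiny userspace model of the alloced/swapped
deltas. It is illustrative only; the real shmem_recalc_inode() does more
than this (e.g. updating i_blocks and reclaiming):

        #include <assert.h>

        /* Model of the shmem_inode_info counters. */
        struct acct { long alloced, swapped; };

        /* Model of shmem_recalc_inode(inode, alloced, swapped). */
        static void recalc(struct acct *a, long alloced, long swapped)
        {
                a->alloced += alloced;
                a->swapped += swapped;
        }

        int main(void)
        {
                struct acct a = { 0, 0 };
                long nr = 1 << 4;  /* an order-4 folio spans 16 pages */

                recalc(&a, nr, 0);   /* folio instantiated (mfill path) */
                recalc(&a, 0, nr);   /* folio written out (writepage) */
                recalc(&a, 0, -nr);  /* folio swapped back in */
                assert(a.alloced == nr && a.swapped == 0);
                return 0;
        }

With order-0 folios nr is 1 and this reduces to the old +/-1 arithmetic,
which is why the patch is not a functional change yet.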
In preparation for high order folio support in the write path, add an order
parameter when allocating a folio. This matters on the write path when huge
support is not enabled, or when it is but the huge page allocation fails:
the fallback will take advantage of the order parameter too. Use order 0
for the non-write paths, such as reads or swap-in, as these currently lack
high order folio support.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
 }

 static struct folio *shmem_alloc_folio(gfp_t gfp,
-                struct shmem_inode_info *info, pgoff_t index)
+                struct shmem_inode_info *info, pgoff_t index,
+                unsigned int order)
 {
         struct vm_area_struct pvma;
         struct folio *folio;

         shmem_pseudo_vma_init(&pvma, info, index);
-        folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
+        folio = vma_alloc_folio(gfp, order, &pvma, 0, false);
         shmem_pseudo_vma_destroy(&pvma);

         return folio;
 }

 static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
-                pgoff_t index, bool huge)
+                pgoff_t index, bool huge, unsigned int *order)
 {
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct folio *folio;
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
         if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                 huge = false;
-        nr = huge ? HPAGE_PMD_NR : 1;
+        nr = huge ? HPAGE_PMD_NR : 1U << *order;

         err = shmem_inode_acct_block(inode, nr);
         if (err)
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
         if (huge)
                 folio = shmem_alloc_hugefolio(gfp, info, index);
         else
-                folio = shmem_alloc_folio(gfp, info, index);
+                folio = shmem_alloc_folio(gfp, info, index, *order);
         if (folio) {
                 __folio_set_locked(folio);
                 __folio_set_swapbacked(folio);
@@ -XXX,XX +XXX,XX @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
          */
         gfp &= ~GFP_CONSTRAINT_MASK;
         VM_BUG_ON_FOLIO(folio_test_large(old), old);
-        new = shmem_alloc_folio(gfp, info, index);
+        new = shmem_alloc_folio(gfp, info, index, 0);
         if (!new)
                 return -ENOMEM;

@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         int error;
         int once = 0;
         int alloced = 0;
+        unsigned int order = 0;

         if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                 return -EFBIG;
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         huge_gfp = vma_thp_gfp_mask(vma);
         huge_gfp = limit_gfp_mask(huge_gfp, gfp);
-        folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
+        folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true,
+                                           &order);
         if (IS_ERR(folio)) {
 alloc_nohuge:
-                folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
+                folio = shmem_alloc_and_acct_folio(gfp, inode, index, false,
+                                                   &order);
         }
         if (IS_ERR(folio)) {
                 int retry = 5;
@@ -XXX,XX +XXX,XX @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,

         if (!*foliop) {
                 ret = -ENOMEM;
-                folio = shmem_alloc_folio(gfp, info, pgoff);
+                folio = shmem_alloc_folio(gfp, info, pgoff, 0);
                 if (!folio)
                         goto out_unacct_blocks;

--
2.39.2
Pass the file length through the shmem_get_folio() path so the folio order
can be calculated from it when allocation occurs on the write path. Use a
length of 0 for the non-write paths, and PAGE_SIZE for page cache reads and
vm faults.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/shmem_fs.h |  2 +-
 mm/khugepaged.c          |  2 +-
 mm/shmem.c               | 32 ++++++++++++++++++--------------
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index XXXXXXX..XXXXXXX 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -XXX,XX +XXX,XX @@ enum sgp_type {
 };

 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-                enum sgp_type sgp);
+                enum sgp_type sgp, size_t len);
 struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                 pgoff_t index, gfp_t gfp);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -XXX,XX +XXX,XX @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
                                 xas_unlock_irq(&xas);
                                 /* swap in or instantiate fallocated page */
                                 if (shmem_get_folio(mapping->host, index,
-                                                &folio, SGP_NOALLOC)) {
+                                                &folio, SGP_NOALLOC, 0)) {
                                         result = SCAN_FAIL;
                                         goto xa_unlocked;
                                 }
diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
          * (although in some cases this is just a waste of time).
          */
         folio = NULL;
-        shmem_get_folio(inode, index, &folio, SGP_READ);
+        shmem_get_folio(inode, index, &folio, SGP_READ, 0);
         return folio;
 }

@@ -XXX,XX +XXX,XX @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                 struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
                 struct vm_area_struct *vma, struct vm_fault *vmf,
-                vm_fault_t *fault_type)
+                vm_fault_t *fault_type, size_t len)
 {
         struct address_space *mapping = inode->i_mapping;
         struct shmem_inode_info *info = SHMEM_I(inode);
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 }

 int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
-                enum sgp_type sgp)
+                enum sgp_type sgp, size_t len)
 {
         return shmem_get_folio_gfp(inode, index, foliop, sgp,
-                        mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+                        mapping_gfp_mask(inode->i_mapping),
+                        NULL, NULL, NULL, len);
 }

 /*
@@ -XXX,XX +XXX,XX @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
                 spin_unlock(&inode->i_lock);
         }

-        err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
-                                  gfp, vma, vmf, &ret);
+        err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp,
+                                  vma, vmf, &ret, PAGE_SIZE);
         if (err)
                 return vmf_error(err);
         if (folio)
@@ -XXX,XX +XXX,XX @@ shmem_write_begin(struct file *file, struct address_space *mapping,
         struct folio *folio;
         int ret = 0;

+        if (!mapping_large_folio_support(mapping))
+                len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+
         /* i_rwsem is held by caller */
         if (unlikely(info->seals & (F_SEAL_GROW |
                                    F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
@@ -XXX,XX +XXX,XX @@ shmem_write_begin(struct file *file, struct address_space *mapping,
                 return -EPERM;
         }

-        ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+        ret = shmem_get_folio(inode, index, &folio, SGP_WRITE, len);
         if (ret)
                 return ret;

@@ -XXX,XX +XXX,XX @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
                         break;
                 }

-                error = shmem_get_folio(inode, index, &folio, SGP_READ);
+                error = shmem_get_folio(inode, index, &folio, SGP_READ, 0);
                 if (error) {
                         if (error == -EINVAL)
                                 error = 0;
@@ -XXX,XX +XXX,XX @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
                         break;

                 error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
-                                        SGP_READ);
+                                        SGP_READ, 0);
                 if (error) {
                         if (error == -EINVAL)
                                 error = 0;
@@ -XXX,XX +XXX,XX @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                         error = -ENOMEM;
                 else
                         error = shmem_get_folio(inode, index, &folio,
-                                                SGP_FALLOC);
+                                                SGP_FALLOC, 0);
                 if (error) {
                         info->fallocend = undo_fallocend;
                         /* Remove the !uptodate folios we added */
@@ -XXX,XX +XXX,XX @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
                 inode->i_op = &shmem_short_symlink_operations;
         } else {
                 inode_nohighmem(inode);
-                error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
+                error = shmem_get_folio(inode, 0, &folio, SGP_WRITE, 0);
                 if (error)
                         goto out_remove_offset;
                 inode->i_mapping->a_ops = &shmem_aops;
@@ -XXX,XX +XXX,XX @@ static const char *shmem_get_link(struct dentry *dentry,
                         return ERR_PTR(-ECHILD);
                 }
         } else {
-                error = shmem_get_folio(inode, 0, &folio, SGP_READ);
+                error = shmem_get_folio(inode, 0, &folio, SGP_READ, 0);
                 if (error)
                         return ERR_PTR(error);
                 if (!folio)
@@ -XXX,XX +XXX,XX @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
         int error;

         BUG_ON(!shmem_mapping(mapping));
-        error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
-                        gfp, NULL, NULL, NULL);
+        error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL,
+                        NULL, NULL, PAGE_SIZE);
         if (error)
                 return ERR_PTR(error);

--
2.39.2
Add large folio support to the shmem write path, matching the same high
order preference mechanism used by the iomap buffered IO path in
__filemap_get_folio(), with one difference: the maximum order permitted is
HPAGE_PMD_ORDER - 1, to respect the huge mount option when large folios are
supported. Use shmem_mapping_size_order() to get a hint for the order of
the folio based on the file size; it takes care of the mapping
requirements. Swap does not support high order folios for now, so force
order 0 when swap is enabled.

Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 mm/shmem.c | 66 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 14 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index XXXXXXX..XXXXXXX 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -XXX,XX +XXX,XX @@ static struct vfsmount *shm_mnt;
 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */
 #define SHORT_SYMLINK_LEN 128

+/* Like MAX_PAGECACHE_ORDER but respecting huge option */
+#define MAX_SHMEM_ORDER        HPAGE_PMD_ORDER - 1
+
 /*
  * shmem_fallocate communicates with shmem_fault or shmem_writepage via
  * inode->i_private (with i_rwsem making sure that it has only one user at
@@ -XXX,XX +XXX,XX @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
         return folio;
 }

+/**
+ * shmem_mapping_size_order - Get maximum folio order for the given file size.
+ * @mapping: Target address_space.
+ * @index: The page index.
+ * @size: The suggested size of the folio to create.
+ *
+ * This returns a high order for folios (when supported) based on the file size
+ * which the mapping currently allows at the given index. The index is relevant
+ * due to alignment considerations the mapping might have. The returned order
+ * may be less than the size passed.
+ *
+ * Like the __filemap_get_folio order calculation.
+ *
+ * Return: The order.
+ */
+static inline unsigned int
+shmem_mapping_size_order(struct address_space *mapping, pgoff_t index,
+                         size_t size, struct shmem_sb_info *sbinfo)
+{
+        unsigned int order = ilog2(size);
+
+        if ((order <= PAGE_SHIFT) ||
+            (!mapping_large_folio_support(mapping) || !sbinfo->noswap))
+                return 0;
+        else
+                order = order - PAGE_SHIFT;
+
+        /* If we're not aligned, allocate a smaller folio */
+        if (index & ((1UL << order) - 1))
+                order = __ffs(index);
+
+        order = min_t(size_t, order, MAX_SHMEM_ORDER);
+
+        /* Order-1 not supported due to THP dependency */
+        return (order == 1) ? 0 : order;
+}
+
 static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
-                pgoff_t index, bool huge, unsigned int *order)
+                pgoff_t index, unsigned int order)
 {
         struct shmem_inode_info *info = SHMEM_I(inode);
         struct folio *folio;
-        int nr;
-        int err;
-
-        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
-                huge = false;
-        nr = huge ? HPAGE_PMD_NR : 1U << *order;
+        int nr = 1U << order;
+        int err = shmem_inode_acct_block(inode, nr);

-        err = shmem_inode_acct_block(inode, nr);
         if (err)
                 goto failed;

-        if (huge)
+        if (order == HPAGE_PMD_ORDER)
                 folio = shmem_alloc_hugefolio(gfp, info, index);
         else
-                folio = shmem_alloc_folio(gfp, info, index, *order);
+                folio = shmem_alloc_folio(gfp, info, index, order);
         if (folio) {
                 __folio_set_locked(folio);
                 __folio_set_swapbacked(folio);
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                 return 0;
         }

+        order = shmem_mapping_size_order(inode->i_mapping, index, len, sbinfo);
+
         if (!shmem_is_huge(inode, index, false,
                            vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0))
                 goto alloc_nohuge;

         huge_gfp = vma_thp_gfp_mask(vma);
         huge_gfp = limit_gfp_mask(huge_gfp, gfp);
-        folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true,
-                                           &order);
+        folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index,
+                                           HPAGE_PMD_ORDER);
         if (IS_ERR(folio)) {
 alloc_nohuge:
-                folio = shmem_alloc_and_acct_folio(gfp, inode, index, false,
-                                                   &order);
+                folio = shmem_alloc_and_acct_folio(gfp, inode, index, order);
         }
         if (IS_ERR(folio)) {
                 int retry = 5;
@@ -XXX,XX +XXX,XX @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
         if (folio_test_large(folio)) {
                 folio_unlock(folio);
                 folio_put(folio);
+                if (--order == 1)
+                        order = 0;
                 goto alloc_nohuge;
         }
 unlock:
--
2.39.2
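[Editor's note] The retry logic at the end is compact, so here is a sketch
of the order sequence it produces. This is a standalone illustration, not
the kernel loop: each failed large allocation retries one order lower, and
order 1, which large folios cannot use, is folded to 0.

        #include <stdio.h>

        int main(void)
        {
                unsigned int order = 5; /* assume this came from the
                                         * size calculation */

                while (order) {
                        printf("try order %u\n", order);
                        if (--order == 1)   /* order-1 not supported */
                                order = 0;
                }
                printf("try order 0\n");    /* final fallback */
                return 0;
        }

For an initial order of 5, the attempted sequence is 5, 4, 3, 2, 0, ending
at the order-0 allocation that the pre-series code would have started with.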