From: Yu Kuai <yukuai3@huawei.com>
As discussed [1], holding RCU across the copy of data from/to the page
is too heavy. It's better to protect only the page lookup with RCU and
then grab a reference to prevent the page from being freed by a
concurrent discard.
[1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
drivers/block/brd.c | 56 +++++++++++++++++++++++----------------------
1 file changed, 29 insertions(+), 27 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 0c2eabe14af3..ce311d054cab 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -44,45 +44,55 @@ struct brd_device {
 };
 
 /*
- * Look up and return a brd's page for a given sector.
+ * Look up and return a brd's page with a reference grabbed for a given sector.
  */
 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
 {
-	return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+	struct page *page;
+
+	rcu_read_lock();
+	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
+	if (page)
+		get_page(page);
+	rcu_read_unlock();
+
+	return page;
 }
 
 /*
  * Insert a new page for a given sector, if one does not already exist.
+ * A reference is held on the returned page.
  */
 static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
 				    blk_opf_t opf)
-	__releases(rcu)
-	__acquires(rcu)
 {
 	gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
 	struct page *page, *ret;
 
-	rcu_read_unlock();
 	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
-	if (!page) {
-		rcu_read_lock();
+	if (!page)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	xa_lock(&brd->brd_pages);
 	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
 			   page, gfp);
-	rcu_read_lock();
-	if (ret) {
+	if (!ret) {
+		brd->brd_nr_pages++;
+		get_page(page);
 		xa_unlock(&brd->brd_pages);
-		__free_page(page);
-		if (xa_is_err(ret))
-			return ERR_PTR(xa_err(ret));
+		return page;
+	}
+
+	if (!xa_is_err(ret)) {
+		get_page(ret);
+		xa_unlock(&brd->brd_pages);
+		put_page(page);
 		return ret;
 	}
-	brd->brd_nr_pages++;
+
 	xa_unlock(&brd->brd_pages);
-	return page;
+	put_page(page);
+	return ERR_PTR(xa_err(ret));
 }
 
 /*
@@ -95,7 +105,7 @@ static void brd_free_pages(struct brd_device *brd)
 	pgoff_t idx;
 
 	xa_for_each(&brd->brd_pages, idx, page) {
-		__free_page(page);
+		put_page(page);
 		cond_resched();
 	}
 
@@ -117,7 +127,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 
 	bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
 
-	rcu_read_lock();
 	page = brd_lookup_page(brd, sector);
 	if (!page && op_is_write(opf)) {
 		page = brd_insert_page(brd, sector, opf);
@@ -135,13 +144,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 			memset(kaddr, 0, bv.bv_len);
 	}
 	kunmap_local(kaddr);
-	rcu_read_unlock();
 
 	bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+	if (page)
+		put_page(page);
 	return true;
 
 out_error:
-	rcu_read_unlock();
 	if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
 		bio_wouldblock_error(bio);
 	else
@@ -149,13 +158,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
 	return false;
 }
 
-static void brd_free_one_page(struct rcu_head *head)
-{
-	struct page *page = container_of(head, struct page, rcu_head);
-
-	__free_page(page);
-}
-
 static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
 {
 	sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
@@ -170,7 +172,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
 	while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
 		page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
 		if (page) {
-			call_rcu(&page->rcu_head, brd_free_one_page);
+			put_page(page);
 			brd->brd_nr_pages--;
 		}
 		aligned_sector += PAGE_SECTORS;
--
2.39.2
Hi,

On 7/29/2025 3:09 PM, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> As discussed [1], holding RCU across the copy of data from/to the page
> is too heavy. It's better to protect only the page lookup with RCU and
> then grab a reference to prevent the page from being freed by a
> concurrent discard.
>
> [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

[...]

> static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
> {
> -	return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> +	struct page *page;
> +
> +	rcu_read_lock();
> +	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
> +	if (page)
> +		get_page(page);

get_page_unless_zero() instead? Er, it seems even get_page_unless_zero()
is not enough, because the page may be reused. Maybe we need to re-read
the brd_pages xarray to ensure the page is still there (see the
interleaving sketched at the end of this mail).

> +	rcu_read_unlock();
> +
> +	return page;
> }

[...]

> 	xa_lock(&brd->brd_pages);
> 	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
> 			   page, gfp);
> -	rcu_read_lock();
> -	if (ret) {
> +	if (!ret) {
> +		brd->brd_nr_pages++;
> +		get_page(page);
> 		xa_unlock(&brd->brd_pages);

If I understand correctly, the initial ref-count after alloc_page() is 1
instead of 0, so I think the get_page(page) here and the get_page(ret)
below are unnecessary and will lead to a memory leak.
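To spell out the reuse concern above: with call_rcu() removed, the RCU
read-side critical section only keeps the xarray nodes alive, not the
page itself, so a parallel discard can drop the last reference before
the reader reaches get_page(). If I understand the patch correctly, an
interleaving like the following is possible:

CPU 0: brd_rw_bvec()              CPU 1: brd_do_discard()
  rcu_read_lock()
  page = xa_load(...)
                                    xa_lock(&brd->brd_pages)
                                    __xa_erase(...)
                                    put_page(page)
                                    /* last ref dropped: page is freed
                                     * and may be reallocated */
                                    xa_unlock(&brd->brd_pages)
  get_page(page)
  /* bumps the refcount of a page that brd no longer owns */
  rcu_read_unlock()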
Hi,

On 2025/07/29 15:52, Hou Tao wrote:
> On 7/29/2025 3:09 PM, Yu Kuai wrote:

[...]

>> +	rcu_read_lock();
>> +	page = xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
>> +	if (page)
>> +		get_page(page);
>
> get_page_unless_zero() instead? Er, it seems even get_page_unless_zero()
> is not enough, because the page may be reused. Maybe we need to re-read
> the brd_pages xarray to ensure the page is still there.

Yes, you're right. Following the pagecache procedures, I'll switch to
xas_load() + xas_reload() in the next version.
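Something like the following untested sketch, modeled on the pagecache
lookup in filemap_get_entry() (the exact shape of the retry loop is
illustrative):

static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT);
	struct page *page;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	page = xas_load(&xas);
	if (xas_retry(&xas, page))
		goto repeat;
	if (!page)
		goto out;
	/* A concurrent discard may have dropped the last reference. */
	if (!get_page_unless_zero(page))
		goto repeat;
	/*
	 * The page may have been freed and reused between xas_load() and
	 * the refcount bump; verify it is still in the tree and drop the
	 * stale reference otherwise.
	 */
	if (unlikely(page != xas_reload(&xas))) {
		put_page(page);
		goto repeat;
	}
out:
	rcu_read_unlock();
	return page;
}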
Thanks,
Kuai