From: Yu Kuai <yukuai3@huawei.com>
Include following APIs:
- llbitmap_startwrite
- llbitmap_endwrite
- llbitmap_start_discard
- llbitmap_end_discard
- llbitmap_unplug
- llbitmap_flush
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
drivers/md/md-llbitmap.c | 206 +++++++++++++++++++++++++++++++++++++++
1 file changed, 206 insertions(+)
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
index 4b54aa6fbe40..71234c0ae160 100644
--- a/drivers/md/md-llbitmap.c
+++ b/drivers/md/md-llbitmap.c
@@ -784,6 +784,68 @@ static int llbitmap_read_sb(struct llbitmap *llbitmap)
return ret;
}
+static void llbitmap_pending_timer_fn(struct timer_list *t)
+{
+ struct llbitmap *llbitmap = from_timer(llbitmap, t, pending_timer);
+
+ if (work_busy(&llbitmap->daemon_work)) {
+ pr_warn("daemon_work not finished\n");
+ set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
+ return;
+ }
+
+ queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+}
+
+static void md_llbitmap_daemon_fn(struct work_struct *work)
+{
+ struct llbitmap *llbitmap =
+ container_of(work, struct llbitmap, daemon_work);
+ unsigned long start;
+ unsigned long end;
+ bool restart;
+ int idx;
+
+ if (llbitmap->mddev->degraded)
+ return;
+
+retry:
+ start = 0;
+ end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_SB_SIZE) - 1;
+ restart = false;
+
+ for (idx = 0; idx < llbitmap->nr_pages; idx++) {
+ struct llbitmap_barrier *barrier = &llbitmap->barrier[idx];
+
+ if (idx > 0) {
+ start = end + 1;
+ end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
+ }
+
+ if (!test_bit(LLPageFlush, &barrier->flags) &&
+ time_before(jiffies, barrier->expire)) {
+ restart = true;
+ continue;
+ }
+
+ llbitmap_suspend(llbitmap, idx);
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
+ llbitmap_resume(llbitmap, idx);
+ }
+
+ /*
+ * If the daemon took a long time to finish, retry to prevent missing
+ * clearing dirty bits.
+ */
+ if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
+ goto retry;
+
+ /* If some page is dirty but not expired, setup timer again */
+ if (restart)
+ mod_timer(&llbitmap->pending_timer,
+ jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
+}
+
static int llbitmap_create(struct mddev *mddev)
{
struct llbitmap *llbitmap;
@@ -870,3 +932,147 @@ static void llbitmap_destroy(struct mddev *mddev)
kfree(llbitmap);
mutex_unlock(&mddev->bitmap_info.mutex);
}
+
+static int llbitmap_startwrite(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = offset >> llbitmap->chunkshift;
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
+
+
+ while (page_start <= page_end) {
+ llbitmap_raise_barrier(llbitmap, page_start);
+ page_start++;
+ }
+
+ return 0;
+}
+
+static void llbitmap_endwrite(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = offset >> llbitmap->chunkshift;
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+
+ while (page_start <= page_end) {
+ llbitmap_release_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static int llbitmap_start_discard(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize);
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+
+ llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
+
+ while (page_start <= page_end) {
+ llbitmap_raise_barrier(llbitmap, page_start);
+ page_start++;
+ }
+
+ return 0;
+}
+
+static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
+ unsigned long sectors)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize);
+ unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
+ int page_start = (start + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+ int page_end = (end + BITMAP_SB_SIZE) >> PAGE_SHIFT;
+
+ while (page_start <= page_end) {
+ llbitmap_release_barrier(llbitmap, page_start);
+ page_start++;
+ }
+}
+
+static void llbitmap_unplug_fn(struct work_struct *work)
+{
+ struct llbitmap_unplug_work *unplug_work =
+ container_of(work, struct llbitmap_unplug_work, work);
+ struct llbitmap *llbitmap = unplug_work->llbitmap;
+ struct blk_plug plug;
+ int i;
+
+ blk_start_plug(&plug);
+
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ if (!test_bit(LLPageDirty, &llbitmap->barrier[i].flags) ||
+ !test_and_clear_bit(LLPageDirty, &llbitmap->barrier[i].flags))
+ continue;
+
+ llbitmap_write_page(llbitmap, i);
+ }
+
+ blk_finish_plug(&plug);
+ md_super_wait(llbitmap->mddev);
+ complete(unplug_work->done);
+}
+
+static bool llbitmap_dirty(struct llbitmap *llbitmap)
+{
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++)
+ if (test_bit(LLPageDirty, &llbitmap->barrier[i].flags))
+ return true;
+
+ return false;
+}
+
+static void llbitmap_unplug(struct mddev *mddev, bool sync)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct llbitmap_unplug_work unplug_work = {
+ .llbitmap = llbitmap,
+ .done = &done,
+ };
+
+ if (!llbitmap_dirty(llbitmap))
+ return;
+
+ INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
+ queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&unplug_work.work);
+}
+
+static void llbitmap_flush(struct mddev *mddev)
+{
+ struct llbitmap *llbitmap = mddev->bitmap;
+ struct blk_plug plug;
+ int i;
+
+ for (i = 0; i < llbitmap->nr_pages; i++)
+ set_bit(LLPageFlush, &llbitmap->barrier[i].flags);
+
+ timer_delete_sync(&llbitmap->pending_timer);
+ queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
+ flush_work(&llbitmap->daemon_work);
+
+ blk_start_plug(&plug);
+ for (i = 0; i < llbitmap->nr_pages; i++) {
+ /* mark all bits as dirty */
+ bitmap_fill(llbitmap->barrier[i].dirty, llbitmap->bits_per_page);
+ llbitmap_write_page(llbitmap, i);
+ }
+ blk_finish_plug(&plug);
+ md_super_wait(llbitmap->mddev);
+}
--
2.39.2
On Mon, May 12, 2025 at 09:19:23AM +0800, Yu Kuai wrote:
> +static void llbitmap_unplug(struct mddev *mddev, bool sync)
> +{
> + DECLARE_COMPLETION_ONSTACK(done);
> + struct llbitmap *llbitmap = mddev->bitmap;
> + struct llbitmap_unplug_work unplug_work = {
> + .llbitmap = llbitmap,
> + .done = &done,
> + };
> +
> + if (!llbitmap_dirty(llbitmap))
> + return;
> +
> + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
> + queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
> + wait_for_completion(&done);
> + destroy_work_on_stack(&unplug_work.work);
Why is this deferring the work to a workqueue, but then synchronously
waits on it?
Hi,
在 2025/05/12 13:17, Christoph Hellwig 写道:
> On Mon, May 12, 2025 at 09:19:23AM +0800, Yu Kuai wrote:
>> +static void llbitmap_unplug(struct mddev *mddev, bool sync)
>> +{
>> + DECLARE_COMPLETION_ONSTACK(done);
>> + struct llbitmap *llbitmap = mddev->bitmap;
>> + struct llbitmap_unplug_work unplug_work = {
>> + .llbitmap = llbitmap,
>> + .done = &done,
>> + };
>> +
>> + if (!llbitmap_dirty(llbitmap))
>> + return;
>> +
>> + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
>> + queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
>> + wait_for_completion(&done);
>> + destroy_work_on_stack(&unplug_work.work);
>
> Why is this deferring the work to a workqueue, but then synchronously
> waits on it?
This is the same as old bitmap, by the fact that issue new IO and wait
for such IO to be done from submit_bio() context will deadlock.
1) bitmap bio must be done before this bio can be issued;
2) bitmap bio will be added to current->bio_list, and wait for this bio
to be issued;
Do you have a better solution to this problem?
Thanks,
Kuai
>
> .
>
On Mon, May 12, 2025 at 04:23:55PM +0800, Yu Kuai wrote: >>> + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); >>> + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); >>> + wait_for_completion(&done); >>> + destroy_work_on_stack(&unplug_work.work); >> >> Why is this deferring the work to a workqueue, but then synchronously >> waits on it? > > This is the same as old bitmap, by the fact that issue new IO and wait > for such IO to be done from submit_bio() context will deadlock. > > 1) bitmap bio must be done before this bio can be issued; > 2) bitmap bio will be added to current->bio_list, and wait for this bio > to be issued; > > Do you have a better sulution to this problem? A new block layer API that bypasses bio_list maybe? I.e. export __submit_bio with a better name and a kerneldoc detailing the narrow use case.
On Mon, May 12, 2025 at 03:26:41PM +0200, Christoph Hellwig wrote: > > 1) bitmap bio must be done before this bio can be issued; > > 2) bitmap bio will be added to current->bio_list, and wait for this bio > > to be issued; > > > > Do you have a better sulution to this problem? > > A bew block layer API that bypasses bio_list maybe? I.e. export > __submit_bio with a better name and a kerneldoc detailing the narrow > use case. That won't work as we'd miss a lot of checks, cgroup handling, etc. But maybe a flag to skip the recursion avoidance?
Hi, 在 2025/05/12 21:30, Christoph Hellwig 写道: > On Mon, May 12, 2025 at 03:26:41PM +0200, Christoph Hellwig wrote: >>> 1) bitmap bio must be done before this bio can be issued; >>> 2) bitmap bio will be added to current->bio_list, and wait for this bio >>> to be issued; >>> >>> Do you have a better sulution to this problem? >> >> A bew block layer API that bypasses bio_list maybe? I.e. export >> __submit_bio with a better name and a kerneldoc detailing the narrow >> use case. > > That won't work as we'd miss a lot of checks, cgroup handling, etc. > > But maybe a flag to skip the recursion avoidance? I think this can work, and this can also improve performance. I'll look into this. Thanks, Kuai > > . >
Hi,
在 2025/05/12 21:36, Yu Kuai 写道:
> Hi,
>
> 在 2025/05/12 21:30, Christoph Hellwig 写道:
>> On Mon, May 12, 2025 at 03:26:41PM +0200, Christoph Hellwig wrote:
>>>> 1) bitmap bio must be done before this bio can be issued;
>>>> 2) bitmap bio will be added to current->bio_list, and wait for this bio
>>>> to be issued;
>>>>
>>>> Do you have a better sulution to this problem?
>>>
>>> A bew block layer API that bypasses bio_list maybe? I.e. export
>>> __submit_bio with a better name and a kerneldoc detailing the narrow
>>> use case.
>>
>> That won't work as we'd miss a lot of checks, cgroup handling, etc.
>>
>> But maybe a flag to skip the recursion avoidance?
>
> I think this can work, and this can also improve performance. I'll look
> into this.
So, I did a quick test with old internal bitmap and make sure following
patch can work.
However, for bitmap file case, bio is issued from submit_bh(), I'll have
to change buffer_head code and I'm not sure if we want to do that.
Thanks,
Kuai
diff --git a/block/blk-core.c b/block/blk-core.c
index b862c66018f2..66ced5769694 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -745,7 +745,7 @@ void submit_bio_noacct_nocheck(struct bio *bio)
* to collect a list of requests submited by a ->submit_bio
method while
* it is active, and then process them after it returned.
*/
- if (current->bio_list)
+ if (current->bio_list && !bio_flagged(bio, BIO_STACKED_META))
bio_list_add(¤t->bio_list[0], bio);
else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
__submit_bio_noacct_mq(bio);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 431a3ab2e449..e0cb210a4ea4 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1257,35 +1257,6 @@ static void __bitmap_unplug(struct bitmap *bitmap)
md_bitmap_file_kick(bitmap);
}
-struct bitmap_unplug_work {
- struct work_struct work;
- struct bitmap *bitmap;
- struct completion *done;
-};
-
-static void md_bitmap_unplug_fn(struct work_struct *work)
-{
- struct bitmap_unplug_work *unplug_work =
- container_of(work, struct bitmap_unplug_work, work);
-
- __bitmap_unplug(unplug_work->bitmap);
- complete(unplug_work->done);
-}
-
-static void bitmap_unplug_async(struct bitmap *bitmap)
-{
- DECLARE_COMPLETION_ONSTACK(done);
- struct bitmap_unplug_work unplug_work;
-
- INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn);
- unplug_work.bitmap = bitmap;
- unplug_work.done = &done;
-
- queue_work(md_bitmap_wq, &unplug_work.work);
- wait_for_completion(&done);
- destroy_work_on_stack(&unplug_work.work);
-}
-
static void bitmap_unplug(struct mddev *mddev, bool sync)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -1293,10 +1264,7 @@ static void bitmap_unplug(struct mddev *mddev,
bool sync)
if (!bitmap)
return;
- if (sync)
- __bitmap_unplug(bitmap);
- else
- bitmap_unplug_async(bitmap);
+ __bitmap_unplug(bitmap);
}
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t
offset, int needed);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 32b997dfe6f4..179eabd6e038 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1050,6 +1050,7 @@ void md_super_write(struct mddev *mddev, struct
md_rdev *rdev,
__bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
+ bio_set_flag(bio, BIO_STACKED_META);
if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
test_bit(FailFast, &rdev->flags) &&
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f38425338c3f..88164cdae6aa 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -300,6 +300,7 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write
plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append
operation */
+ BIO_STACKED_META, /* bio is metadata from stacked device */
BIO_FLAG_LAST
};
>
> Thanks,
> Kuai
>
>>
>> .
>>
>
> .
>
On Tue, May 13, 2025 at 02:32:04PM +0800, Yu Kuai wrote: > However, for bitmap file case, bio is issued from submit_bh(), I'll have > to change buffer_head code and I'm not sure if we want to do that. Don't bother with the old code, I'm still hoping we can replace it with your new code ASAP. > + BIO_STACKED_META, /* bio is metadata from stacked device */ I don't think that is the right name and description. The whole point of the recursion avoidance is to support stacked devices by reducing the stack depth. So we clearly need to document why that is not desirable for some very specific cases like reading in metadata needed to process a write I/O. We should also ensure it doesn't propagate to lower stacked devices. In fact I wonder if a better interface would be one to stash away current->bio_list and then restore it after the call, as that would enforce all that.
Hi,
在 2025/05/13 14:48, Christoph Hellwig 写道:
> On Tue, May 13, 2025 at 02:32:04PM +0800, Yu Kuai wrote:
>> However, for bitmap file case, bio is issued from submit_bh(), I'll have
>> to change buffer_head code and I'm not sure if we want to do that.
>
> Don't bother with the old code, I'm still hoping we can replace it with
> your new code ASAP.
Agreed :)
>
>> + BIO_STACKED_META, /* bio is metadata from stacked device */
>
> I don't think that is the right name and description. The whole point
> of the recursion avoidance is to to supported stacked devices by
> reducing the stack depth. So we clearly need to document why that
> is not desirable for some very specific cases like reading in metadata
> needed to process a write I/O. We should also ensure it doesn't
> propagate to lover stacked devices.
>
> In fact I wonder if a better interfaces would be one to stash away
> current->bio_list and then restore it after the call, as that would
> enforce all that.
Yes, following change can work as well.
Just wonder, if the array is created by another array, and which is
created by another array ... In this case, the stack depth can be
huge. :( This is a super weird case, however, should we keep the old code
in this case?
Thanks,
Kuai
static void bitmap_unplug(struct mddev *mddev, bool sync)
{
struct bitmap *bitmap = mddev->bitmap;
+ struct bio_list *bio_list = NULL;
if (!bitmap)
return;
- if (sync)
- __bitmap_unplug(bitmap);
- else
- bitmap_unplug_async(bitmap);
+ if (current->bio_list) {
+ bio_list = current->bio_list;
+ current->bio_list = NULL;
+ }
+
+ __bitmap_unplug(bitmap);
+
+ current->bio_list = bio_list;
}
> .
>
On Tue, May 13, 2025 at 03:14:03PM +0800, Yu Kuai wrote: > Yes, following change can work as well. > > Just wonder, if the array is created by another array, and which is > created by another array ... In this case, the stack depth can be > huge. :( This is super weird case, however, should we keep the old code > in this case? Yeah, that's a good question. Stacking multiple arrays using bitmaps on top of each other is weird. But what if other block remappers start to use this for other remapping and they are stacked? That seems much more likely unfortunately, so maybe we can't go down this route after all, sorry for leading you to it. So instead just write a comment documenting why you switch to a different stack using the workqueue.
Hi, 在 2025/05/13 15:43, Christoph Hellwig 写道: > On Tue, May 13, 2025 at 03:14:03PM +0800, Yu Kuai wrote: >> Yes, following change can work as well. >> >> Just wonder, if the array is created by another array, and which is >> created by another array ... In this case, the stack depth can be >> huge. :( This is super weird case, however, should we keep the old code >> in this case? > > Yeah, that's a good question. Stacking multiple arrays using bitmaps > on top of each other is weird. But what if other block remappers > starting to use this for other remapping and they are stacked? That > seems much more likely unfotunately, so maybe we can't go down this > route after all, sorry for leading you to it. I was thinking about record a stack dev depth in mddev to handle the weird case inside raid. Is there other stack device have the same problem? AFAIK, some dm targets like dm-crypt are using workqueue to handle all IO. I'm still interested because this can improve first write latency. > > So instead just write a comment documenting why you switch to a > different stack using the workqueue. Ok, I'll add comment if we keep using the workqueue. Thanks, Kuai > . >
On Tue, May 13, 2025 at 05:32:13PM +0800, Yu Kuai wrote: > I was thinking about record a stack dev depth in mddev to handle the > weird case inside raid. Is there other stack device have the same > problem? AFAIK, some dm targets like dm-crypt are using workqueue > to handle all IO. I guess anything that might have to read in metadata to serve a data I/O. bcache, dm-snapshot and dm-thinp would be candidates for that, but I haven't checked the implementation. > I'm still interested because this can improve first write latency. > >> >> So instead just write a comment documenting why you switch to a >> different stack using the workqueue. > > Ok, I'll add comment if we keep using the workqueue. Maybe do that for getting the new bitmap code in ASAP and then revisit the above separately?
Hi 于 2025年5月14日 GMT+08:00 13:17:47,Christoph Hellwig <hch@lst.de> 写道: >On Tue, May 13, 2025 at 05:32:13PM +0800, Yu Kuai wrote: >> I was thinking about record a stack dev depth in mddev to handle the >> weird case inside raid. Is there other stack device have the same >> problem? AFAIK, some dm targets like dm-crypt are using workqueue >> to handle all IO. > >I guess anything that might have to read in metadata to serve a >data I/O. bcache, dm-snapshot and dm-thinkp would be candidates for >that, but I haven't checked the implementation. > >> I'm still interested because this can improve first write latency. >> >>> >>> So instead just write a comment documenting why you switch to a >>> different stack using the workqueue. >> >> Ok, I'll add comment if we keep using the workqueue. > >Maybe do that for getting the new bitmap code in ASAP and then >revisit the above separately? > > Sure, sorry that I am in a business travel suddenly. I will get back to this ASAP I return. Thanks Kuai
© 2016 - 2025 Red Hat, Inc.