zblock is a special purpose allocator for storing compressed pages.
It stores an integer number of compressed objects per block. These
blocks consist of several physical pages (2**n, i.e. 1/2/4/8).

With zblock, it is possible to densely arrange objects of various sizes
resulting in low internal fragmentation. Also, this allocator tries to
fill incomplete blocks instead of adding new ones, in many cases
providing a compression ratio comparable to zsmalloc's.

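For illustration, here is a minimal userspace sketch (not part of the
patch; the 4K page size and the slot size below are assumptions) of the
packing arithmetic behind that claim: a block of 2**n pages holds an
integer number of equal-sized slots, and only the per-block remainder
is lost to internal fragmentation:

    /*
     * Illustrative only, not part of the patch: integer slot packing
     * in blocks of 2**order pages (4K pages and slot size assumed).
     */
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long slot_size = 2208;    /* hypothetical slot size */

        for (unsigned int order = 0; order <= 3; order++) {
            unsigned long block_size = PAGE_SIZE << order;  /* 1/2/4/8 pages */
            unsigned long slots = block_size / slot_size;   /* integer count */
            unsigned long waste = block_size - slots * slot_size;

            printf("order %u: %lu slots, %lu bytes unused per block\n",
                   order, slots, waste);
        }
        return 0;
    }
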
zblock is also in most cases superior to zsmalloc with regard to
average performance and worst execution times, thus allowing for better
response time and real-time characteristics of the whole system. E.g.
on a series of stress-ng tests run on a Raspberry Pi 5, we get a 5-10%
higher value for bogo ops/s in the zblock/zsmalloc comparison.

High memory and page migration are currently not supported by zblock.

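As a usage illustration (not part of the patch), this is roughly how a
zpool consumer such as zswap would drive zblock. The zpool_* wrapper
names mirror the zpool_driver callbacks registered by this driver
(create/malloc/obj_write/obj_read_begin/obj_read_end/free); treat the
exact signatures as assumptions about the -mm tree this series targets:

    /* Hedged sketch of a zpool consumer storing and loading one object. */
    #include <linux/zpool.h>
    #include <linux/string.h>
    #include <linux/errno.h>

    static int zblock_demo(void *src, void *dst, void *local_buf, size_t len)
    {
        struct zpool *zpool;
        unsigned long handle;
        void *p;

        zpool = zpool_create_pool("zblock", "demo", GFP_KERNEL);
        if (!zpool)
            return -ENOMEM;

        if (!zpool_malloc(zpool, len, GFP_KERNEL, &handle)) {
            zpool_obj_write(zpool, handle, src, len);           /* store */
            p = zpool_obj_read_begin(zpool, handle, local_buf); /* load */
            memcpy(dst, p, len);
            zpool_obj_read_end(zpool, handle, p);
            zpool_free(zpool, handle);
        }
        zpool_destroy_pool(zpool);
        return 0;
    }
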
Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se>
Signed-off-by: Igor Belousov <igor.b@beldev.am>
---
Changes since v2:
- rebased and tested against the latest -mm tree
- 2 lists for allocated blocks (with/without free slots); see the
  condensed sketch after this changelog
- comments in the code updated per review
- __GFP_HIGHMEM and __GFP_MOVABLE flags are masked for block allocation
- removed redundant helper functions

Changes since v1:
- adaptations to better handle 16K pages
- block table is dropped in favor of a linked list (the list entry on
  top of the list always has empty slots)
- rbtree is created at the initialization phase to speed up the search
  for an appropriate block type

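To make the two-list item concrete, here is the invariant condensed
from mm/zblock.c in the diff below (simplified; the list lock is held
by the callers in the real code):

    /* Claim: take the first active block; it moves to full_list when
     * its last free slot is taken. */
    z = list_first_entry(&blist->active_list, struct zblock_block, link);
    if (--z->free_slots == 0)
        list_move(&z->link, &blist->full_list);

    /* Free: a formerly full block regains a slot and returns to the
     * active list, so lookups never scan past full blocks. */
    if (++z->free_slots == 1)
        list_move_tail(&z->link, &blist->active_list);
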
Performance metrics:

1 Kernel build on a Raspberry Pi 5
Kernel build on a tmpfs with zblock/zsmalloc as zswap backend (LZ4
compression):

1.1 zblock

real 25m53.040s
user 96m43.424s
sys 4m56.652s

real 25m20.748s
user 94m24.324s
sys 4m58.005s

real 25m37.486s
user 95m35.913s
sys 4m55.892s

1.2 zsmalloc

real 26m17.934s
user 97m13.342s
sys 5m2.415s

real 25m50.694s
user 95m22.065s
sys 5m1.305s

real 25m57.714s
user 96m14.675s
sys 4m59.081s

Since zswap is used starting from minute 21, this gives zblock an
average advantage of 9%.

2 stress-ng
Command: stress-ng --vm 4 --vm-bytes 8G --vm-keep --timeout 10m --metrics-brief
Results:
2.1 zblock

stress-ng: metrc: [421] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s
stress-ng: metrc: [421] (secs) (secs) (secs) (real time) (usr+sys time)
stress-ng: metrc: [421] vm 29701982 601.32 1088.63 209.89 49394.38 22873.66

2.2 zsmalloc

stress-ng: metrc: [479] stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s
stress-ng: metrc: [479] (secs) (secs) (secs) (real time) (usr+sys time)
stress-ng: metrc: [479] vm 29561584 600.59 916.38 895.74 49221.11 16313.22

Object file size comparison

1 ARM64
1.1 zblock: 10k
1.2 zsmalloc: 36k

2 RISC-V
2.1 zblock: 12k
2.2 zsmalloc: 77k

Documentation/mm/zblock.rst | 24 ++
MAINTAINERS | 7 +
mm/Kconfig | 12 +
mm/Makefile | 1 +
mm/zblock.c | 436 ++++++++++++++++++++++++++++++++++++
mm/zblock.h | 138 ++++++++++++
6 files changed, 618 insertions(+)
create mode 100644 Documentation/mm/zblock.rst
create mode 100644 mm/zblock.c
create mode 100644 mm/zblock.h

diff --git a/Documentation/mm/zblock.rst b/Documentation/mm/zblock.rst
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/Documentation/mm/zblock.rst
@@ -XXX,XX +XXX,XX @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+zblock
+======
+
...
+L:    linux-mm@kvack.org
+S:    Maintained
+F:    Documentation/mm/zblock.rst
+F:    mm/zblock.[ch]
+
ZD1211RW WIRELESS DRIVER
L:    linux-wireless@vger.kernel.org
S:    Orphan
diff --git a/mm/Kconfig b/mm/Kconfig
index XXXXXXX..XXXXXXX 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -XXX,XX +XXX,XX @@ config ZSWAP_ZPOOL_DEFAULT
default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
default ""

+config ZBLOCK
+    tristate "Fast compression allocator with high density"
+    depends on ZPOOL
+    help
+     A special purpose allocator for storing compressed pages.
+     It stores an integer number of same-size compressed objects
+     per block. These blocks consist of several physical pages
+     (2**n, i.e. 1/2/4/8).
+
+     With zblock, it is possible to densely arrange objects of
+     various sizes resulting in low internal fragmentation.
+
config ZSMALLOC
    tristate
    prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
diff --git a/mm/Makefile b/mm/Makefile
index XXXXXXX..XXXXXXX 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -XXX,XX +XXX,XX @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL)    += zpool.o
+obj-$(CONFIG_ZBLOCK)    += zblock.o
obj-$(CONFIG_ZSMALLOC)    += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA)    += cma.o
diff --git a/mm/zblock.c b/mm/zblock.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/mm/zblock.c
...
+#include <linux/zpool.h>
+#include "zblock.h"
+
+static struct rb_root block_desc_tree = RB_ROOT;
+
+/*
+ * Find a block with at least one free slot and claim it.
+ * We make sure that the first block, if it exists, will always work.
+ */
+static inline struct zblock_block *find_and_claim_block(struct block_list *blist)
+{
+    struct list_head *l = &blist->active_list;
+
+    if (!list_empty(l)) {
+        struct zblock_block *z = list_first_entry(l, typeof(*z), link);
+
+        if (--z->free_slots == 0)
+            list_move(&z->link, &blist->full_list);
+        return z;
+    }
+    return NULL;
+}
+
+/* Encodes the handle of a particular slot in the pool using metadata */
+static inline unsigned long metadata_to_handle(struct zblock_block *block,
+                unsigned int block_type, unsigned int slot)
...
+    memset(&block->slot_info, 0, sizeof(block->slot_info));
+    set_bit(0, block->slot_info);
+    *handle = metadata_to_handle(block, block_type, 0);
+
+    spin_lock(&block_list->lock);
+    list_add(&block->link, &block_list->active_list);
+    block_list->block_count++;
+    spin_unlock(&block_list->lock);
+    return block;
+}
+
...
+
+    /* init each block list */
+    for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
+        block_list = &pool->block_lists[i];
+        spin_lock_init(&block_list->lock);
+        INIT_LIST_HEAD(&block_list->full_list);
+        INIT_LIST_HEAD(&block_list->active_list);
+        block_list->block_count = 0;
+    }
+    return pool;
+}
+
...
+        return -EINVAL;
+
+    block_list = &pool->block_lists[block_type];
+
+    spin_lock(&block_list->lock);
+    block = find_and_claim_block(block_list);
+    spin_unlock(&block_list->lock);
+    if (block)
+        goto found;
+
+    /* no block with free slots found, try to allocate a new empty block */
+    block = alloc_block(pool, block_type, gfp & ~(__GFP_MOVABLE | __GFP_HIGHMEM), handle);
+    return block ? 0 : -ENOMEM;
+
+found:
+    /*
+     * If we got here, there is a slot in the block claimed for us
+     * by find_and_claim_block(). Find that slot and set the busy bit.
+     */
+    for (slot = find_first_zero_bit(block->slot_info,
+                    block_desc[block_type].slots_per_block);
+     slot < block_desc[block_type].slots_per_block;
+     slot = find_next_zero_bit(block->slot_info,
+                    block_desc[block_type].slots_per_block,
...
+
+    spin_lock(&block_list->lock);
+    /* if all slots in the block are empty, delete the whole block */
+    if (++block->free_slots == block_desc[block_type].slots_per_block) {
+        block_list->block_count--;
+        list_del(&block->link);
+        spin_unlock(&block_list->lock);
+        free_pages((unsigned long)block, block_desc[block_type].order);
+        return;
+    } else if (block->free_slots == 1)
+        list_move_tail(&block->link, &block_list->active_list);
+    clear_bit(slot, block->slot_info);
+    spin_unlock(&block_list->lock);
+}
+
+/**
+ * zblock_map() - maps the allocation associated with the given handle
+ * @pool:    pool in which the allocation resides
...
+static void zblock_unmap(struct zblock_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zblock_write() - write to the memory area defined by handle
+ * @pool:    pool in which the allocation resides
+ * @handle:    handle associated with the allocation
+ * @handle_mem: pointer to source memory block
+ * @mem_len:    length of the memory block to write
+ */
+static void zblock_write(struct zblock_pool *pool, unsigned long handle,
+             void *handle_mem, size_t mem_len)
+{
+    unsigned int block_type, slot;
+    struct zblock_block *block;
+    unsigned long offs;
+    void *p;
+
+    block = handle_to_metadata(handle, &block_type, &slot);
+    offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block_type].slot_size;
+    p = (void *)block + offs;
+    memcpy(p, handle_mem, mem_len);
+}
+
+/**
+ * zblock_get_total_pages() - gets the zblock pool size in pages
+ * @pool:    pool being queried
+ *
+ * Returns: size in pages of the given pool.
+ */
...
+static void zblock_zpool_free(void *pool, unsigned long handle)
+{
+    zblock_free(pool, handle);
+}
+
+static void *zblock_zpool_read_begin(void *pool, unsigned long handle,
+                void *local_copy)
+{
+    return zblock_map(pool, handle);
+}
+
+static void zblock_zpool_obj_write(void *pool, unsigned long handle,
+                void *handle_mem, size_t mem_len)
+{
+    zblock_write(pool, handle, handle_mem, mem_len);
+}
+
+static void zblock_zpool_read_end(void *pool, unsigned long handle,
+                void *handle_mem)
+{
+    zblock_unmap(pool, handle);
+}
+
+static u64 zblock_zpool_total_pages(void *pool)
+{
+    return zblock_get_total_pages(pool);
+}
+
+static struct zpool_driver zblock_zpool_driver = {
+    .type =            "zblock",
+    .owner =        THIS_MODULE,
+    .create =        zblock_zpool_create,
+    .destroy =        zblock_zpool_destroy,
+    .malloc =        zblock_zpool_malloc,
+    .free =            zblock_zpool_free,
+    .obj_read_begin =    zblock_zpool_read_begin,
+    .obj_read_end =        zblock_zpool_read_end,
+    .obj_write =        zblock_zpool_obj_write,
+    .total_pages =        zblock_zpool_total_pages,
+};
+
+MODULE_ALIAS("zpool-zblock");
+
+static void delete_rbtree(void)
...
+};
+
+/**
+ * struct block_list - stores metadata of a particular list
+ * lock:        protects the lists of blocks
+ * active_list:        linked list of active (non-full) blocks
+ * full_list:        linked list of full blocks
+ * block_count:        total number of blocks in the list
+ */
+struct block_list {
+    spinlock_t lock;
+    struct list_head active_list;
+    struct list_head full_list;
+    unsigned long block_count;
+};
+
+/**
+ * struct zblock_pool - stores metadata for each zblock pool
...