1 | The following changes since commit 20d6c7312f1b812bb9c750f4087f69ac8485cc90: | 1 | The following changes since commit 3f3bbfc7cef4490c5ed5550766a81e7d18f08db1: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/palmer/tags/riscv-for-master-3.2-part1' into staging (2019-01-03 13:26:30 +0000) | 3 | Merge remote-tracking branch 'remotes/huth-gitlab/tags/pull-request-2019-03-12' into staging (2019-03-12 21:06:26 +0000) |
4 | 4 | ||
5 | are available in the Git repository at: | 5 | are available in the Git repository at: |
6 | 6 | ||
7 | git://github.com/stefanha/qemu.git tags/block-pull-request | 7 | git://github.com/stefanha/qemu.git tags/block-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to 39a0408e768cd00142f5b57d27ab234282bf4df5: | 9 | for you to fetch changes up to f357fcd890a8d6ced6d261338b859a41414561e9: |
10 | 10 | ||
11 | dmg: don't skip zero chunk (2019-01-04 11:15:09 +0000) | 11 | file-posix: add drop-cache=on|off option (2019-03-13 10:54:55 +0000) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | Pull request | 14 | Pull request |
15 | 15 | ||
16 | Bug fixes for the .dmg image file format. | 16 | * Add 'drop-cache=on|off' option to file-posix.c. The default is on. |
17 | Disabling the option fixes a QEMU 3.0.0 performance regression when live | ||
18 | migrating on the same host with cache.direct=off. | ||
17 | 19 | ||
18 | ---------------------------------------------------------------- | 20 | ---------------------------------------------------------------- |
19 | 21 | ||
20 | Julio Faracco (1): | 22 | Stefan Hajnoczi (1): |
21 | dmg: Fixing wrong dmg block type value for block terminator. | 23 | file-posix: add drop-cache=on|off option |
22 | 24 | ||
23 | yuchenlin (3): | 25 | qapi/block-core.json | 6 ++++++ |
24 | dmg: fix binary search | 26 | block/file-posix.c | 16 ++++++++++++++++ |
25 | dmg: use enumeration type instead of hard coding number | 27 | 2 files changed, 22 insertions(+) |
26 | dmg: don't skip zero chunk | ||
27 | |||
28 | block/dmg.c | 31 ++++++++++++++++++++----------- | ||
29 | 1 file changed, 20 insertions(+), 11 deletions(-) | ||
30 | 28 | ||
31 | -- | 29 | -- |
32 | 2.20.1 | 30 | 2.20.1 |
33 | 31 | ||
34 | 32 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Julio Faracco <jcfaracco@gmail.com> | ||
2 | 1 | ||
3 | This is a trivial patch to fix a wrong value for block terminator. | ||
4 | The old value was 0x7fffffff which is wrong. It was not affecting the | ||
5 | code because QEMU dmg block is not handling block terminator right now. | ||
6 | Nevertheless, it should be fixed. | ||
7 | |||
8 | Signed-off-by: Julio Faracco <jcfaracco@gmail.com> | ||
9 | Reviewed-by: yuchenlin <yuchenlin@synology.com> | ||
10 | Message-id: 20181228145055.18039-1-jcfaracco@gmail.com | ||
11 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
12 | --- | ||
13 | block/dmg.c | 2 +- | ||
14 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
15 | |||
16 | diff --git a/block/dmg.c b/block/dmg.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/block/dmg.c | ||
19 | +++ b/block/dmg.c | ||
20 | @@ -XXX,XX +XXX,XX @@ enum { | ||
21 | UDBZ, | ||
22 | ULFO, | ||
23 | UDCM = 0x7ffffffe, /* Comments */ | ||
24 | - UDLE /* Last Entry */ | ||
25 | + UDLE = 0xffffffff /* Last Entry */ | ||
26 | }; | ||
27 | |||
28 | static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) | ||
29 | -- | ||
30 | 2.20.1 | ||
31 | |||
32 | diff view generated by jsdifflib |
1 | From: yuchenlin <npes87184@gmail.com> | 1 | Commit dd577a26ff03b6829721b1ffbbf9e7c411b72378 ("block/file-posix: |
---|---|---|---|
2 | implement bdrv_co_invalidate_cache() on Linux") introduced page cache | ||
3 | invalidation so that cache.direct=off live migration is safe on Linux. | ||
2 | 4 | ||
3 | There is a possible hang in the original binary search implementation. For example, | 5 | The invalidation takes a significant amount of time when the file is |
4 | if chunk1 = 4, chunk2 = 5, chunk3 = 4, and we take the else case. | 6 | large and present in the page cache. Normally this is not the case for |
7 | cross-host live migration but it can happen when migrating between QEMU | ||
8 | processes on the same host. | ||
5 | 9 | ||
6 | chunk1 will still be 4, and so on. | 10 | On same-host migration we don't need to invalidate pages for correctness |
11 | anyway, so an option to skip page cache invalidation is useful. I | ||
12 | investigated optimizing invalidation and detecting same-host migration, | ||
13 | but both are hard to achieve so a user-visible option will suffice. | ||
7 | 14 | ||
8 | Signed-off-by: yuchenlin <npes87184@gmail.com> | 15 | As a bonus this option means that the cache invalidation feature will |
9 | Message-id: 20190103114700.9686-2-npes87184@gmail.com | 16 | now be detectable by libvirt via QMP schema introspection. |
17 | |||
18 | Suggested-by: Neil Skrypuch <neil@tembosocial.com> | ||
19 | Tested-by: Neil Skrypuch <neil@tembosocial.com> | ||
20 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> | ||
21 | Reviewed-by: Eric Blake <eblake@redhat.com> | ||
22 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
23 | Message-id: 20190307164941.3322-1-stefanha@redhat.com | ||
24 | Message-Id: <20190307164941.3322-1-stefanha@redhat.com> | ||
10 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | 25 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> |
11 | --- | 26 | --- |
12 | block/dmg.c | 10 +++++++--- | 27 | qapi/block-core.json | 6 ++++++ |
13 | 1 file changed, 7 insertions(+), 3 deletions(-) | 28 | block/file-posix.c | 16 ++++++++++++++++ |
29 | 2 files changed, 22 insertions(+) | ||
14 | 30 | ||
15 | diff --git a/block/dmg.c b/block/dmg.c | 31 | diff --git a/qapi/block-core.json b/qapi/block-core.json |
16 | index XXXXXXX..XXXXXXX 100644 | 32 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/block/dmg.c | 33 | --- a/qapi/block-core.json |
18 | +++ b/block/dmg.c | 34 | +++ b/qapi/block-core.json |
19 | @@ -XXX,XX +XXX,XX @@ static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) | 35 | @@ -XXX,XX +XXX,XX @@ |
20 | { | 36 | # @locking: whether to enable file locking. If set to 'auto', only enable |
21 | /* binary search */ | 37 | # when Open File Descriptor (OFD) locking API is available |
22 | uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; | 38 | # (default: auto, since 2.10) |
23 | - while (chunk1 != chunk2) { | 39 | +# @drop-cache: invalidate page cache during live migration. This prevents |
24 | + while (chunk1 <= chunk2) { | 40 | +# stale data on the migration destination with cache.direct=off. |
25 | chunk3 = (chunk1 + chunk2) / 2; | 41 | +# Currently only supported on Linux hosts. |
26 | if (s->sectors[chunk3] > sector_num) { | 42 | +# (default: on, since: 4.0) |
27 | - chunk2 = chunk3; | 43 | # @x-check-cache-dropped: whether to check that page cache was dropped on live |
28 | + if (chunk3 == 0) { | 44 | # migration. May cause noticeable delays if the image |
29 | + goto err; | 45 | # file is large, do not use in production. |
30 | + } | 46 | @@ -XXX,XX +XXX,XX @@ |
31 | + chunk2 = chunk3 - 1; | 47 | '*pr-manager': 'str', |
32 | } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { | 48 | '*locking': 'OnOffAuto', |
33 | return chunk3; | 49 | '*aio': 'BlockdevAioOptions', |
34 | } else { | 50 | + '*drop-cache': {'type': 'bool', |
35 | - chunk1 = chunk3; | 51 | + 'if': 'defined(CONFIG_LINUX)'}, |
36 | + chunk1 = chunk3 + 1; | 52 | '*x-check-cache-dropped': 'bool' } } |
53 | |||
54 | ## | ||
55 | diff --git a/block/file-posix.c b/block/file-posix.c | ||
56 | index XXXXXXX..XXXXXXX 100644 | ||
57 | --- a/block/file-posix.c | ||
58 | +++ b/block/file-posix.c | ||
59 | @@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState { | ||
60 | bool page_cache_inconsistent:1; | ||
61 | bool has_fallocate; | ||
62 | bool needs_alignment; | ||
63 | + bool drop_cache; | ||
64 | bool check_cache_dropped; | ||
65 | |||
66 | PRManager *pr_mgr; | ||
67 | @@ -XXX,XX +XXX,XX @@ typedef struct BDRVRawState { | ||
68 | typedef struct BDRVRawReopenState { | ||
69 | int fd; | ||
70 | int open_flags; | ||
71 | + bool drop_cache; | ||
72 | bool check_cache_dropped; | ||
73 | } BDRVRawReopenState; | ||
74 | |||
75 | @@ -XXX,XX +XXX,XX @@ static QemuOptsList raw_runtime_opts = { | ||
76 | .type = QEMU_OPT_STRING, | ||
77 | .help = "id of persistent reservation manager object (default: none)", | ||
78 | }, | ||
79 | +#if defined(__linux__) | ||
80 | + { | ||
81 | + .name = "drop-cache", | ||
82 | + .type = QEMU_OPT_BOOL, | ||
83 | + .help = "invalidate page cache during live migration (default: on)", | ||
84 | + }, | ||
85 | +#endif | ||
86 | { | ||
87 | .name = "x-check-cache-dropped", | ||
88 | .type = QEMU_OPT_BOOL, | ||
89 | @@ -XXX,XX +XXX,XX @@ static int raw_open_common(BlockDriverState *bs, QDict *options, | ||
37 | } | 90 | } |
38 | } | 91 | } |
39 | +err: | 92 | |
40 | return s->n_chunks; /* error */ | 93 | + s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true); |
41 | } | 94 | s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped", |
42 | 95 | false); | |
96 | |||
97 | @@ -XXX,XX +XXX,XX @@ static int raw_reopen_prepare(BDRVReopenState *state, | ||
98 | goto out; | ||
99 | } | ||
100 | |||
101 | + rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true); | ||
102 | rs->check_cache_dropped = | ||
103 | qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false); | ||
104 | |||
105 | @@ -XXX,XX +XXX,XX @@ static void raw_reopen_commit(BDRVReopenState *state) | ||
106 | BDRVRawState *s = state->bs->opaque; | ||
107 | Error *local_err = NULL; | ||
108 | |||
109 | + s->drop_cache = rs->drop_cache; | ||
110 | s->check_cache_dropped = rs->check_cache_dropped; | ||
111 | s->open_flags = rs->open_flags; | ||
112 | |||
113 | @@ -XXX,XX +XXX,XX @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs, | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | + if (!s->drop_cache) { | ||
118 | + return; | ||
119 | + } | ||
120 | + | ||
121 | if (s->open_flags & O_DIRECT) { | ||
122 | return; /* No host kernel page cache */ | ||
123 | } | ||
43 | -- | 124 | -- |
44 | 2.20.1 | 125 | 2.20.1 |
45 | 126 | ||
46 | 127 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: yuchenlin <npes87184@gmail.com> | ||
2 | 1 | ||
3 | Signed-off-by: yuchenlin <npes87184@gmail.com> | ||
4 | Reviewed-by: Julio Faracco <jcfaracco@gmail.com> | ||
5 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
6 | Message-id: 20190103114700.9686-3-npes87184@gmail.com | ||
7 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
8 | --- | ||
9 | block/dmg.c | 4 ++-- | ||
10 | 1 file changed, 2 insertions(+), 2 deletions(-) | ||
11 | |||
12 | diff --git a/block/dmg.c b/block/dmg.c | ||
13 | index XXXXXXX..XXXXXXX 100644 | ||
14 | --- a/block/dmg.c | ||
15 | +++ b/block/dmg.c | ||
16 | @@ -XXX,XX +XXX,XX @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, | ||
17 | |||
18 | /* all-zeroes sector (type 2) does not need to be "uncompressed" and can | ||
19 | * therefore be unbounded. */ | ||
20 | - if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { | ||
21 | + if (s->types[i] != UDIG && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { | ||
22 | error_report("sector count %" PRIu64 " for chunk %" PRIu32 | ||
23 | " is larger than max (%u)", | ||
24 | s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); | ||
25 | @@ -XXX,XX +XXX,XX @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
26 | /* Special case: current chunk is all zeroes. Do not perform a memcpy as | ||
27 | * s->uncompressed_chunk may be too small to cover the large all-zeroes | ||
28 | * section. dmg_read_chunk is called to find s->current_chunk */ | ||
29 | - if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ | ||
30 | + if (s->types[s->current_chunk] == UDIG) { /* all zeroes block entry */ | ||
31 | qemu_iovec_memset(qiov, i * 512, 0, 512); | ||
32 | continue; | ||
33 | } | ||
34 | -- | ||
35 | 2.20.1 | ||
36 | |||
37 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: yuchenlin <npes87184@gmail.com> | ||
2 | 1 | ||
3 | The dmg file has many tables which describe: "start from sector XXX to | ||
4 | sector XXX, the compression method is XXX and where the compressed data | ||
5 | resides". | ||
6 | |||
7 | Each sector in the expanded file should be covered by a table. The table | ||
8 | will describe the offset of the compressed data (or raw data, depending on the type) | ||
9 | in the dmg. | ||
10 | |||
11 | For example: | ||
12 | |||
13 | [-----------The expanded file------------] | ||
14 | [---bzip table ---]/* zeros */[---zlib---] | ||
15 | ^ | ||
16 | | if we want to read this sector. | ||
17 | |||
18 | we will find the bzip table which contains this sector, get the | ||
19 | compressed data offset, read it from the dmg, uncompress it, and finally write it to | ||
20 | the expanded file. | ||
21 | |||
22 | If we skip the zero chunk (table), some sectors cannot find their table, which | ||
23 | causes search_chunk() to return s->n_chunks, dmg_read_chunk() to return -1, | ||
24 | and finally dmg_co_preadv() to return EIO. | ||
25 | |||
26 | See: | ||
27 | |||
28 | [-----------The expanded file------------] | ||
29 | [---bzip table ---]/* zeros */[---zlib---] | ||
30 | ^ | ||
31 | | if we want to read this sector. | ||
32 | |||
33 | Oops, we cannot find the table that contains it... | ||
34 | |||
35 | In the original implementation, we don't have a zero table. When we try to | ||
36 | read a sector inside the zero chunk, we get EIO and skip reading. | ||
37 | |||
38 | After this patch, we treat a zero chunk the same as an ignore chunk: it will | ||
39 | directly write zeros, so that no sector is left without a table. | ||
40 | |||
41 | After this patch: | ||
42 | |||
43 | [-----------The expanded file------------] | ||
44 | [---bzip table ---][--zeros--][---zlib---] | ||
45 | |||
46 | Signed-off-by: yuchenlin <npes87184@gmail.com> | ||
47 | Reviewed-by: Julio Faracco <jcfaracco@gmail.com> | ||
48 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
49 | Message-id: 20190103114700.9686-4-npes87184@gmail.com | ||
50 | Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
51 | --- | ||
52 | block/dmg.c | 19 ++++++++++++------- | ||
53 | 1 file changed, 12 insertions(+), 7 deletions(-) | ||
54 | |||
55 | diff --git a/block/dmg.c b/block/dmg.c | ||
56 | index XXXXXXX..XXXXXXX 100644 | ||
57 | --- a/block/dmg.c | ||
58 | +++ b/block/dmg.c | ||
59 | @@ -XXX,XX +XXX,XX @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, | ||
60 | case UDRW: /* copy */ | ||
61 | uncompressed_sectors = DIV_ROUND_UP(s->lengths[chunk], 512); | ||
62 | break; | ||
63 | - case UDIG: /* zero */ | ||
64 | + case UDZE: /* zero */ | ||
65 | + case UDIG: /* ignore */ | ||
66 | /* as the all-zeroes block may be large, it is treated specially: the | ||
67 | * sector is not copied from a large buffer, a simple memset is used | ||
68 | * instead. Therefore uncompressed_sectors does not need to be set. */ | ||
69 | @@ -XXX,XX +XXX,XX @@ typedef struct DmgHeaderState { | ||
70 | static bool dmg_is_known_block_type(uint32_t entry_type) | ||
71 | { | ||
72 | switch (entry_type) { | ||
73 | + case UDZE: /* zeros */ | ||
74 | case UDRW: /* uncompressed */ | ||
75 | - case UDIG: /* zeroes */ | ||
76 | + case UDIG: /* ignore */ | ||
77 | case UDZO: /* zlib */ | ||
78 | return true; | ||
79 | case UDBZ: /* bzip2 */ | ||
80 | @@ -XXX,XX +XXX,XX @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, | ||
81 | /* sector count */ | ||
82 | s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); | ||
83 | |||
84 | - /* all-zeroes sector (type 2) does not need to be "uncompressed" and can | ||
85 | - * therefore be unbounded. */ | ||
86 | - if (s->types[i] != UDIG && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { | ||
87 | + /* all-zeroes sector (type UDZE and UDIG) does not need to be | ||
88 | + * "uncompressed" and can therefore be unbounded. */ | ||
89 | + if (s->types[i] != UDZE && s->types[i] != UDIG | ||
90 | + && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { | ||
91 | error_report("sector count %" PRIu64 " for chunk %" PRIu32 | ||
92 | " is larger than max (%u)", | ||
93 | s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); | ||
94 | @@ -XXX,XX +XXX,XX @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) | ||
95 | return -1; | ||
96 | } | ||
97 | break; | ||
98 | - case UDIG: /* zero */ | ||
99 | + case UDZE: /* zeros */ | ||
100 | + case UDIG: /* ignore */ | ||
101 | /* see dmg_read, it is treated specially. No buffer needs to be | ||
102 | * pre-filled, the zeroes can be set directly. */ | ||
103 | break; | ||
104 | @@ -XXX,XX +XXX,XX @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, | ||
105 | /* Special case: current chunk is all zeroes. Do not perform a memcpy as | ||
106 | * s->uncompressed_chunk may be too small to cover the large all-zeroes | ||
107 | * section. dmg_read_chunk is called to find s->current_chunk */ | ||
108 | - if (s->types[s->current_chunk] == UDIG) { /* all zeroes block entry */ | ||
109 | + if (s->types[s->current_chunk] == UDZE | ||
110 | + || s->types[s->current_chunk] == UDIG) { /* all zeroes block entry */ | ||
111 | qemu_iovec_memset(qiov, i * 512, 0, 512); | ||
112 | continue; | ||
113 | } | ||
114 | -- | ||
115 | 2.20.1 | ||
116 | |||
117 | diff view generated by jsdifflib |