On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
this to drop page cache on the destination host during shared storage
migration. This way the destination host will read the latest copy of
the data and will not use stale data from the page cache.
The flow is as follows:
1. Source host writes out all dirty pages and inactivates drives.
2. QEMU_VM_EOF is sent on migration stream.
3. Destination host invalidates caches before accessing drives.
This patch enables live migration even with -drive cache.direct=off.
* Terms and conditions may apply, please see patch for details.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/block/file-posix.c b/block/file-posix.c
index 3794c0007a..df4f52919f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
return ret | BDRV_BLOCK_OFFSET_VALID;
}
+static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
+ Error **errp)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "The file descriptor is not open");
+ return;
+ }
+
+ if (s->open_flags & O_DIRECT) {
+ return; /* No host kernel page cache */
+ }
+
+#if defined(__linux__)
+ /* This sets the scene for the next syscall... */
+ ret = bdrv_co_flush(bs);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "flush failed");
+ return;
+ }
+
+ /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
+ * process. These limitations are okay because we just fsynced the file,
+ * we don't use mmap, and the file should not be in use by other processes.
+ */
+ ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
+ if (ret != 0) { /* the return value is a positive errno */
+ error_setg_errno(errp, ret, "fadvise failed");
+ return;
+ }
+#endif /* __linux__ */
+}
+
static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
int64_t offset, int bytes,
BlockCompletionFunc *cb, void *opaque)
@@ -2328,6 +2364,7 @@ BlockDriver bdrv_file = {
.bdrv_co_create_opts = raw_co_create_opts,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
.bdrv_co_block_status = raw_co_block_status,
+ .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
.bdrv_co_preadv = raw_co_preadv,
@@ -2805,6 +2842,7 @@ static BlockDriver bdrv_host_device = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = hdev_co_create_opts,
.create_opts = &raw_create_opts,
+ .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
.bdrv_co_preadv = raw_co_preadv,
@@ -2927,6 +2965,7 @@ static BlockDriver bdrv_host_cdrom = {
.bdrv_reopen_abort = raw_reopen_abort,
.bdrv_co_create_opts = hdev_co_create_opts,
.create_opts = &raw_create_opts,
+ .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
.bdrv_co_preadv = raw_co_preadv,
--
2.14.3
On Thu, 04/19 15:52, Stefan Hajnoczi wrote:
> On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> this to drop page cache on the destination host during shared storage
> migration. This way the destination host will read the latest copy of
> the data and will not use stale data from the page cache.
>
> The flow is as follows:
>
> 1. Source host writes out all dirty pages and inactivates drives.
> 2. QEMU_VM_EOF is sent on migration stream.
> 3. Destination host invalidates caches before accessing drives.
>
> This patch enables live migration even with -drive cache.direct=off.
>
> * Terms and conditions may apply, please see patch for details.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 39 insertions(+)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 3794c0007a..df4f52919f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> return ret | BDRV_BLOCK_OFFSET_VALID;
> }
>
> +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> + Error **errp)
> +{
> + BDRVRawState *s = bs->opaque;
> + int ret;
> +
> + ret = fd_open(bs);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "The file descriptor is not open");
> + return;
> + }
> +
> + if (s->open_flags & O_DIRECT) {
> + return; /* No host kernel page cache */
> + }
> +
> +#if defined(__linux__)
> + /* This sets the scene for the next syscall... */
> + ret = bdrv_co_flush(bs);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "flush failed");
> + return;
> + }
> +
> + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> + * process. These limitations are okay because we just fsynced the file,
> + * we don't use mmap, and the file should not be in use by other processes.
> + */
> + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
> + if (ret != 0) { /* the return value is a positive errno */
> + error_setg_errno(errp, ret, "fadvise failed");
> + return;
> + }
> +#endif /* __linux__ */
What about the #else branch? It doesn't automatically work, I guess?
Fam
> +}
> +
> static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
> int64_t offset, int bytes,
> BlockCompletionFunc *cb, void *opaque)
> @@ -2328,6 +2364,7 @@ BlockDriver bdrv_file = {
> .bdrv_co_create_opts = raw_co_create_opts,
> .bdrv_has_zero_init = bdrv_has_zero_init_1,
> .bdrv_co_block_status = raw_co_block_status,
> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
>
> .bdrv_co_preadv = raw_co_preadv,
> @@ -2805,6 +2842,7 @@ static BlockDriver bdrv_host_device = {
> .bdrv_reopen_abort = raw_reopen_abort,
> .bdrv_co_create_opts = hdev_co_create_opts,
> .create_opts = &raw_create_opts,
> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>
> .bdrv_co_preadv = raw_co_preadv,
> @@ -2927,6 +2965,7 @@ static BlockDriver bdrv_host_cdrom = {
> .bdrv_reopen_abort = raw_reopen_abort,
> .bdrv_co_create_opts = hdev_co_create_opts,
> .create_opts = &raw_create_opts,
> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>
>
> .bdrv_co_preadv = raw_co_preadv,
> --
> 2.14.3
>
>
On Thu, Apr 19, 2018 at 04:13:44PM +0800, Fam Zheng wrote:
> On Thu, 04/19 15:52, Stefan Hajnoczi wrote:
> > On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> > this to drop page cache on the destination host during shared storage
> > migration. This way the destination host will read the latest copy of
> > the data and will not use stale data from the page cache.
> >
> > The flow is as follows:
> >
> > 1. Source host writes out all dirty pages and inactivates drives.
> > 2. QEMU_VM_EOF is sent on migration stream.
> > 3. Destination host invalidates caches before accessing drives.
> >
> > This patch enables live migration even with -drive cache.direct=off.
> >
> > * Terms and conditions may apply, please see patch for details.
> >
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> > block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 39 insertions(+)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 3794c0007a..df4f52919f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> > return ret | BDRV_BLOCK_OFFSET_VALID;
> > }
> >
> > +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> > + Error **errp)
> > +{
> > + BDRVRawState *s = bs->opaque;
> > + int ret;
> > +
> > + ret = fd_open(bs);
> > + if (ret < 0) {
> > + error_setg_errno(errp, -ret, "The file descriptor is not open");
> > + return;
> > + }
> > +
> > + if (s->open_flags & O_DIRECT) {
> > + return; /* No host kernel page cache */
> > + }
> > +
> > +#if defined(__linux__)
> > + /* This sets the scene for the next syscall... */
> > + ret = bdrv_co_flush(bs);
> > + if (ret < 0) {
> > + error_setg_errno(errp, -ret, "flush failed");
> > + return;
> > + }
> > +
> > + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> > + * process. These limitations are okay because we just fsynced the file,
> > + * we don't use mmap, and the file should not be in use by other processes.
> > + */
> > + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
> > + if (ret != 0) { /* the return value is a positive errno */
> > + error_setg_errno(errp, ret, "fadvise failed");
> > + return;
> > + }
> > +#endif /* __linux__ */
>
> What about the #else branch? It doesn't automatically work, I guess?
Right, no error is reported. This is existing QEMU behavior.
If we want to change behavior then it must be done consistently (i.e. by
auditing the other block drivers) and we need to be prepared for bug
reports (just like file locking, it may expose interesting use cases
that we cannot easily dismiss as wrong). I didn't want to go there.
If there is consensus then I will change the behavior.
Stefan
On Fri, 04/20 11:15, Stefan Hajnoczi wrote:
> On Thu, Apr 19, 2018 at 04:13:44PM +0800, Fam Zheng wrote:
> > On Thu, 04/19 15:52, Stefan Hajnoczi wrote:
> > > On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> > > this to drop page cache on the destination host during shared storage
> > > migration. This way the destination host will read the latest copy of
> > > the data and will not use stale data from the page cache.
> > >
> > > The flow is as follows:
> > >
> > > 1. Source host writes out all dirty pages and inactivates drives.
> > > 2. QEMU_VM_EOF is sent on migration stream.
> > > 3. Destination host invalidates caches before accessing drives.
> > >
> > > This patch enables live migration even with -drive cache.direct=off.
> > >
> > > * Terms and conditions may apply, please see patch for details.
> > >
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > ---
> > > block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 39 insertions(+)
> > >
> > > diff --git a/block/file-posix.c b/block/file-posix.c
> > > index 3794c0007a..df4f52919f 100644
> > > --- a/block/file-posix.c
> > > +++ b/block/file-posix.c
> > > @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> > > return ret | BDRV_BLOCK_OFFSET_VALID;
> > > }
> > >
> > > +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> > > + Error **errp)
> > > +{
> > > + BDRVRawState *s = bs->opaque;
> > > + int ret;
> > > +
> > > + ret = fd_open(bs);
> > > + if (ret < 0) {
> > > + error_setg_errno(errp, -ret, "The file descriptor is not open");
> > > + return;
> > > + }
> > > +
> > > + if (s->open_flags & O_DIRECT) {
> > > + return; /* No host kernel page cache */
> > > + }
> > > +
> > > +#if defined(__linux__)
> > > + /* This sets the scene for the next syscall... */
> > > + ret = bdrv_co_flush(bs);
> > > + if (ret < 0) {
> > > + error_setg_errno(errp, -ret, "flush failed");
> > > + return;
> > > + }
> > > +
> > > + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> > > + * process. These limitations are okay because we just fsynced the file,
> > > + * we don't use mmap, and the file should not be in use by other processes.
> > > + */
> > > + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
> > > + if (ret != 0) { /* the return value is a positive errno */
> > > + error_setg_errno(errp, ret, "fadvise failed");
> > > + return;
> > > + }
> > > +#endif /* __linux__ */
> >
> > What about the #else branch? It doesn't automatically work, I guess?
>
> Right, no error is reported. This is existing QEMU behavior.
>
> If we want to change behavior then it must be done consistently (i.e. by
> auditing the other block drivers) and we need to be prepared for bug
> reports (just like file locking, it may expose interesting use cases
> that we cannot easily dismiss as wrong). I didn't want to go there.
>
> If there is consensus then I will change the behavior.
No need to change behavior (reporting error), at least not in this patch. But a
#else
/* TODO: ... */
#endif
as a reminder to add similar code to invalidate the system cache on other *nix
systems cannot hurt. E.g., the BSDs have posix_fadvise() too, though I have no
idea if POSIX_FADV_DONTNEED works the same way there.
(I'm sure there are some tricks on Windows too, but do we care? :)
Fam
Am 20.04.2018 um 05:15 hat Stefan Hajnoczi geschrieben:
> On Thu, Apr 19, 2018 at 04:13:44PM +0800, Fam Zheng wrote:
> > On Thu, 04/19 15:52, Stefan Hajnoczi wrote:
> > > On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> > > this to drop page cache on the destination host during shared storage
> > > migration. This way the destination host will read the latest copy of
> > > the data and will not use stale data from the page cache.
> > >
> > > The flow is as follows:
> > >
> > > 1. Source host writes out all dirty pages and inactivates drives.
> > > 2. QEMU_VM_EOF is sent on migration stream.
> > > 3. Destination host invalidates caches before accessing drives.
> > >
> > > This patch enables live migration even with -drive cache.direct=off.
> > >
> > > * Terms and conditions may apply, please see patch for details.
> > >
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > ---
> > > block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 39 insertions(+)
> > >
> > > diff --git a/block/file-posix.c b/block/file-posix.c
> > > index 3794c0007a..df4f52919f 100644
> > > --- a/block/file-posix.c
> > > +++ b/block/file-posix.c
> > > @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> > > return ret | BDRV_BLOCK_OFFSET_VALID;
> > > }
> > >
> > > +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> > > + Error **errp)
> > > +{
> > > + BDRVRawState *s = bs->opaque;
> > > + int ret;
> > > +
> > > + ret = fd_open(bs);
> > > + if (ret < 0) {
> > > + error_setg_errno(errp, -ret, "The file descriptor is not open");
> > > + return;
> > > + }
> > > +
> > > + if (s->open_flags & O_DIRECT) {
> > > + return; /* No host kernel page cache */
> > > + }
> > > +
> > > +#if defined(__linux__)
> > > + /* This sets the scene for the next syscall... */
> > > + ret = bdrv_co_flush(bs);
> > > + if (ret < 0) {
> > > + error_setg_errno(errp, -ret, "flush failed");
> > > + return;
> > > + }
> > > +
> > > + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> > > + * process. These limitations are okay because we just fsynced the file,
> > > + * we don't use mmap, and the file should not be in use by other processes.
> > > + */
> > > + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
> > > + if (ret != 0) { /* the return value is a positive errno */
> > > + error_setg_errno(errp, ret, "fadvise failed");
> > > + return;
> > > + }
> > > +#endif /* __linux__ */
> >
> > What about the #else branch? It doesn't automatically work, I guess?
>
> Right, no error is reported. This is existing QEMU behavior.
>
> If we want to change behavior then it must be done consistently (i.e. by
> auditing the other block drivers) and we need to be prepared for bug
> reports (just like file locking, it may expose interesting use cases
> that we cannot easily dismiss as wrong). I didn't want to go there.
>
> If there is consensus then I will change the behavior.
I think either way that would be for a separate patch.
I'm also not sure how useful that change would actually be because it
might give you a false sense of safety: Even with this patch, you still
need to be exactly aware of the conditions that make live migration with
shared storage work correctly. If we error out on some unsafe cases,
but not on others, this might be confusing.
On the other hand, the problematic image format drivers have been
setting migration blockers for a long time, so you could also argue that
file-posix is inconsistent with them because it completely ignores
unsafe scenarios.
Kevin
* Stefan Hajnoczi (stefanha@redhat.com) wrote:
> On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> this to drop page cache on the destination host during shared storage
> migration. This way the destination host will read the latest copy of
> the data and will not use stale data from the page cache.
>
> The flow is as follows:
>
> 1. Source host writes out all dirty pages and inactivates drives.
> 2. QEMU_VM_EOF is sent on migration stream.
> 3. Destination host invalidates caches before accessing drives.
>
> This patch enables live migration even with -drive cache.direct=off.
>
> * Terms and conditions may apply, please see patch for details.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 39 insertions(+)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 3794c0007a..df4f52919f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> return ret | BDRV_BLOCK_OFFSET_VALID;
> }
>
> +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> + Error **errp)
> +{
> + BDRVRawState *s = bs->opaque;
> + int ret;
> +
> + ret = fd_open(bs);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "The file descriptor is not open");
> + return;
> + }
> +
> + if (s->open_flags & O_DIRECT) {
> + return; /* No host kernel page cache */
> + }
> +
> +#if defined(__linux__)
> + /* This sets the scene for the next syscall... */
> + ret = bdrv_co_flush(bs);
> + if (ret < 0) {
> + error_setg_errno(errp, -ret, "flush failed");
> + return;
> + }
> +
> + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> + * process. These limitations are okay because we just fsynced the file,
> + * we don't use mmap, and the file should not be in use by other processes.
> + */
> + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
What happens if I try a migrate between two qemu's on the same host?
(Which I, and avocado, both use for testing; I think users
occasionally do for QEMU updates).
Dave
> + if (ret != 0) { /* the return value is a positive errno */
> + error_setg_errno(errp, ret, "fadvise failed");
> + return;
> + }
> +#endif /* __linux__ */
> +}
> +
> static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
> int64_t offset, int bytes,
> BlockCompletionFunc *cb, void *opaque)
> @@ -2328,6 +2364,7 @@ BlockDriver bdrv_file = {
> .bdrv_co_create_opts = raw_co_create_opts,
> .bdrv_has_zero_init = bdrv_has_zero_init_1,
> .bdrv_co_block_status = raw_co_block_status,
> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
>
> .bdrv_co_preadv = raw_co_preadv,
> @@ -2805,6 +2842,7 @@ static BlockDriver bdrv_host_device = {
> .bdrv_reopen_abort = raw_reopen_abort,
> .bdrv_co_create_opts = hdev_co_create_opts,
> .create_opts = &raw_create_opts,
> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>
> .bdrv_co_preadv = raw_co_preadv,
> @@ -2927,6 +2965,7 @@ static BlockDriver bdrv_host_cdrom = {
> .bdrv_reopen_abort = raw_reopen_abort,
> .bdrv_co_create_opts = hdev_co_create_opts,
> .create_opts = &raw_create_opts,
> + .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>
>
> .bdrv_co_preadv = raw_co_preadv,
> --
> 2.14.3
>
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
On Thu, Apr 19, 2018 at 10:18:33AM +0100, Dr. David Alan Gilbert wrote:
> * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> > this to drop page cache on the destination host during shared storage
> > migration. This way the destination host will read the latest copy of
> > the data and will not use stale data from the page cache.
> >
> > The flow is as follows:
> >
> > 1. Source host writes out all dirty pages and inactivates drives.
> > 2. QEMU_VM_EOF is sent on migration stream.
> > 3. Destination host invalidates caches before accessing drives.
> >
> > This patch enables live migration even with -drive cache.direct=off.
> >
> > * Terms and conditions may apply, please see patch for details.
> >
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> > block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 39 insertions(+)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 3794c0007a..df4f52919f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> > return ret | BDRV_BLOCK_OFFSET_VALID;
> > }
> >
> > +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> > + Error **errp)
> > +{
> > + BDRVRawState *s = bs->opaque;
> > + int ret;
> > +
> > + ret = fd_open(bs);
> > + if (ret < 0) {
> > + error_setg_errno(errp, -ret, "The file descriptor is not open");
> > + return;
> > + }
> > +
> > + if (s->open_flags & O_DIRECT) {
> > + return; /* No host kernel page cache */
> > + }
> > +
> > +#if defined(__linux__)
> > + /* This sets the scene for the next syscall... */
> > + ret = bdrv_co_flush(bs);
> > + if (ret < 0) {
> > + error_setg_errno(errp, -ret, "flush failed");
> > + return;
> > + }
> > +
> > + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> > + * process. These limitations are okay because we just fsynced the file,
> > + * we don't use mmap, and the file should not be in use by other processes.
> > + */
> > + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
>
> What happens if I try a migrate between two qemu's on the same host?
> (Which I, and avocado, both use for testing; I think users
> occasionally do for QEMU updates).
The steps quoted from the commit description:
1. Source host writes out all dirty pages and inactivates drives.
2. QEMU_VM_EOF is sent on migration stream.
3. Destination host invalidates caches before accessing drives.
When we reach Step 3 the source QEMU is not doing I/O (no pages are
locked). The destination QEMU does bdrv_co_flush() so even if pages are
still dirty (that shouldn't happen since the source already drained and
flushed) they will be written out and pages will be clean. Therefore
fadvise really invalidates all resident pages.
FWIW when writing this patch I tested with both QEMUs on the same host.
Stefan
Am 20.04.2018 um 05:21 hat Stefan Hajnoczi geschrieben:
> On Thu, Apr 19, 2018 at 10:18:33AM +0100, Dr. David Alan Gilbert wrote:
> > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > On Linux posix_fadvise(POSIX_FADV_DONTNEED) invalidates pages*. Use
> > > this to drop page cache on the destination host during shared storage
> > > migration. This way the destination host will read the latest copy of
> > > the data and will not use stale data from the page cache.
> > >
> > > The flow is as follows:
> > >
> > > 1. Source host writes out all dirty pages and inactivates drives.
> > > 2. QEMU_VM_EOF is sent on migration stream.
> > > 3. Destination host invalidates caches before accessing drives.
> > >
> > > This patch enables live migration even with -drive cache.direct=off.
> > >
> > > * Terms and conditions may apply, please see patch for details.
> > >
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > ---
> > > block/file-posix.c | 39 +++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 39 insertions(+)
> > >
> > > diff --git a/block/file-posix.c b/block/file-posix.c
> > > index 3794c0007a..df4f52919f 100644
> > > --- a/block/file-posix.c
> > > +++ b/block/file-posix.c
> > > @@ -2236,6 +2236,42 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> > > return ret | BDRV_BLOCK_OFFSET_VALID;
> > > }
> > >
> > > +static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
> > > + Error **errp)
> > > +{
> > > + BDRVRawState *s = bs->opaque;
> > > + int ret;
> > > +
> > > + ret = fd_open(bs);
> > > + if (ret < 0) {
> > > + error_setg_errno(errp, -ret, "The file descriptor is not open");
> > > + return;
> > > + }
> > > +
> > > + if (s->open_flags & O_DIRECT) {
> > > + return; /* No host kernel page cache */
> > > + }
> > > +
> > > +#if defined(__linux__)
> > > + /* This sets the scene for the next syscall... */
> > > + ret = bdrv_co_flush(bs);
> > > + if (ret < 0) {
> > > + error_setg_errno(errp, -ret, "flush failed");
> > > + return;
> > > + }
> > > +
> > > + /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
> > > + * process. These limitations are okay because we just fsynced the file,
> > > + * we don't use mmap, and the file should not be in use by other processes.
> > > + */
> > > + ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
> >
> > What happens if I try a migrate between two qemu's on the same host?
> > (Which I, and avocado, both use for testing; I think users
> > occasionally do for QEMU updates).
>
> The steps quoted from the commit description:
>
> 1. Source host writes out all dirty pages and inactivates drives.
> 2. QEMU_VM_EOF is sent on migration stream.
> 3. Destination host invalidates caches before accessing drives.
>
> When we reach Step 3 the source QEMU is not doing I/O (no pages are
> locked). The destination QEMU does bdrv_co_flush() so even if pages are
> still dirty (that shouldn't happen since the source already drained and
> flushed) they will be written out and pages will be clean. Therefore
> fadvise really invalidates all resident pages.
>
> FWIW when writing this patch I tested with both QEMUs on the same host.
Which is actually unnecessary overhead on localhost because the local
kernel page cache can't be incoherent with itself. But I don't think
it's a real problem either.
Kevin
© 2016 - 2025 Red Hat, Inc.