[v1] migration: Fixes to the 'background-snapshot' code

[PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by Andrey Gruzdev 4 years, 10 months ago

This commit solves the issue with userfault_fd WP feature that
background snapshot is based on. For any never poluated or discarded
memory page, the UFFDIO_WRITEPROTECT ioctl() would skip updating
PTE for that page, thereby loosing WP setting for it.

So we need to pre-fault pages for each RAM block to be protected
before making a userfault_fd wr-protect ioctl().

Signed-off-by: Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
---
 migration/migration.c |  6 +++++
 migration/ram.c       | 51 +++++++++++++++++++++++++++++++++++++++++++
 migration/ram.h       |  1 +
 3 files changed, 58 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 656d6249a6..496e88cbda 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3872,6 +3872,12 @@ static void *bg_migration_thread(void *opaque)
 
     update_iteration_initial_status(s);
 
+    /*
+     * Prepare for tracking memory writes with UFFD-WP - populate
+     * RAM pages before protecting.
+     */
+    ram_write_tracking_prepare();
+
     qemu_savevm_state_header(s->to_dst_file);
     qemu_savevm_state_setup(s->to_dst_file);
 
diff --git a/migration/ram.c b/migration/ram.c
index 52537f14ac..825eb80030 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1560,6 +1560,57 @@ out:
     return ret;
 }
 
+/*
+ * ram_block_populate_pages: populate memory in the RAM block by reading
+ *   an integer from the beginning of each page.
+ *
+ * Since it's solely used for userfault_fd WP feature, here we just
+ *   hardcode page size to TARGET_PAGE_SIZE.
+ *
+ * @bs: RAM block to populate
+ */
+volatile int ram_block_populate_pages__tmp;
+static void ram_block_populate_pages(RAMBlock *bs)
+{
+    ram_addr_t offset = 0;
+    int tmp = 0;
+
+    for (char *ptr = (char *) bs->host; offset < bs->used_length;
+            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
+        /* Try to do it without memory writes */
+        tmp += *(volatile int *) ptr;
+    }
+    /* Create dependency on 'extern volatile int' to avoid optimizing out */
+    ram_block_populate_pages__tmp += tmp;
+}
+
+/*
+ * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
+ */
+void ram_write_tracking_prepare(void)
+{
+    RAMBlock *bs;
+
+    RCU_READ_LOCK_GUARD();
+
+    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
+        /* Nothing to do with read-only and MMIO-writable regions */
+        if (bs->mr->readonly || bs->mr->rom_device) {
+            continue;
+        }
+
+        /*
+         * Populate pages of the RAM block before enabling userfault_fd
+         * write protection.
+         *
+         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
+         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
+         * pages with pte_none() entries in page table.
+         */
+        ram_block_populate_pages(bs);
+    }
+}
+
 /*
  * ram_write_tracking_start: start UFFD-WP memory tracking
  *
diff --git a/migration/ram.h b/migration/ram.h
index 6378bb3ebc..4833e9fd5b 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -82,6 +82,7 @@ void colo_incoming_start_dirty_log(void);
 /* Background snapshot */
 bool ram_write_tracking_available(void);
 bool ram_write_tracking_compatible(void);
+void ram_write_tracking_prepare(void);
 int ram_write_tracking_start(void);
 void ram_write_tracking_stop(void);
 
-- 
2.25.1

Re: [PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by David Hildenbrand 4 years, 10 months ago

> +/*
> + * ram_block_populate_pages: populate memory in the RAM block by reading
> + *   an integer from the beginning of each page.
> + *
> + * Since it's solely used for userfault_fd WP feature, here we just
> + *   hardcode page size to TARGET_PAGE_SIZE.
> + *
> + * @bs: RAM block to populate
> + */
> +volatile int ram_block_populate_pages__tmp;
> +static void ram_block_populate_pages(RAMBlock *bs)
> +{
> +    ram_addr_t offset = 0;
> +    int tmp = 0;
> +
> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {

You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE

> +        /* Try to do it without memory writes */
> +        tmp += *(volatile int *) ptr;
> +    }


The following is slightly simpler and doesn't rely on volatile semantics [1].
Should work on any arch I guess.

static void ram_block_populate_pages(RAMBlock *bs)
{
     char *ptr = (char *) bs->host;
     ram_addr_t offset;

     for (offset = 0; offset < bs->used_length;
          offset += qemu_real_host_page_size) {
	char tmp = *(volatile char *)(ptr + offset)

	/* Don't optimize the read out. */
	asm volatile ("" : "+r" (tmp));
}

Compiles to

     for (offset = 0; offset < bs->used_length;
     316d:       48 8b 4b 30             mov    0x30(%rbx),%rcx
     char *ptr = (char *) bs->host;
     3171:       48 8b 73 18             mov    0x18(%rbx),%rsi
     for (offset = 0; offset < bs->used_length;
     3175:       48 85 c9                test   %rcx,%rcx
     3178:       74 ce                   je     3148 <ram_write_tracking_prepare+0x58>
          offset += qemu_real_host_page_size) {
     317a:       48 8b 05 00 00 00 00    mov    0x0(%rip),%rax        # 3181 <ram_write_tracking_prepare+0x91>
     3181:       48 8b 38                mov    (%rax),%rdi
     3184:       0f 1f 40 00             nopl   0x0(%rax)
         char tmp = *(volatile char *)(ptr + offset);
     3188:       48 8d 04 16             lea    (%rsi,%rdx,1),%rax
     318c:       0f b6 00                movzbl (%rax),%eax
          offset += qemu_real_host_page_size) {
     318f:       48 01 fa                add    %rdi,%rdx
     for (offset = 0; offset < bs->used_length;
     3192:       48 39 ca                cmp    %rcx,%rdx
     3195:       72 f1                   jb     3188 <ram_write_tracking_prepare+0x98>


[1] https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/


I'll send patches soon to take care of virtio-mem via RamDiscardManager -
to skip populating the parts that are supposed to remain discarded and not migrated.
Unfortunately, the RamDiscardManager patches are still stuck waiting for
acks ... and now we're in soft-freeze.

-- 
Thanks,

David / dhildenb

Re: [PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by David Hildenbrand 4 years, 10 months ago

On 19.03.21 10:28, David Hildenbrand wrote:
>> +/*
>> + * ram_block_populate_pages: populate memory in the RAM block by reading
>> + *   an integer from the beginning of each page.
>> + *
>> + * Since it's solely used for userfault_fd WP feature, here we just
>> + *   hardcode page size to TARGET_PAGE_SIZE.
>> + *
>> + * @bs: RAM block to populate
>> + */
>> +volatile int ram_block_populate_pages__tmp;
>> +static void ram_block_populate_pages(RAMBlock *bs)
>> +{
>> +    ram_addr_t offset = 0;
>> +    int tmp = 0;
>> +
>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
> 
> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
> 
>> +        /* Try to do it without memory writes */
>> +        tmp += *(volatile int *) ptr;
>> +    }
> 
> 
> The following is slightly simpler and doesn't rely on volatile semantics [1].
> Should work on any arch I guess.
> 
> static void ram_block_populate_pages(RAMBlock *bs)
> {
>       char *ptr = (char *) bs->host;
>       ram_addr_t offset;
> 
>       for (offset = 0; offset < bs->used_length;
>            offset += qemu_real_host_page_size) {
> 	char tmp = *(volatile char *)(ptr + offset)

I wanted to do a "= *(ptr + offset)" here.

> 
> 	/* Don't optimize the read out. */
> 	asm volatile ("" : "+r" (tmp));

So this is the only volatile thing that the compiler must guarantee to 
not optimize away.


-- 
Thanks,

David / dhildenb

Re: [PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by Andrey Gruzdev 4 years, 10 months ago

On 19.03.2021 12:32, David Hildenbrand wrote:
> On 19.03.21 10:28, David Hildenbrand wrote:
>>> +/*
>>> + * ram_block_populate_pages: populate memory in the RAM block by 
>>> reading
>>> + *   an integer from the beginning of each page.
>>> + *
>>> + * Since it's solely used for userfault_fd WP feature, here we just
>>> + *   hardcode page size to TARGET_PAGE_SIZE.
>>> + *
>>> + * @bs: RAM block to populate
>>> + */
>>> +volatile int ram_block_populate_pages__tmp;
>>> +static void ram_block_populate_pages(RAMBlock *bs)
>>> +{
>>> +    ram_addr_t offset = 0;
>>> +    int tmp = 0;
>>> +
>>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>>
>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>>
>>> +        /* Try to do it without memory writes */
>>> +        tmp += *(volatile int *) ptr;
>>> +    }
>>
>>
>> The following is slightly simpler and doesn't rely on volatile 
>> semantics [1].
>> Should work on any arch I guess.
>>
>> static void ram_block_populate_pages(RAMBlock *bs)
>> {
>>       char *ptr = (char *) bs->host;
>>       ram_addr_t offset;
>>
>>       for (offset = 0; offset < bs->used_length;
>>            offset += qemu_real_host_page_size) {
>>     char tmp = *(volatile char *)(ptr + offset)
>
> I wanted to do a "= *(ptr + offset)" here.
>
Yep
>>
>>     /* Don't optimize the read out. */
>>     asm volatile ("" : "+r" (tmp));
>
> So this is the only volatile thing that the compiler must guarantee to 
> not optimize away.
>
>

-- 
Andrey Gruzdev, Principal Engineer
Virtuozzo GmbH  +7-903-247-6397
                 virtuzzo.com

Re: [PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by Andrey Gruzdev 4 years, 10 months ago

On 19.03.2021 12:28, David Hildenbrand wrote:
>> +/*
>> + * ram_block_populate_pages: populate memory in the RAM block by 
>> reading
>> + *   an integer from the beginning of each page.
>> + *
>> + * Since it's solely used for userfault_fd WP feature, here we just
>> + *   hardcode page size to TARGET_PAGE_SIZE.
>> + *
>> + * @bs: RAM block to populate
>> + */
>> +volatile int ram_block_populate_pages__tmp;
>> +static void ram_block_populate_pages(RAMBlock *bs)
>> +{
>> +    ram_addr_t offset = 0;
>> +    int tmp = 0;
>> +
>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>
> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>
Ok.
>> +        /* Try to do it without memory writes */
>> +        tmp += *(volatile int *) ptr;
>> +    }
>
>
> The following is slightly simpler and doesn't rely on volatile 
> semantics [1].
> Should work on any arch I guess.
>
> static void ram_block_populate_pages(RAMBlock *bs)
> {
>     char *ptr = (char *) bs->host;
>     ram_addr_t offset;
>
>     for (offset = 0; offset < bs->used_length;
>          offset += qemu_real_host_page_size) {
>     char tmp = *(volatile char *)(ptr + offset)
>
>     /* Don't optimize the read out. */
>     asm volatile ("" : "+r" (tmp));
> }
>
Thanks, good option, I'll change the code.

> Compiles to
>
>     for (offset = 0; offset < bs->used_length;
>     316d:       48 8b 4b 30             mov    0x30(%rbx),%rcx
>     char *ptr = (char *) bs->host;
>     3171:       48 8b 73 18             mov    0x18(%rbx),%rsi
>     for (offset = 0; offset < bs->used_length;
>     3175:       48 85 c9                test   %rcx,%rcx
>     3178:       74 ce                   je     3148 
> <ram_write_tracking_prepare+0x58>
>          offset += qemu_real_host_page_size) {
>     317a:       48 8b 05 00 00 00 00    mov 0x0(%rip),%rax        # 
> 3181 <ram_write_tracking_prepare+0x91>
>     3181:       48 8b 38                mov    (%rax),%rdi
>     3184:       0f 1f 40 00             nopl   0x0(%rax)
>         char tmp = *(volatile char *)(ptr + offset);
>     3188:       48 8d 04 16             lea    (%rsi,%rdx,1),%rax
>     318c:       0f b6 00                movzbl (%rax),%eax
>          offset += qemu_real_host_page_size) {
>     318f:       48 01 fa                add    %rdi,%rdx
>     for (offset = 0; offset < bs->used_length;
>     3192:       48 39 ca                cmp    %rcx,%rdx
>     3195:       72 f1                   jb     3188 
> <ram_write_tracking_prepare+0x98>
>
>
> [1] 
> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/
>
>
> I'll send patches soon to take care of virtio-mem via RamDiscardManager -
> to skip populating the parts that are supposed to remain discarded and 
> not migrated.
> Unfortunately, the RamDiscardManager patches are still stuck waiting for
> acks ... and now we're in soft-freeze.
>
RamDiscardManager patches - do they also modify migration code?
I mean which part is responsible of not migrating discarded ranges.

-- 
Andrey Gruzdev, Principal Engineer
Virtuozzo GmbH  +7-903-247-6397
                 virtuzzo.com

Re: [PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by David Hildenbrand 4 years, 10 months ago

On 19.03.21 12:05, Andrey Gruzdev wrote:
> On 19.03.2021 12:28, David Hildenbrand wrote:
>>> +/*
>>> + * ram_block_populate_pages: populate memory in the RAM block by
>>> reading
>>> + *   an integer from the beginning of each page.
>>> + *
>>> + * Since it's solely used for userfault_fd WP feature, here we just
>>> + *   hardcode page size to TARGET_PAGE_SIZE.
>>> + *
>>> + * @bs: RAM block to populate
>>> + */
>>> +volatile int ram_block_populate_pages__tmp;
>>> +static void ram_block_populate_pages(RAMBlock *bs)
>>> +{
>>> +    ram_addr_t offset = 0;
>>> +    int tmp = 0;
>>> +
>>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>>
>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>>
> Ok.
>>> +        /* Try to do it without memory writes */
>>> +        tmp += *(volatile int *) ptr;
>>> +    }
>>
>>
>> The following is slightly simpler and doesn't rely on volatile
>> semantics [1].
>> Should work on any arch I guess.
>>
>> static void ram_block_populate_pages(RAMBlock *bs)
>> {
>>      char *ptr = (char *) bs->host;
>>      ram_addr_t offset;
>>
>>      for (offset = 0; offset < bs->used_length;
>>           offset += qemu_real_host_page_size) {
>>      char tmp = *(volatile char *)(ptr + offset)
>>
>>      /* Don't optimize the read out. */
>>      asm volatile ("" : "+r" (tmp));
>> }
>>
> Thanks, good option, I'll change the code.
> 
>> Compiles to
>>
>>      for (offset = 0; offset < bs->used_length;
>>      316d:       48 8b 4b 30             mov    0x30(%rbx),%rcx
>>      char *ptr = (char *) bs->host;
>>      3171:       48 8b 73 18             mov    0x18(%rbx),%rsi
>>      for (offset = 0; offset < bs->used_length;
>>      3175:       48 85 c9                test   %rcx,%rcx
>>      3178:       74 ce                   je     3148
>> <ram_write_tracking_prepare+0x58>
>>           offset += qemu_real_host_page_size) {
>>      317a:       48 8b 05 00 00 00 00    mov 0x0(%rip),%rax        #
>> 3181 <ram_write_tracking_prepare+0x91>
>>      3181:       48 8b 38                mov    (%rax),%rdi
>>      3184:       0f 1f 40 00             nopl   0x0(%rax)
>>          char tmp = *(volatile char *)(ptr + offset);
>>      3188:       48 8d 04 16             lea    (%rsi,%rdx,1),%rax
>>      318c:       0f b6 00                movzbl (%rax),%eax
>>           offset += qemu_real_host_page_size) {
>>      318f:       48 01 fa                add    %rdi,%rdx
>>      for (offset = 0; offset < bs->used_length;
>>      3192:       48 39 ca                cmp    %rcx,%rdx
>>      3195:       72 f1                   jb     3188
>> <ram_write_tracking_prepare+0x98>
>>
>>
>> [1]
>> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/
>>
>>
>> I'll send patches soon to take care of virtio-mem via RamDiscardManager -
>> to skip populating the parts that are supposed to remain discarded and
>> not migrated.
>> Unfortunately, the RamDiscardManager patches are still stuck waiting for
>> acks ... and now we're in soft-freeze.
>>
> RamDiscardManager patches - do they also modify migration code?
> I mean which part is responsible of not migrating discarded ranges.

I haven't shared relevant patches yet that touch migration code. I'm 
planning on doing that once the generic RamDiscardManager has all 
relevant acks. I'll put you on cc.

-- 
Thanks,

David / dhildenb

Re: [PATCH 3/3] migration: Pre-fault memory before starting background snasphot

Posted by Andrey Gruzdev 4 years, 10 months ago

On 19.03.2021 14:27, David Hildenbrand wrote:
> On 19.03.21 12:05, Andrey Gruzdev wrote:
>> On 19.03.2021 12:28, David Hildenbrand wrote:
>>>> +/*
>>>> + * ram_block_populate_pages: populate memory in the RAM block by
>>>> reading
>>>> + *   an integer from the beginning of each page.
>>>> + *
>>>> + * Since it's solely used for userfault_fd WP feature, here we just
>>>> + *   hardcode page size to TARGET_PAGE_SIZE.
>>>> + *
>>>> + * @bs: RAM block to populate
>>>> + */
>>>> +volatile int ram_block_populate_pages__tmp;
>>>> +static void ram_block_populate_pages(RAMBlock *bs)
>>>> +{
>>>> +    ram_addr_t offset = 0;
>>>> +    int tmp = 0;
>>>> +
>>>> +    for (char *ptr = (char *) bs->host; offset < bs->used_length;
>>>> +            ptr += TARGET_PAGE_SIZE, offset += TARGET_PAGE_SIZE) {
>>>
>>> You'll want qemu_real_host_page_size instead of TARGET_PAGE_SIZE
>>>
>> Ok.
>>>> +        /* Try to do it without memory writes */
>>>> +        tmp += *(volatile int *) ptr;
>>>> +    }
>>>
>>>
>>> The following is slightly simpler and doesn't rely on volatile
>>> semantics [1].
>>> Should work on any arch I guess.
>>>
>>> static void ram_block_populate_pages(RAMBlock *bs)
>>> {
>>>      char *ptr = (char *) bs->host;
>>>      ram_addr_t offset;
>>>
>>>      for (offset = 0; offset < bs->used_length;
>>>           offset += qemu_real_host_page_size) {
>>>      char tmp = *(volatile char *)(ptr + offset)
>>>
>>>      /* Don't optimize the read out. */
>>>      asm volatile ("" : "+r" (tmp));
>>> }
>>>
>> Thanks, good option, I'll change the code.
>>
>>> Compiles to
>>>
>>>      for (offset = 0; offset < bs->used_length;
>>>      316d:       48 8b 4b 30             mov 0x30(%rbx),%rcx
>>>      char *ptr = (char *) bs->host;
>>>      3171:       48 8b 73 18             mov 0x18(%rbx),%rsi
>>>      for (offset = 0; offset < bs->used_length;
>>>      3175:       48 85 c9                test   %rcx,%rcx
>>>      3178:       74 ce                   je     3148
>>> <ram_write_tracking_prepare+0x58>
>>>           offset += qemu_real_host_page_size) {
>>>      317a:       48 8b 05 00 00 00 00    mov 0x0(%rip),%rax        #
>>> 3181 <ram_write_tracking_prepare+0x91>
>>>      3181:       48 8b 38                mov    (%rax),%rdi
>>>      3184:       0f 1f 40 00             nopl   0x0(%rax)
>>>          char tmp = *(volatile char *)(ptr + offset);
>>>      3188:       48 8d 04 16             lea (%rsi,%rdx,1),%rax
>>>      318c:       0f b6 00                movzbl (%rax),%eax
>>>           offset += qemu_real_host_page_size) {
>>>      318f:       48 01 fa                add    %rdi,%rdx
>>>      for (offset = 0; offset < bs->used_length;
>>>      3192:       48 39 ca                cmp    %rcx,%rdx
>>>      3195:       72 f1                   jb     3188
>>> <ram_write_tracking_prepare+0x98>
>>>
>>>
>>> [1]
>>> https://programfan.github.io/blog/2015/04/27/prevent-gcc-optimize-away-code/ 
>>>
>>>
>>>
>>> I'll send patches soon to take care of virtio-mem via 
>>> RamDiscardManager -
>>> to skip populating the parts that are supposed to remain discarded and
>>> not migrated.
>>> Unfortunately, the RamDiscardManager patches are still stuck waiting 
>>> for
>>> acks ... and now we're in soft-freeze.
>>>
>> RamDiscardManager patches - do they also modify migration code?
>> I mean which part is responsible of not migrating discarded ranges.
>
> I haven't shared relevant patches yet that touch migration code. I'm 
> planning on doing that once the generic RamDiscardManager has all 
> relevant acks. I'll put you on cc.
>
Got it, thanks.

-- 
Andrey Gruzdev, Principal Engineer
Virtuozzo GmbH  +7-903-247-6397
                 virtuzzo.com