[v1] migration: Make multifd not experimental

[Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Juan Quintela 7 years ago

Libvirt don't want to expose (and explain it).  And testing looks like
128 is good for all use cases, so just drop it.

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 hmp.c                 |  7 -------
 migration/migration.c | 30 ------------------------------
 migration/migration.h |  1 -
 migration/ram.c       | 13 ++++++++-----
 qapi/migration.json   | 13 +------------
 5 files changed, 9 insertions(+), 55 deletions(-)

diff --git a/hmp.c b/hmp.c
index 63019729ed..73b8443a8e 100644
--- a/hmp.c
+++ b/hmp.c
@@ -426,9 +426,6 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
         monitor_printf(mon, "%s: %u\n",
             MigrationParameter_str(MIGRATION_PARAMETER_X_MULTIFD_CHANNELS),
             params->x_multifd_channels);
-        monitor_printf(mon, "%s: %u\n",
-            MigrationParameter_str(MIGRATION_PARAMETER_X_MULTIFD_PAGE_COUNT),
-            params->x_multifd_page_count);
         monitor_printf(mon, "%s: %" PRIu64 "\n",
             MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE),
             params->xbzrle_cache_size);
@@ -1776,10 +1773,6 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
         p->has_x_multifd_channels = true;
         visit_type_int(v, param, &p->x_multifd_channels, &err);
         break;
-    case MIGRATION_PARAMETER_X_MULTIFD_PAGE_COUNT:
-        p->has_x_multifd_page_count = true;
-        visit_type_int(v, param, &p->x_multifd_page_count, &err);
-        break;
     case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE:
         p->has_xbzrle_cache_size = true;
         visit_type_size(v, param, &cache_size, &err);
diff --git a/migration/migration.c b/migration/migration.c
index f673486679..65df9b566e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -81,7 +81,6 @@
 /* The delay time (in ms) between two COLO checkpoints */
 #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
 #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
-#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 128
 
 /* Background transfer rate for postcopy, 0 means unlimited, note
  * that page requests can still exceed this limit.
@@ -749,8 +748,6 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
     params->block_incremental = s->parameters.block_incremental;
     params->has_x_multifd_channels = true;
     params->x_multifd_channels = s->parameters.x_multifd_channels;
-    params->has_x_multifd_page_count = true;
-    params->x_multifd_page_count = s->parameters.x_multifd_page_count;
     params->has_xbzrle_cache_size = true;
     params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
     params->has_max_postcopy_bandwidth = true;
@@ -1112,14 +1109,6 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
                    "is invalid, it should be in the range of 1 to 255");
         return false;
     }
-    if (params->has_x_multifd_page_count &&
-        (params->x_multifd_page_count < 1 ||
-         params->x_multifd_page_count > 10000)) {
-        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
-                   "multifd_page_count",
-                   "is invalid, it should be in the range of 1 to 10000");
-        return false;
-    }
 
     if (params->has_xbzrle_cache_size &&
         (params->xbzrle_cache_size < qemu_target_page_size() ||
@@ -1202,9 +1191,6 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
     if (params->has_x_multifd_channels) {
         dest->x_multifd_channels = params->x_multifd_channels;
     }
-    if (params->has_x_multifd_page_count) {
-        dest->x_multifd_page_count = params->x_multifd_page_count;
-    }
     if (params->has_xbzrle_cache_size) {
         dest->xbzrle_cache_size = params->xbzrle_cache_size;
     }
@@ -1283,9 +1269,6 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
     if (params->has_x_multifd_channels) {
         s->parameters.x_multifd_channels = params->x_multifd_channels;
     }
-    if (params->has_x_multifd_page_count) {
-        s->parameters.x_multifd_page_count = params->x_multifd_page_count;
-    }
     if (params->has_xbzrle_cache_size) {
         s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
         xbzrle_cache_resize(params->xbzrle_cache_size, errp);
@@ -2044,15 +2027,6 @@ int migrate_multifd_channels(void)
     return s->parameters.x_multifd_channels;
 }
 
-int migrate_multifd_page_count(void)
-{
-    MigrationState *s;
-
-    s = migrate_get_current();
-
-    return s->parameters.x_multifd_page_count;
-}
-
 int migrate_use_xbzrle(void)
 {
     MigrationState *s;
@@ -3286,9 +3260,6 @@ static Property migration_properties[] = {
     DEFINE_PROP_UINT8("x-multifd-channels", MigrationState,
                       parameters.x_multifd_channels,
                       DEFAULT_MIGRATE_MULTIFD_CHANNELS),
-    DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState,
-                      parameters.x_multifd_page_count,
-                      DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT),
     DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
                       parameters.xbzrle_cache_size,
                       DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
@@ -3366,7 +3337,6 @@ static void migration_instance_init(Object *obj)
     params->has_x_checkpoint_delay = true;
     params->has_block_incremental = true;
     params->has_x_multifd_channels = true;
-    params->has_x_multifd_page_count = true;
     params->has_xbzrle_cache_size = true;
     params->has_max_postcopy_bandwidth = true;
     params->has_max_cpu_throttle = true;
diff --git a/migration/migration.h b/migration/migration.h
index bd41b57af9..5e2b004a6c 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -269,7 +269,6 @@ bool migrate_auto_converge(void);
 bool migrate_use_multifd(void);
 bool migrate_pause_before_switchover(void);
 int migrate_multifd_channels(void);
-int migrate_multifd_page_count(void);
 
 int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);
diff --git a/migration/ram.c b/migration/ram.c
index 59191c1ed2..ebe893e356 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -520,6 +520,9 @@ exit:
 
 #define MULTIFD_FLAG_SYNC (1 << 0)
 
+#define MULTIFD_PAGE_COUNT 128
+
+
 typedef struct {
     uint32_t magic;
     uint32_t version;
@@ -718,7 +721,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
     packet->magic = cpu_to_be32(MULTIFD_MAGIC);
     packet->version = cpu_to_be32(MULTIFD_VERSION);
     packet->flags = cpu_to_be32(p->flags);
-    packet->size = cpu_to_be32(migrate_multifd_page_count());
+    packet->size = cpu_to_be32(MULTIFD_PAGE_COUNT);
     packet->used = cpu_to_be32(p->pages->used);
     packet->packet_num = cpu_to_be64(p->packet_num);
 
@@ -756,10 +759,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
     p->flags = be32_to_cpu(packet->flags);
 
     packet->size = be32_to_cpu(packet->size);
-    if (packet->size > migrate_multifd_page_count()) {
+    if (packet->size > MULTIFD_PAGE_COUNT) {
         error_setg(errp, "multifd: received packet "
                    "with size %d and expected maximum size %d",
-                   packet->size, migrate_multifd_page_count()) ;
+                   packet->size, MULTIFD_PAGE_COUNT) ;
         return -1;
     }
 
@@ -1085,7 +1088,7 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
 int multifd_save_setup(void)
 {
     int thread_count;
-    uint32_t page_count = migrate_multifd_page_count();
+    uint32_t page_count = MULTIFD_PAGE_COUNT;
     uint8_t i;
 
     if (!migrate_use_multifd()) {
@@ -1281,7 +1284,7 @@ static void *multifd_recv_thread(void *opaque)
 int multifd_load_setup(void)
 {
     int thread_count;
-    uint32_t page_count = migrate_multifd_page_count();
+    uint32_t page_count = MULTIFD_PAGE_COUNT;
     uint8_t i;
 
     if (!migrate_use_multifd()) {
diff --git a/qapi/migration.json b/qapi/migration.json
index b62947791f..8c5db60406 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -547,9 +547,6 @@
 #                     number of sockets used for migration.  The
 #                     default value is 2 (since 2.11)
 #
-# @x-multifd-page-count: Number of pages sent together to a thread.
-#                        The default value is 16 (since 2.11)
-#
 # @xbzrle-cache-size: cache size to be used by XBZRLE migration.  It
 #                     needs to be a multiple of the target page size
 #                     and a power of 2
@@ -569,7 +566,7 @@
            'cpu-throttle-initial', 'cpu-throttle-increment',
            'tls-creds', 'tls-hostname', 'max-bandwidth',
            'downtime-limit', 'x-checkpoint-delay', 'block-incremental',
-           'x-multifd-channels', 'x-multifd-page-count',
+           'x-multifd-channels',
            'xbzrle-cache-size', 'max-postcopy-bandwidth',
            'max-cpu-throttle' ] }
 
@@ -637,9 +634,6 @@
 #                     number of sockets used for migration.  The
 #                     default value is 2 (since 2.11)
 #
-# @x-multifd-page-count: Number of pages sent together to a thread.
-#                        The default value is 16 (since 2.11)
-#
 # @xbzrle-cache-size: cache size to be used by XBZRLE migration.  It
 #                     needs to be a multiple of the target page size
 #                     and a power of 2
@@ -670,7 +664,6 @@
             '*x-checkpoint-delay': 'int',
             '*block-incremental': 'bool',
             '*x-multifd-channels': 'int',
-            '*x-multifd-page-count': 'int',
             '*xbzrle-cache-size': 'size',
             '*max-postcopy-bandwidth': 'size',
 	    '*max-cpu-throttle': 'int' } }
@@ -754,9 +747,6 @@
 #                     number of sockets used for migration.
 #                     The default value is 2 (since 2.11)
 #
-# @x-multifd-page-count: Number of pages sent together to a thread.
-#                        The default value is 16 (since 2.11)
-#
 # @xbzrle-cache-size: cache size to be used by XBZRLE migration.  It
 #                     needs to be a multiple of the target page size
 #                     and a power of 2
@@ -786,7 +776,6 @@
             '*x-checkpoint-delay': 'uint32',
             '*block-incremental': 'bool' ,
             '*x-multifd-channels': 'uint8',
-            '*x-multifd-page-count': 'uint32',
             '*xbzrle-cache-size': 'size',
 	    '*max-postcopy-bandwidth': 'size',
             '*max-cpu-throttle':'uint8'} }
-- 
2.20.1

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Laurent Vivier 7 years ago

On 06/02/2019 14:23, Juan Quintela wrote:
> Libvirt don't want to expose (and explain it).  And testing looks like
> 128 is good for all use cases, so just drop it.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
>  hmp.c                 |  7 -------
>  migration/migration.c | 30 ------------------------------
>  migration/migration.h |  1 -
>  migration/ram.c       | 13 ++++++++-----
>  qapi/migration.json   | 13 +------------
>  5 files changed, 9 insertions(+), 55 deletions(-)
> 
...
> diff --git a/migration/migration.c b/migration/migration.c
> index f673486679..65df9b566e 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -81,7 +81,6 @@
>  /* The delay time (in ms) between two COLO checkpoints */
>  #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
>  #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
> -#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 128

Why do you update it in the previous patch to remove it in this one?

Thanks,
Laurent

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Juan Quintela 7 years ago

Laurent Vivier <lvivier@redhat.com> wrote:
> On 06/02/2019 14:23, Juan Quintela wrote:
>> Libvirt don't want to expose (and explain it).  And testing looks like
>> 128 is good for all use cases, so just drop it.
>> 
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>> ---
>>  hmp.c                 |  7 -------
>>  migration/migration.c | 30 ------------------------------
>>  migration/migration.h |  1 -
>>  migration/ram.c       | 13 ++++++++-----
>>  qapi/migration.json   | 13 +------------
>>  5 files changed, 9 insertions(+), 55 deletions(-)
>> 
> ...
>> diff --git a/migration/migration.c b/migration/migration.c
>> index f673486679..65df9b566e 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -81,7 +81,6 @@
>>  /* The delay time (in ms) between two COLO checkpoints */
>>  #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
>>  #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
>> -#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 128
>
> Why do you update it in the previous patch to remove it in this one?

To make clear that I change the default.  Otherwise it gets hidden into
the whole patch.  if you preffer I could have done the other way around.

Later, Juan.

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Laurent Vivier 7 years ago

On 06/02/2019 18:58, Juan Quintela wrote:
> Laurent Vivier <lvivier@redhat.com> wrote:
>> On 06/02/2019 14:23, Juan Quintela wrote:
>>> Libvirt don't want to expose (and explain it).  And testing looks like
>>> 128 is good for all use cases, so just drop it.
>>>
>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>> ---
>>>  hmp.c                 |  7 -------
>>>  migration/migration.c | 30 ------------------------------
>>>  migration/migration.h |  1 -
>>>  migration/ram.c       | 13 ++++++++-----
>>>  qapi/migration.json   | 13 +------------
>>>  5 files changed, 9 insertions(+), 55 deletions(-)
>>>
>> ...
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index f673486679..65df9b566e 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -81,7 +81,6 @@
>>>  /* The delay time (in ms) between two COLO checkpoints */
>>>  #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
>>>  #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
>>> -#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 128
>>
>> Why do you update it in the previous patch to remove it in this one?
> 
> To make clear that I change the default.  Otherwise it gets hidden into
> the whole patch.  if you preffer I could have done the other way around.

OK, I understand. It's not really clear because the new default
(MULTIFD_PAGE_COUNT) is hidden in the patch.

Moreover, in the first patch you update the value, but you don't update
the comments in qapi/migration.json (I've seen that because you remove
them in this patch).

Perhaps you can proceed in the reverse order: remove the parameter and
then set the new default... or merge the two patches and saying in the
commit message you change the default value.

Thanks,
Laurent

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Juan Quintela 7 years ago

Laurent Vivier <lvivier@redhat.com> wrote:
> On 06/02/2019 18:58, Juan Quintela wrote:
>> Laurent Vivier <lvivier@redhat.com> wrote:
>>> On 06/02/2019 14:23, Juan Quintela wrote:
>>>> Libvirt don't want to expose (and explain it).  And testing looks like
>>>> 128 is good for all use cases, so just drop it.
>>>>
>>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>>> ---
>>>>  hmp.c                 |  7 -------
>>>>  migration/migration.c | 30 ------------------------------
>>>>  migration/migration.h |  1 -
>>>>  migration/ram.c       | 13 ++++++++-----
>>>>  qapi/migration.json   | 13 +------------
>>>>  5 files changed, 9 insertions(+), 55 deletions(-)
>>>>
>>> ...
>>>> diff --git a/migration/migration.c b/migration/migration.c
>>>> index f673486679..65df9b566e 100644
>>>> --- a/migration/migration.c
>>>> +++ b/migration/migration.c
>>>> @@ -81,7 +81,6 @@
>>>>  /* The delay time (in ms) between two COLO checkpoints */
>>>>  #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
>>>>  #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
>>>> -#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 128
>>>
>>> Why do you update it in the previous patch to remove it in this one?
>> 
>> To make clear that I change the default.  Otherwise it gets hidden into
>> the whole patch.  if you preffer I could have done the other way around.
>
> OK, I understand. It's not really clear because the new default
> (MULTIFD_PAGE_COUNT) is hidden in the patch.
>
> Moreover, in the first patch you update the value, but you don't update
> the comments in qapi/migration.json (I've seen that because you remove
> them in this patch).

Aha, I knew I was forgetting something.

> Perhaps you can proceed in the reverse order: remove the parameter and
> then set the new default... or merge the two patches and saying in the
> commit message you change the default value.

Ok.

> Thanks,
> Laurent

Later, Juan.

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Daniel P. Berrangé 7 years ago

On Wed, Feb 06, 2019 at 02:23:29PM +0100, Juan Quintela wrote:
> Libvirt don't want to expose (and explain it).  And testing looks like
> 128 is good for all use cases, so just drop it.

One significant concern inline...

> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> ---
>  hmp.c                 |  7 -------
>  migration/migration.c | 30 ------------------------------
>  migration/migration.h |  1 -
>  migration/ram.c       | 13 ++++++++-----
>  qapi/migration.json   | 13 +------------
>  5 files changed, 9 insertions(+), 55 deletions(-)

> @@ -718,7 +721,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
>      packet->magic = cpu_to_be32(MULTIFD_MAGIC);
>      packet->version = cpu_to_be32(MULTIFD_VERSION);
>      packet->flags = cpu_to_be32(p->flags);
> -    packet->size = cpu_to_be32(migrate_multifd_page_count());
> +    packet->size = cpu_to_be32(MULTIFD_PAGE_COUNT);
>      packet->used = cpu_to_be32(p->pages->used);
>      packet->packet_num = cpu_to_be64(p->packet_num);
>

Here the source QEMU sends the page size - which is now
a hardcoded constant - to the target QEMU.

> @@ -756,10 +759,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>      p->flags = be32_to_cpu(packet->flags);
>  
>      packet->size = be32_to_cpu(packet->size);
> -    if (packet->size > migrate_multifd_page_count()) {
> +    if (packet->size > MULTIFD_PAGE_COUNT) {
>          error_setg(errp, "multifd: received packet "
>                     "with size %d and expected maximum size %d",
> -                   packet->size, migrate_multifd_page_count()) ;
> +                   packet->size, MULTIFD_PAGE_COUNT) ;
>          return -1;
>      }
>

Here the dest QEMU receives the page size that the source QEMU used, and
checks that it is not larger than its constant.

IIUC, the implication here is that if we ever increase the size of this
constant in future QEMU, we will break live migration from new to old
QEMU due to this check.  In fact your previous patch in this series has
done exactly that, so this appears to mean QEMU 4.0 -> QEMU 3.2
multifd migration is broken now.

Alternatively if we decrease the size of the constant in future
QEMU, we will break live migration from old QEMU to new QEMU which
is even worse.

This problem existed before this patch, if the management app was
not explicitly using migrate-set-parameters to set the page count
on both sides of QEMU. So we're already broken, but at least the
feature was marked experimental.

What is the purpose of this packet size check ?  Is it something
we can safely remove, so that we can increase or decrease the
size at will without breaking migration compat.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Juan Quintela 6 years, 12 months ago

Daniel P. Berrangé <berrange@redhat.com> wrote:
> On Wed, Feb 06, 2019 at 02:23:29PM +0100, Juan Quintela wrote:
>> Libvirt don't want to expose (and explain it).  And testing looks like
>> 128 is good for all use cases, so just drop it.
>
> One significant concern inline...
>
>> 
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>> ---
>>  hmp.c                 |  7 -------
>>  migration/migration.c | 30 ------------------------------
>>  migration/migration.h |  1 -
>>  migration/ram.c       | 13 ++++++++-----
>>  qapi/migration.json   | 13 +------------
>>  5 files changed, 9 insertions(+), 55 deletions(-)
>
>
>> @@ -718,7 +721,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
>>      packet->magic = cpu_to_be32(MULTIFD_MAGIC);
>>      packet->version = cpu_to_be32(MULTIFD_VERSION);
>>      packet->flags = cpu_to_be32(p->flags);
>> -    packet->size = cpu_to_be32(migrate_multifd_page_count());
>> +    packet->size = cpu_to_be32(MULTIFD_PAGE_COUNT);
>>      packet->used = cpu_to_be32(p->pages->used);
>>      packet->packet_num = cpu_to_be64(p->packet_num);
>>
>
> Here the source QEMU sends the page size - which is now
> a hardcoded constant - to the target QEMU.
>
>> @@ -756,10 +759,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>>      p->flags = be32_to_cpu(packet->flags);
>>  
>>      packet->size = be32_to_cpu(packet->size);
>> -    if (packet->size > migrate_multifd_page_count()) {
>> +    if (packet->size > MULTIFD_PAGE_COUNT) {
>>          error_setg(errp, "multifd: received packet "
>>                     "with size %d and expected maximum size %d",
>> -                   packet->size, migrate_multifd_page_count()) ;
>> +                   packet->size, MULTIFD_PAGE_COUNT) ;
>>          return -1;
>>      }
>>
>
> Here the dest QEMU receives the page size that the source QEMU used, and
> checks that it is not larger than its constant.
>
> IIUC, the implication here is that if we ever increase the size of this
> constant in future QEMU, we will break live migration from new to old
> QEMU due to this check.  In fact your previous patch in this series has
> done exactly that, so this appears to mean QEMU 4.0 -> QEMU 3.2
> multifd migration is broken now.
>
> Alternatively if we decrease the size of the constant in future
> QEMU, we will break live migration from old QEMU to new QEMU which
> is even worse.
>
> This problem existed before this patch, if the management app was
> not explicitly using migrate-set-parameters to set the page count
> on both sides of QEMU. So we're already broken, but at least the
> feature was marked experimental.
>
> What is the purpose of this packet size check ?  Is it something
> we can safely remove, so that we can increase or decrease the
> size at will without breaking migration compat.

We have a "dinamyc" array of pages of that size.  What we check is that
the array fits into the part that we have assigned.

We "could" wait until this moment to create the arrays, I need to look
into that.  Notice that what the check does is making sure that whatewer
we receive is not bigger than the space that we have allocated.

At this point, that check can only fail if we are "being" attacked and
we have a malformed string.  We check during negotiation that this value
is ok.

We should check this *also* in the initial packet, and then this check
should never be true.

From a management point of view, what do you preffer here?

Later, Juan.

Re: [Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter

Posted by Daniel P. Berrangé 6 years, 12 months ago

On Tue, Feb 12, 2019 at 10:34:35AM +0100, Juan Quintela wrote:
> Daniel P. Berrangé <berrange@redhat.com> wrote:
> > On Wed, Feb 06, 2019 at 02:23:29PM +0100, Juan Quintela wrote:
> >> Libvirt don't want to expose (and explain it).  And testing looks like
> >> 128 is good for all use cases, so just drop it.
> >
> > One significant concern inline...
> >
> >> 
> >> Signed-off-by: Juan Quintela <quintela@redhat.com>
> >> ---
> >>  hmp.c                 |  7 -------
> >>  migration/migration.c | 30 ------------------------------
> >>  migration/migration.h |  1 -
> >>  migration/ram.c       | 13 ++++++++-----
> >>  qapi/migration.json   | 13 +------------
> >>  5 files changed, 9 insertions(+), 55 deletions(-)
> >
> >
> >> @@ -718,7 +721,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p)
> >>      packet->magic = cpu_to_be32(MULTIFD_MAGIC);
> >>      packet->version = cpu_to_be32(MULTIFD_VERSION);
> >>      packet->flags = cpu_to_be32(p->flags);
> >> -    packet->size = cpu_to_be32(migrate_multifd_page_count());
> >> +    packet->size = cpu_to_be32(MULTIFD_PAGE_COUNT);
> >>      packet->used = cpu_to_be32(p->pages->used);
> >>      packet->packet_num = cpu_to_be64(p->packet_num);
> >>
> >
> > Here the source QEMU sends the page size - which is now
> > a hardcoded constant - to the target QEMU.
> >
> >> @@ -756,10 +759,10 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >>      p->flags = be32_to_cpu(packet->flags);
> >>  
> >>      packet->size = be32_to_cpu(packet->size);
> >> -    if (packet->size > migrate_multifd_page_count()) {
> >> +    if (packet->size > MULTIFD_PAGE_COUNT) {
> >>          error_setg(errp, "multifd: received packet "
> >>                     "with size %d and expected maximum size %d",
> >> -                   packet->size, migrate_multifd_page_count()) ;
> >> +                   packet->size, MULTIFD_PAGE_COUNT) ;
> >>          return -1;
> >>      }
> >>
> >
> > Here the dest QEMU receives the page size that the source QEMU used, and
> > checks that it is not larger than its constant.
> >
> > IIUC, the implication here is that if we ever increase the size of this
> > constant in future QEMU, we will break live migration from new to old
> > QEMU due to this check.  In fact your previous patch in this series has
> > done exactly that, so this appears to mean QEMU 4.0 -> QEMU 3.2
> > multifd migration is broken now.
> >
> > Alternatively if we decrease the size of the constant in future
> > QEMU, we will break live migration from old QEMU to new QEMU which
> > is even worse.
> >
> > This problem existed before this patch, if the management app was
> > not explicitly using migrate-set-parameters to set the page count
> > on both sides of QEMU. So we're already broken, but at least the
> > feature was marked experimental.
> >
> > What is the purpose of this packet size check ?  Is it something
> > we can safely remove, so that we can increase or decrease the
> > size at will without breaking migration compat.
> 
> We have a "dinamyc" array of pages of that size.  What we check is that
> the array fits into the part that we have assigned.
> 
> We "could" wait until this moment to create the arrays, I need to look
> into that.  Notice that what the check does is making sure that whatewer
> we receive is not bigger than the space that we have allocated.
> 
> At this point, that check can only fail if we are "being" attacked and
> we have a malformed string.  We check during negotiation that this value
> is ok.
> 
> We should check this *also* in the initial packet, and then this check
> should never be true.

Right but checking earlier will still have the same problem I describe
with back compatibility if we ever change the page count in future QEMU,
as the limit checked by one QEMU may be smaller than what the other QEMU
is intentionally sending.  I don't see where we have a bi-directional
channel that would allow the 2 QEMUs to negotiate a mutually acceptable
page count :-(

> From a management point of view, what do you preffer here?

The earlier we check requirements the better, as it means we get the
error reported sooner, before wasting time of data transmisions.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

[Qemu-devel] [PATCH 1/4] multifd: Change page count default to 128
[Qemu-devel] [PATCH 2/4] multifd: Drop x-multifd-page-count parameter
[Qemu-devel] [PATCH 3/4] multifd: Drop x-
[Qemu-devel] [PATCH 4/4] tests: Add migration multifd test