From: Juraj Marcin <jmarcin@redhat.com>
During migration switchover, both the source and the destination
machines are paused (compute downtime). During this period, the network
still routes packets to the source machine, as that is the last place
where the recipient MAC address was seen. Once the destination side
starts and sends a network announcement, all subsequent frames are
routed correctly. However, frames delivered to the source machine are
never processed and are lost. This also causes network downtime of
roughly the same duration as the compute downtime.
This causes problems not only for protocols that cannot handle packet
loss, but also introduces delays in protocols that can.
To resolve this, this feature instantiates a network filter for each
network backend present during migration setup, on both sides of the
migration. On the source side, the filter caches all packets received
from the backend during switchover. Once the destination machine
starts, all cached packets are sent through the migration channel, and
the respective filter object on the destination side injects them into
the NIC attached to the backend.
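For reference, each packet the source-side filter queues is framed in
its buffer, and thus on the migration stream, as two big-endian 32-bit
fields followed by the raw frame. Expressed as a struct purely for
illustration (the patch writes the fields with memcpy() in
filter_netpass_receive_iov() below; no such struct exists in the code):

    struct netpass_record {
        uint32_t total_size;   /* frame length in bytes, big-endian */
        uint32_t vnet_hdr_len; /* sender's vnet header length, big-endian */
        uint8_t  data[];       /* total_size bytes of frame data */
    };

The destination side parses this stream with the existing
net_fill_rstate() helper.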
Signed-off-by: Juraj Marcin <jmarcin@redhat.com>
---
include/migration/vmstate.h | 6 +
include/net/net.h | 5 +
migration/meson.build | 1 +
migration/migration.c | 49 ++++++-
migration/migration.h | 2 +
migration/netpass.c | 246 ++++++++++++++++++++++++++++++++++++
migration/netpass.h | 14 ++
migration/options.c | 21 +++
migration/options.h | 1 +
migration/savevm.c | 37 ++++++
migration/savevm.h | 2 +
migration/trace-events | 9 ++
net/net.c | 11 ++
net/tap.c | 11 +-
qapi/migration.json | 7 +-
15 files changed, 418 insertions(+), 4 deletions(-)
create mode 100644 migration/netpass.c
create mode 100644 migration/netpass.h
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 62d7e9fe38..7987e6c85a 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -200,6 +200,12 @@ typedef enum {
* save_setup() in VMSD structures.
*/
VMS_PHASE_EARLY_SETUP,
+ /*
+ * Specifies a netpass VMSD. This state is transferred right after the
+ * destination VM starts, regardless of precopy/postcopy. A failure in
+ * this phase does not fail a precopy migration.
+ */
+ VMS_PHASE_NETPASS,
} VMStateSavePhase;
struct VMStateDescription {
diff --git a/include/net/net.h b/include/net/net.h
index 45bc86fc86..510908845b 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -82,6 +82,7 @@ typedef void (NetAnnounce)(NetClientState *);
typedef bool (SetSteeringEBPF)(NetClientState *, int);
typedef bool (NetCheckPeerType)(NetClientState *, ObjectClass *, Error **);
typedef struct vhost_net *(GetVHostNet)(NetClientState *nc);
+typedef void (NetpassEnabledNotify)(NetClientState *nc, void *opaque);
typedef struct NetClientInfo {
NetClientDriver type;
@@ -130,6 +131,9 @@ struct NetClientState {
bool is_netdev;
bool do_not_pad; /* do not pad to the minimum ethernet frame length */
bool is_datapath;
+ bool netpass_enabled;
+ NetpassEnabledNotify *netpass_enabled_notify;
+ void *netpass_enabled_notify_opaque;
QTAILQ_HEAD(, NetFilterState) filters;
};
@@ -198,6 +202,7 @@ void qemu_flush_queued_packets(NetClientState *nc);
void qemu_flush_or_purge_queued_packets(NetClientState *nc, bool purge);
void qemu_set_info_str(NetClientState *nc,
const char *fmt, ...) G_GNUC_PRINTF(2, 3);
+void qemu_set_netpass_enabled(NetClientState *nc, bool enabled);
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
bool qemu_has_ufo(NetClientState *nc);
bool qemu_has_uso(NetClientState *nc);
diff --git a/migration/meson.build b/migration/meson.build
index c7f39bdb55..a501256979 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -30,6 +30,7 @@ system_ss.add(files(
'multifd-nocomp.c',
'multifd-zlib.c',
'multifd-zero-page.c',
+ 'netpass.c',
'options.c',
'postcopy-ram.c',
'ram.c',
diff --git a/migration/migration.c b/migration/migration.c
index 4871db2365..959719dd61 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -63,6 +63,7 @@
#include "system/dirtylimit.h"
#include "qemu/sockets.h"
#include "system/kvm.h"
+#include "netpass.h"
#define NOTIFIER_ELEM_INIT(array, elem) \
[elem] = NOTIFIER_WITH_RETURN_LIST_INITIALIZER((array)[elem])
@@ -488,6 +489,10 @@ void migration_incoming_state_destroy(void)
mis->postcopy_qemufile_dst = NULL;
}
+ if (migrate_netpass()) {
+ migration_netpass_cleanup();
+ }
+
cpr_set_incoming_mode(MIG_MODE_NONE);
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
@@ -755,6 +760,10 @@ static void process_incoming_migration_bh(void *opaque)
migrate_send_rp_vm_started(mis);
}
+ if (migrate_netpass()) {
+ qemu_loadvm_state_netpass(mis->from_src_file, mis);
+ }
+
/*
* This must happen after any state changes since as soon as an external
* observer sees this event they might start to prod at the VM assuming
@@ -775,6 +784,13 @@ process_incoming_migration_co(void *opaque)
assert(mis->from_src_file);
+ if (migrate_netpass()) {
+ ret = migration_netpass_setup(&local_err);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
mis->largest_page_size = qemu_ram_pagesize_largest();
postcopy_state_set(POSTCOPY_INCOMING_NONE);
migrate_set_state(&mis->state, MIGRATION_STATUS_SETUP,
@@ -811,8 +827,7 @@ process_incoming_migration_co(void *opaque)
goto out;
fail:
- migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
- MIGRATION_STATUS_FAILED);
+ migrate_set_state(&mis->state, mis->state, MIGRATION_STATUS_FAILED);
migrate_error_propagate(s, local_err);
migration_incoming_state_destroy();
@@ -1336,6 +1351,10 @@ static void migration_cleanup(MigrationState *s)
qemu_fclose(tmp);
}
+ if (migrate_netpass()) {
+ migration_netpass_cleanup();
+ }
+
assert(!migration_is_active());
if (s->state == MIGRATION_STATUS_CANCELLING) {
@@ -1673,6 +1692,8 @@ int migrate_init(MigrationState *s, Error **errp)
s->dest_vm_started = false;
qemu_event_reset(&s->dest_vm_started_event);
+ s->netpass_state_sent = false;
+
return 0;
}
@@ -2729,6 +2750,10 @@ static bool migration_switchover_start(MigrationState *s, Error **errp)
{
ERRP_GUARD();
+ if (migrate_netpass()) {
+ migration_netpass_activate();
+ }
+
if (!migration_switchover_prepare(s)) {
error_setg(errp, "Switchover is interrupted");
return false;
@@ -2821,6 +2846,14 @@ static void migration_completion(MigrationState *s)
goto fail;
}
+ if (migrate_netpass() && !s->netpass_state_sent) {
+ qemu_event_wait(&s->dest_vm_started_event);
+ qemu_savevm_state_netpass(s->to_dst_file);
+ s->netpass_state_sent = true;
+ qemu_put_byte(s->to_dst_file, QEMU_VM_EOF);
+ qemu_fflush(s->to_dst_file);
+ }
+
if (close_return_path_on_source(s)) {
goto fail;
}
@@ -3251,6 +3284,11 @@ static MigIterateState migration_iteration_run(MigrationState *s)
migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_DEVICE,
MIGRATION_STATUS_POSTCOPY_ACTIVE);
}
+
+ if (s->dest_vm_started && migrate_netpass() && !s->netpass_state_sent) {
+ qemu_savevm_state_netpass(s->to_dst_file);
+ s->netpass_state_sent = true;
+ }
} else {
/*
* Exact pending reporting is only needed for precopy. Taking RAM
@@ -3774,6 +3812,13 @@ void migration_start_outgoing(MigrationState *s)
s->expected_downtime = migrate_downtime_limit();
+ if (migrate_netpass()) {
+ ret = migration_netpass_setup(&local_err);
+ if (ret < 0) {
+ goto fail;
+ }
+ }
+
if (resume) {
/* This is a resumed migration */
rate_limit = migrate_max_postcopy_bandwidth();
diff --git a/migration/migration.h b/migration/migration.h
index a3fab4f27e..a0d9560254 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -530,6 +530,8 @@ struct MigrationState {
bool send_vm_started;
bool dest_vm_started;
QemuEvent dest_vm_started_event;
+
+ bool netpass_state_sent;
};
void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
diff --git a/migration/netpass.c b/migration/netpass.c
new file mode 100644
index 0000000000..92b2522c83
--- /dev/null
+++ b/migration/netpass.c
@@ -0,0 +1,246 @@
+#include "qemu/osdep.h"
+#include "netpass.h"
+
+#include "migration/migration.h"
+#include "migration/vmstate.h"
+#include "net/queue.h"
+#include "net/filter.h"
+#include "net/net.h"
+#include "net/vhost_net.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/typedefs.h"
+#include "qom/object.h"
+#include "trace.h"
+
+struct NetPassState {
+ NetFilterState parent_obj;
+ bool active;
+ size_t packet_count;
+ uint32_t qlength;
+ uint32_t qcapacity;
+ uint8_t *qbuffer;
+ SocketReadState rs;
+ QTAILQ_ENTRY(NetPassState) next;
+};
+
+static void netpass_queue_clear(NetPassState *s)
+{
+ g_free(s->qbuffer);
+ s->qbuffer = NULL;
+ s->qcapacity = 0;
+ s->qlength = 0;
+ s->packet_count = 0;
+}
+
+OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(NetPassState, filter_netpass,
+ FILTER_NETPASS, NETFILTER,
+ { TYPE_VMSTATE_IF }, { } )
+
+static bool netpass_vmstate_pre_save(void *opaque, Error **errp)
+{
+ NetPassState *s = opaque;
+ s->active = false;
+ return true;
+}
+
+static int netpass_vmstate_post_save(void *opaque)
+{
+ NetPassState *s = opaque;
+ trace_migration_netpass_passed_packet_count(NETFILTER(s)->netdev_id, s->packet_count);
+ netpass_queue_clear(s);
+ return 0;
+}
+
+static void netpass_vmstate_post_load_bh(void *opaque)
+{
+ NetPassState *s = opaque;
+
+ int ret = net_fill_rstate(&s->rs, s->qbuffer, s->qlength);
+ if (ret == -1) {
+ warn_report("migration: Failed to fill netpass rstate during load");
+ }
+ trace_migration_netpass_received_packet_count(NETFILTER(s)->netdev_id, s->packet_count);
+ netpass_queue_clear(s);
+}
+
+static bool netpass_vmstate_post_load(void *opaque, int version_id, Error **errp)
+{
+ /*
+ * Schedule on the main thread in case this function is running on the
+ * postcopy listen thread and there is a fault during packet injection.
+ */
+ migration_bh_schedule(netpass_vmstate_post_load_bh, opaque);
+ return true;
+}
+
+static char *filter_netpass_vmstate_if_get_id(VMStateIf *obj)
+{
+ NetFilterState *nf = NETFILTER(obj);
+ return g_strconcat("filter-netpass/", nf->netdev_id, NULL);
+}
+
+static const VMStateDescription vmstate_netpass = {
+ .name = "filter-netpass",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .phase = VMS_PHASE_NETPASS,
+ .fields = (const VMStateField[]) {
+ VMSTATE_UINT32(qlength, NetPassState),
+ VMSTATE_UINT32(qcapacity, NetPassState),
+ VMSTATE_VBUFFER_ALLOC_UINT32(qbuffer, NetPassState, 0, NULL, qcapacity),
+ VMSTATE_END_OF_LIST(),
+ },
+ .pre_save_errp = netpass_vmstate_pre_save,
+ .post_save = netpass_vmstate_post_save,
+ .post_load_errp = netpass_vmstate_post_load,
+};
+
+static QTAILQ_HEAD(, NetPassState) filters = QTAILQ_HEAD_INITIALIZER(filters);
+
+static void netpass_rs_finalize(SocketReadState *rs)
+{
+ NetPassState *s = container_of(rs, NetPassState, rs);
+ NetFilterState *nf = NETFILTER(s);
+
+ struct iovec iov = {
+ .iov_len = rs->packet_len,
+ .iov_base = rs->buf,
+ };
+ qemu_netfilter_pass_to_next(nf->netdev, 0, &iov, 1, nf);
+ s->packet_count++;
+}
+
+static void filter_netpass_setup(NetFilterState *nf, Error **errp)
+{
+ NetPassState *s = FILTER_NETPASS(nf);
+
+ s->active = false;
+ s->qbuffer = NULL;
+ s->qcapacity = 0;
+ s->qlength = 0;
+ s->packet_count = 0;
+ net_socket_rs_init(&s->rs, netpass_rs_finalize, true);
+}
+
+static void filter_netpass_cleanup(NetFilterState *nf)
+{
+ NetPassState *s = FILTER_NETPASS(nf);
+
+ s->active = false;
+ netpass_queue_clear(s);
+ if (nf->netdev) {
+ qemu_set_netpass_enabled(nf->netdev, false);
+ }
+}
+
+static ssize_t filter_netpass_receive_iov(NetFilterState *nf,
+ NetClientState *sender,
+ unsigned flags,
+ const struct iovec *iov,
+ int iovcnt,
+ NetPacketSent *sent_cb)
+{
+ NetPassState *s = FILTER_NETPASS(nf);
+
+ if (!s->active) {
+ return 0;
+ }
+
+ uint32_t total_size = iov_size(iov, iovcnt);
+ size_t req_cap = sizeof(uint32_t) + sizeof(uint32_t) + total_size;
+ if (s->qcapacity - s->qlength < req_cap) {
+ size_t new_capacity = s->qcapacity;
+ while (new_capacity - s->qlength < req_cap) {
+ new_capacity += 4096;
+ }
+ s->qbuffer = g_realloc(s->qbuffer, new_capacity);
+ s->qcapacity = new_capacity;
+ }
+ uint32_t total_size_be = htonl(total_size);
+ memcpy(&s->qbuffer[s->qlength], &total_size_be, sizeof(uint32_t));
+ s->qlength += sizeof(uint32_t);
+ uint32_t vnet_hdr_len_be = htonl(sender->vnet_hdr_len);
+ memcpy(&s->qbuffer[s->qlength], &vnet_hdr_len_be, sizeof(uint32_t));
+ s->qlength += sizeof(uint32_t);
+ iov_to_buf_full(iov, iovcnt, 0, &s->qbuffer[s->qlength], total_size);
+ s->qlength += total_size;
+ s->packet_count++;
+
+ return 0;
+}
+
+static void filter_netpass_class_init(ObjectClass *oc, const void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+ VMStateIfClass *vc = VMSTATE_IF_CLASS(oc);
+
+ nfc->setup = filter_netpass_setup;
+ nfc->cleanup = filter_netpass_cleanup;
+ nfc->receive_iov = filter_netpass_receive_iov;
+
+ vc->get_id = filter_netpass_vmstate_if_get_id;
+}
+
+static void filter_netpass_init(Object *obj)
+{
+}
+
+static void filter_netpass_finalize(Object *obj)
+{
+ NetPassState *s = FILTER_NETPASS(obj);
+ (void)s;
+}
+
+int migration_netpass_setup(Error **errp)
+{
+ NetClientState *nc;
+
+ QTAILQ_FOREACH(nc, &net_clients, next) {
+ if (!nc->is_netdev) {
+ continue;
+ }
+ if (get_vhost_net(nc)) {
+ warn_report("migration: netpass is not supported with vhost=on");
+ continue;
+ }
+ g_autofree char *filter_id = g_strconcat("netpass-", nc->name, NULL);
+ Object *obj = object_new_with_props(TYPE_FILTER_NETPASS,
+ object_get_objects_root(),
+ filter_id, errp,
+ "netdev", nc->name,
+ "queue", "tx",
+ NULL);
+ if (!obj) {
+ error_prepend(errp, "Failed to set up migration netpass: ");
+ return -1;
+ }
+ trace_migration_netpass_setup_created_filter(nc->name);
+ object_ref(obj);
+ QTAILQ_INSERT_TAIL(&filters, FILTER_NETPASS(obj), next);
+ vmstate_register(VMSTATE_IF(obj), VMSTATE_INSTANCE_ID_ANY,
+ &vmstate_netpass, obj);
+ }
+ return 0;
+}
+
+void migration_netpass_activate(void)
+{
+ NetPassState *s;
+ QTAILQ_FOREACH(s, &filters, next) {
+ s->packet_count = 0;
+ s->active = true;
+ qemu_set_netpass_enabled(NETFILTER(s)->netdev, true);
+ }
+}
+
+void migration_netpass_cleanup(void)
+{
+ NetPassState *s, *ns;
+ QTAILQ_FOREACH_SAFE(s, &filters, next, ns) {
+ QTAILQ_REMOVE(&filters, s, next);
+ vmstate_unregister(VMSTATE_IF(s), &vmstate_netpass, s);
+ object_unref(s);
+ }
+}
diff --git a/migration/netpass.h b/migration/netpass.h
new file mode 100644
index 0000000000..8618cf4c73
--- /dev/null
+++ b/migration/netpass.h
@@ -0,0 +1,14 @@
+#ifndef QEMU_MIGRATION_NETPASS_H
+#define QEMU_MIGRATION_NETPASS_H
+
+#include "qemu/typedefs.h"
+#include "qom/object.h"
+
+#define TYPE_FILTER_NETPASS "filter-netpass"
+OBJECT_DECLARE_SIMPLE_TYPE(NetPassState, FILTER_NETPASS)
+
+int migration_netpass_setup(Error **errp);
+void migration_netpass_activate(void);
+void migration_netpass_cleanup(void);
+
+#endif
diff --git a/migration/options.c b/migration/options.c
index a5a233183b..e6e2d441b0 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -211,6 +211,7 @@ const Property migration_properties[] = {
DEFINE_PROP_MIG_CAP("mapped-ram", MIGRATION_CAPABILITY_MAPPED_RAM),
DEFINE_PROP_MIG_CAP("x-ignore-shared",
MIGRATION_CAPABILITY_X_IGNORE_SHARED),
+ DEFINE_PROP_MIG_CAP("netpass", MIGRATION_CAPABILITY_NETPASS),
};
const size_t migration_properties_count = ARRAY_SIZE(migration_properties);
@@ -442,6 +443,13 @@ bool migrate_send_vm_started(void)
return s->send_vm_started;
}
+bool migrate_netpass(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ return s->capabilities[MIGRATION_CAPABILITY_NETPASS];
+}
+
/* pseudo capabilities */
bool migrate_multifd_flush_after_each_section(void)
@@ -723,6 +731,19 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
}
}
+ if (new_caps[MIGRATION_CAPABILITY_NETPASS]) {
+ if (!new_caps[MIGRATION_CAPABILITY_RETURN_PATH]) {
+ error_setg(errp, "Capability 'netpass' requires capability "
+ "'return-path'");
+ return false;
+ }
+ if (!migrate_send_vm_started()) {
+ error_setg(errp, "Capability 'netpass' requires support for VM_STARTED "
+ "return-path message");
+ return false;
+ }
+ }
+
/*
* On destination side, check the cases that capability is being set
* after incoming thread has started.
diff --git a/migration/options.h b/migration/options.h
index 5fdc8fc6fe..151eaef86c 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -43,6 +43,7 @@ bool migrate_validate_uuid(void);
bool migrate_xbzrle(void);
bool migrate_zero_copy_send(void);
bool migrate_send_vm_started(void);
+bool migrate_netpass(void);
/*
* pseudo capabilities
diff --git a/migration/savevm.c b/migration/savevm.c
index 78eb1d6165..b930f27fa9 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -279,6 +279,7 @@ static bool should_validate_capability(int capability)
switch (capability) {
case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
case MIGRATION_CAPABILITY_MAPPED_RAM:
+ case MIGRATION_CAPABILITY_NETPASS:
return true;
default:
return false;
@@ -1731,6 +1732,29 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
return qemu_fflush(f);
}
+void qemu_savevm_state_netpass(QEMUFile *f)
+{
+ MigrationState *ms = migrate_get_current();
+ JSONWriter *vmdesc = ms->vmdesc;
+ SaveStateEntry *se;
+ Error *local_err = NULL;
+ int ret = 0;
+
+ trace_savevm_state_netpass_begin();
+ QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+ if (!se->vmsd || se->vmsd->phase != VMS_PHASE_NETPASS) {
+ continue;
+ }
+ ret = vmstate_save(f, se, vmdesc, &local_err);
+ if (ret) {
+ warn_report_err(local_err);
+ qemu_file_clear_error(f);
+ break;
+ }
+ }
+ trace_savevm_state_netpass_end(ret);
+}
+
/* Give an estimate of the amount left to be transferred,
* the result is split into the amount for units that can and
* for units that can't do postcopy.
@@ -3148,6 +3172,19 @@ int qemu_load_device_state(QEMUFile *f, Error **errp)
return 0;
}
+void qemu_loadvm_state_netpass(QEMUFile *f, MigrationIncomingState *mis)
+{
+ Error *local_errp = NULL;
+ trace_loadvm_state_netpass_begin();
+ int ret = qemu_loadvm_state_main(f, mis, &local_errp);
+ trace_loadvm_state_netpass_end(ret);
+ if (ret < 0) {
+ warn_reportf_err(local_errp,
+ "Error while loading netpass data, this error will be ignored");
+ qemu_file_clear_error(f);
+ }
+}
+
int qemu_loadvm_approve_switchover(void)
{
MigrationIncomingState *mis = migration_incoming_get_current();
diff --git a/migration/savevm.h b/migration/savevm.h
index 125a2507b7..53220c40cf 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -42,6 +42,7 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
void qemu_savevm_state_cleanup(void);
void qemu_savevm_state_complete_postcopy(QEMUFile *f);
int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only);
+void qemu_savevm_state_netpass(QEMUFile *f);
void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
uint64_t *can_postcopy);
void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
@@ -71,6 +72,7 @@ void qemu_loadvm_state_cleanup(MigrationIncomingState *mis);
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis,
Error **errp);
int qemu_load_device_state(QEMUFile *f, Error **errp);
+void qemu_loadvm_state_netpass(QEMUFile *f, MigrationIncomingState *mis);
int qemu_loadvm_approve_switchover(void);
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
bool in_postcopy);
diff --git a/migration/trace-events b/migration/trace-events
index 91d7506634..eb25944d1b 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -10,6 +10,8 @@ qemu_savevm_send_packaged(void) ""
loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
loadvm_state_setup(void) ""
loadvm_state_cleanup(void) ""
+loadvm_state_netpass_begin(void) ""
+loadvm_state_netpass_end(int ret) "ret=%d"
loadvm_handle_cmd_packaged(unsigned int length) "%u"
loadvm_handle_cmd_packaged_main(int ret) "%d"
loadvm_handle_cmd_packaged_received(int ret) "%d"
@@ -45,6 +47,8 @@ savevm_state_resume_prepare(void) ""
savevm_state_header(void) ""
savevm_state_iterate(void) ""
savevm_state_cleanup(void) ""
+savevm_state_netpass_begin(void) ""
+savevm_state_netpass_end(int ret) "ret=%d"
vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_downtime_save(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64
@@ -401,3 +405,8 @@ cpu_throttle_dirty_sync(void) ""
# block-active.c
migration_block_activation(const char *name) "%s"
+
+# netpass.c
+migration_netpass_setup_created_filter(const char *netdev) "netdev=%s"
+migration_netpass_passed_packet_count(const char *netdev, size_t count) "netdev=%s count=%zu"
+migration_netpass_received_packet_count(const char *netdev, size_t count) "netdev=%s count=%zu"
diff --git a/net/net.c b/net/net.c
index a176936f9b..81540fefc1 100644
--- a/net/net.c
+++ b/net/net.c
@@ -158,6 +158,14 @@ void qemu_set_info_str(NetClientState *nc, const char *fmt, ...)
va_end(ap);
}
+void qemu_set_netpass_enabled(NetClientState *nc, bool enabled)
+{
+ nc->netpass_enabled = enabled;
+ if (nc->netpass_enabled_notify) {
+ nc->netpass_enabled_notify(nc, nc->netpass_enabled_notify_opaque);
+ }
+}
+
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6])
{
qemu_set_info_str(nc, "model=%s,macaddr=%02x:%02x:%02x:%02x:%02x:%02x",
@@ -287,6 +295,9 @@ static void qemu_net_client_setup(NetClientState *nc,
nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc);
nc->destructor = destructor;
nc->is_datapath = is_datapath;
+ nc->netpass_enabled = false;
+ nc->netpass_enabled_notify = NULL;
+ nc->netpass_enabled_notify_opaque = NULL;
QTAILQ_INIT(&nc->filters);
}
diff --git a/net/tap.c b/net/tap.c
index 8d7ab6ba6f..dcc03a3f03 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -109,7 +109,8 @@ static char *tap_parse_script(const char *script_arg, const char *default_path)
static void tap_update_fd_handler(TAPState *s)
{
qemu_set_fd_handler(s->fd,
- s->read_poll && s->enabled ? tap_send : NULL,
+ (s->read_poll || s->nc.netpass_enabled) && s->enabled ?
+ tap_send : NULL,
s->write_poll && s->enabled ? tap_writable : NULL,
s);
}
@@ -412,6 +413,11 @@ static NetClientInfo net_tap_info = {
.get_vhost_net = tap_get_vhost_net,
};
+static void tap_netpass_enabled_notify(NetClientState *nc, void *opaque)
+{
+ tap_update_fd_handler(opaque);
+}
+
static TAPState *net_tap_fd_init(NetClientState *peer,
const char *model,
const char *name,
@@ -444,6 +450,9 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
tap_read_poll(s, true);
s->vhost_net = NULL;
+ nc->netpass_enabled_notify = &tap_netpass_enabled_notify;
+ nc->netpass_enabled_notify_opaque = s;
+
return s;
}
diff --git a/qapi/migration.json b/qapi/migration.json
index f925e5541b..d637b22c80 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -520,6 +520,11 @@
# each RAM page. Requires a migration URI that supports seeking,
# such as a file. (since 9.0)
#
+# @netpass: Collect packets received by network backends after the
+#     source VM is paused and send them to the destination once it
+#     resumes. This (almost) completely eliminates packet loss caused
+#     by switchover. (since 11.0)
+#
# Features:
#
# @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -536,7 +541,7 @@
{ 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
'validate-uuid', 'background-snapshot',
'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
- 'dirty-limit', 'mapped-ram'] }
+ 'dirty-limit', 'mapped-ram', 'netpass'] }
##
# @MigrationCapabilityStatus:
--
2.52.0
On Tue, Jan 27, 2026 at 10:04 PM Juraj Marcin <jmarcin@redhat.com> wrote:
>
> [...]
Adding Cindy, Eugenio and Chen.
I think we can simply reuse the existing filters:
redirector: which can redirect traffic from the source to the
destination via a chardev
buffer: which can hold the packets until the destination is resumed
And let libvirt install/uninstall those filters at the correct time.
Which means:
On the source: there would be a redirector that can be enabled when the
VM is paused, and it redirects the traffic to a socket/chardev.
On the destination: there would be a redirector as well as the buffer;
the redirector receives packets from the socket and sends them to the
buffer, and the buffer holds those packets until the VM on the
destination is resumed.
The current filters need some tweaks (e.g. letting the redirector
filter work while the VM is paused). The advantages of this are:
1) reuse the existing filters
2) no need to care about vhost support on the source, as vhost is
disabled; for vDPA we can reuse the shadow virtqueue
3) for the destination we can install a redirector to a packet socket
to let vhost work, like: socket -> redirector -> buffer -> redirector
-> packet socket (see the sketch below)
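A rough sketch of how this chain might be wired on the command line,
assuming the tweaks above; the chardev wiring, the IDs, and the
buffer-release step are illustrative guesses, not behavior the current
filters already provide (today filter-buffer only releases on a timer
interval):

    # source: mirror guest tx traffic into a socket once the VM pauses
    -chardev socket,id=fwd0,host=DST,port=4444,server=off
    -object filter-redirector,id=fr0,netdev=hn0,queue=tx,outdev=fwd0,status=off

    # destination: socket -> redirector -> buffer -> guest
    -chardev socket,id=fwd0,port=4444,server=on,wait=off
    -object filter-redirector,id=fr0,netdev=hn0,queue=rx,indev=fwd0
    -object filter-buffer,id=fb0,netdev=hn0,queue=rx,interval=100000

The management layer would then flip status=on on the source redirector
at pause time and drop the destination buffer once the VM resumes.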
Thanks
Hi Jason,
On 2026-01-28 10:55, Jason Wang wrote:
> On Tue, Jan 27, 2026 at 10:04 PM Juraj Marcin <jmarcin@redhat.com> wrote:
> > [...]
>
> The current filters need some tweaks (e.g. letting the redirector
> filter work while the VM is paused). The advantages of this are:
I first tested the idea of forwarding with the existing filters, and it
does work with the mentioned tweaks; however, it requires an additional
channel between the chardevs attached to the filters.
In my opinion it is better to reuse the already existing migration
channel, so no changes are necessary in higher layers. Furthermore, by
implementing this directly in QEMU, the feature can be used anywhere,
even when libvirt is not used.
>
> 1) reuse the existing filters
> 2) no need to care about vhost support on the source, as vhost is
> disabled; for vDPA we can reuse the shadow virtqueue
Can you elaborate more on vhost being disabled? IIUC, QEMU net filters
don't support vhost=on, including the redirector filter.
> 3) for the destination we can install a redirector to a packet socket
> to let vhost work, like: socket -> redirector -> buffer -> redirector
> -> packet socket
>
> Thanks
>
On Wed, Jan 28, 2026 at 9:49 PM Juraj Marcin <jmarcin@redhat.com> wrote:
>
> [...]
> > The current filters need some tweaks (e.g. letting the redirector
> > filter work while the VM is paused). The advantages of this are:
>
> I first tested the idea of forwarding with the existing filters, and
> it does work with the mentioned tweaks; however, it requires an
> additional channel between the chardevs attached to the filters.
It requires some changes in the redirector. One of the main, though
trivial, changes is to make it work while the VM is paused.
>
> In my opinion it is better to reuse the already existing migration
> channel, so no changes are necessary in higher layers. Furthermore, by
> implementing this directly in QEMU, the feature can be used anywhere,
> even when libvirt is not used.
This requires more thought; leaving the policy to the upper layers may
give us flexibility.
>
> >
> > 1) reuse the existing filters
> > 2) no need to care about vhost support on the source, as vhost is
> > disabled; for vDPA we can reuse the shadow virtqueue
>
> Can you elaborate more on vhost being disabled? IIUC, QEMU net filters
> don't support vhost=on, including the redirector filter.
We only need the filter to work while the VM is paused, and in that
case vhost is disabled. We can use a packet socket to read packets
from, or inject packets into, the tap device.
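As a minimal sketch of the injection half (plain Linux API, not part of
this patch; the interface name, privileges, and error handling are all
simplified assumptions), one frame could be pushed into a tap device
through an AF_PACKET socket like this:

    #include <linux/if_ether.h>
    #include <linux/if_packet.h>
    #include <net/if.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Send one raw Ethernet frame out through the named interface. */
    static int inject_frame(const char *ifname, const void *frame, size_t len)
    {
        /* Requires CAP_NET_RAW. */
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        if (fd < 0) {
            return -1;
        }
        struct sockaddr_ll sll = {
            .sll_family = AF_PACKET,
            .sll_protocol = htons(ETH_P_ALL),
            .sll_ifindex = if_nametoindex(ifname),
        };
        ssize_t n = sendto(fd, frame, len, 0,
                           (struct sockaddr *)&sll, sizeof(sll));
        close(fd);
        return n < 0 ? -1 : 0;
    }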
Thanks
On Thu, Jan 29, 2026 at 9:06 AM Jason Wang <jasowang@redhat.com> wrote:
>
> [...]
> > I first tested the idea of forwarding with the existing filters,
> > and it does work with the mentioned tweaks; however, it requires an
> > additional channel between the chardevs attached to the filters.
>
> It requires some changes in the redirector. One of the main, though
> trivial, changes is to make it work while the VM is paused.
Actually, the current netfilter doesn't depend on the running or paused
state of the virtual machine; it just handles packets on the QEMU side.
>
> >
> > In my opinion it is better to reuse the already existing migration
> > channel, so no changes are necessary in higher layers. Furthermore,
> > by implementing this directly in QEMU, the feature can be used
> > anywhere, even when libvirt is not used.
>
> This requires more thought; leaving the policy to the upper layers
> may give us flexibility.
Agreed, the use cases for this series are very similar to the COLO
project:
https://wiki.qemu.org/Features/COLO
We handled the network-related issues there by introducing the QEMU
network filters. It's even possible to transparently modify TCP packet
headers without the virtual machine being aware of it:
https://github.com/qemu/qemu/blob/master/docs/colo-proxy.txt
>
> >
> > >
> > > 1) reuse the existing filters
> > > 2) no need to care about vhost support on the source, as vhost is
> > > disabled; for vDPA we can reuse the shadow virtqueue
> >
> > Can you elaborate more on vhost being disabled? IIUC, QEMU net
> > filters don't support vhost=on, including the redirector filter.
>
> We only need the filter to work while the VM is paused, and in that
> case vhost is disabled. We can use a packet socket to read packets
> from, or inject packets into, the tap device.
Agreed, the COLO proxy uses filter-redirector to inject network packets
into the destination VM.
By the way, COLO is an HA/FT project in QEMU. Based on our previous
practical experience, the VM stop time is handled by the netfilter plus
the TCP/IP retransmission mechanism; for stateless protocols like UDP,
the application handles it by default.
The only difference from COLO is that normal live migration needs a
filter-buffer on the destination side, and the filter-buffer together
with the VM pause time can also cause problems with network
transmission.
Another issue: when the VM pause grows longer, the filter-redirector is
still running, but the path where the source-side QEMU reads packets
from the tap device needs to be double-checked.
The COLO-proxy docs show how the multiple network filters work together.
Thanks
Chen
On Wed, Jan 28, 2026 at 10:55 AM Jason Wang <jasowang@redhat.com> wrote:
>
> [...]
> The current filters need some tweaks (e.g. letting the redirector
> filter work while the VM is paused). The advantages of this are:
>
> 1) reuse the existing filters
> 2) no need to care about vhost support on the source, as vhost is
> disabled; for vDPA we can reuse the shadow virtqueue
> 3) for the destination we can install a redirector to a packet socket
> to let vhost work, like: socket -> redirector -> buffer -> redirector
> -> packet socket
and 4) there's no need to touch the migration code in QEMU.
Thanks
On Wed, Jan 28, 2026 at 10:57 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Jan 28, 2026 at 10:55 AM Jason Wang <jasowang@redhat.com> wrote:
> > [...]
> > The current filters need some tweaks (e.g. letting the redirector
> > filter work while the VM is paused). The advantages of this are:
> >
> > 1) reuse the existing filters
> > 2) no need to care about vhost support on the source, as vhost is
> > disabled; for vDPA we can reuse the shadow virtqueue
> > 3) for the destination we can install a redirector to a packet socket
> > to let vhost work, like: socket -> redirector -> buffer -> redirector
> > -> packet socket
Actually, I've already started working on this to support vhost. It
would be based on the filters working as explained here; coding is
ongoing, and I hope we can have a draft for this soon.
Thanks
Cindy
>
> and 4) there's no need to touch migration code in QEMU (see the CLI
> sketch after this exchange).
>
> Thanks
>
> >
> > Thanks
>
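A rough CLI sketch of the filter-based alternative described above,
reusing the existing COLO-style filter objects. The host name, port,
and the chardev/netdev/object ids are purely illustrative, and holding
or releasing the buffer on demand would still depend on the filter
tweaks mentioned above:

  # source side: mirror incoming (queue=rx) traffic towards the
  # destination; kept disabled until the VM is paused at switchover
  -netdev tap,id=hn0 \
  -chardev socket,id=mirror0,host=dst.example.com,port=9003 \
  -object filter-redirector,id=fr0,netdev=hn0,queue=rx,outdev=mirror0,status=off

  # destination side: receive the mirrored packets and park them in a
  # buffer until the VM has been resumed
  -netdev tap,id=hn0 \
  -chardev socket,id=red0,port=9003,server=on,wait=off \
  -object filter-redirector,id=fr1,netdev=hn0,queue=rx,indev=red0 \
  -object filter-buffer,id=fb0,netdev=hn0,queue=rx,interval=1000,status=on

  # toggled by the management layer (e.g. libvirt) at switchover:
  (qemu) qom-set /objects/fr0 status on    # source: start redirecting
  (qemu) qom-set /objects/fb0 status off   # destination: flush the buffer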
On Tue, Jan 27, 2026 at 03:03:10PM +0100, Juraj Marcin wrote:
> From: Juraj Marcin <jmarcin@redhat.com>
>
> During migration switchover both the source and the destination machines
> are paused (compute downtime). During this period network still routes
> network packets to the source machine, as this is the last place where
> the recipient MAC address has been seen. Once the destination side
> starts and sends network announcement, all subsequent frames are routed
> correctly. However, frames delivered to the source machine are never
> processed and lost. This causes also a network downtime with roughly the
> same duration as compute downtime.
>
> This can cause problems not only for protocols that cannot handle packet
> loss, but can also introduce delays in protocols that can handle them.
>
> To resolve this, this feature instantiates a network filter for each
> network backend present during migration setup on both migration sides.
> On the source side, this filter caches all packets received from the
> backend during switchover. Once the destination machine starts, all
> cached packets are sent through the migration channel and the respective
> filter object on the destination side injects them to the NIC attached
> to the backend.
If the dest QEMU has started, I presume this means that the ARP
announcement has been sent? IOW, the packets being forwarded
over the migration stream are guaranteed to be delivered "out
of order" wrt the sender. Should be safe for TCP, but may have
an impact on other protocols. Though apps should be aware of
that risk in general, they may not frequently encounter it, and
it could still cause service disruption.
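For reference, the announcement in question is the self-announce QEMU
performs when the destination VM resumes; it can also be triggered
manually from the monitor, so the management layer can control when the
network is reprogrammed:

  # destination side, after the guest has been resumed
  (qemu) announce_self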
> diff --git a/qapi/migration.json b/qapi/migration.json
> index f925e5541b..d637b22c80 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -520,6 +520,11 @@
> # each RAM page. Requires a migration URI that supports seeking,
> # such as a file. (since 9.0)
> #
> +# @netpass: Collect packets received by network backends after source
> +# VM is paused and send them to the destination once it resumes.
> +# This (almost) completely eliminates packet loss caused by
> +# switchover. (since 11.0)
Should mention they will be delivered "out of order".
> +#
> # Features:
> #
> # @unstable: Members @x-colo and @x-ignore-shared are experimental.
> @@ -536,7 +541,7 @@
> { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
> 'validate-uuid', 'background-snapshot',
> 'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
> - 'dirty-limit', 'mapped-ram'] }
> + 'dirty-limit', 'mapped-ram', 'netpass'] }
>
> ##
> # @MigrationCapabilityStatus:
> --
> 2.52.0
>
>
With regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
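As context for the QAPI change quoted above: like other migration
capabilities, @netpass would presumably be enabled on both sides before
the migration starts, e.g. from the monitor:

  # on both source and destination, before 'migrate' is issued
  (qemu) migrate_set_capability netpass on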
Hi Daniel,
On 2026-01-27 14:25, Daniel P. Berrangé wrote:
> On Tue, Jan 27, 2026 at 03:03:10PM +0100, Juraj Marcin wrote:
> > From: Juraj Marcin <jmarcin@redhat.com>
> >
> > During migration switchover both the source and the destination machines
> > are paused (compute downtime). During this period network still routes
> > network packets to the source machine, as this is the last place where
> > the recipient MAC address has been seen. Once the destination side
> > starts and sends network announcement, all subsequent frames are routed
> > correctly. However, frames delivered to the source machine are never
> > processed and lost. This causes also a network downtime with roughly the
> > same duration as compute downtime.
> >
> > This can cause problems not only for protocols that cannot handle packet
> > loss, but can also introduce delays in protocols that can handle them.
> >
> > To resolve this, this feature instantiates a network filter for each
> > network backend present during migration setup on both migration sides.
> > On the source side, this filter caches all packets received from the
> > backend during switchover. Once the destination machine starts, all
> > cached packets are sent through the migration channel and the respective
> > filter object on the destination side injects them to the NIC attached
> > to the backend.
>
> If the dest QEMU has started, I presume this means that the ARP
> announcement has been sent ? IOW, the packets being forwarded
> over the migration stream are guaranteed to be delivered "out
> of order" wrt the sender. Should be safe for TCP, but may have
> an impact on other protocols. Though apps should be aware of
> that risk in general, they may not frequently encounter it, and
> it could still cause service disruption
Yes, after the ARP announcement from the destination. Forwarded packets
could get delivered out of order, although it would depend on the
traffic rate; in my testing I encountered out-of-order packets only a
couple of times. As is, this feature allows choosing between the risk
of packet loss and the risk of out-of-order delivery, both of which can
also happen outside the scope of migration.

I could also update it and defer the delivery of new packets on the
destination until the packets from the source side are processed, as
Michael suggested; that should prevent out-of-order delivery.
>
> > diff --git a/qapi/migration.json b/qapi/migration.json
> > index f925e5541b..d637b22c80 100644
> > --- a/qapi/migration.json
> > +++ b/qapi/migration.json
> > @@ -520,6 +520,11 @@
> > # each RAM page. Requires a migration URI that supports seeking,
> > # such as a file. (since 9.0)
> > #
> > +# @netpass: Collect packets received by network backends after source
> > +# VM is paused and send them to the destination once it resumes.
> > +# This (almost) completely eliminates packet loss caused by
> > +# switchover. (since 11.0)
>
> Should mention they will be delivered "out of order".
>
> > +#
> > # Features:
> > #
> > # @unstable: Members @x-colo and @x-ignore-shared are experimental.
> > @@ -536,7 +541,7 @@
> > { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
> > 'validate-uuid', 'background-snapshot',
> > 'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
> > - 'dirty-limit', 'mapped-ram'] }
> > + 'dirty-limit', 'mapped-ram', 'netpass'] }
> >
> > ##
> > # @MigrationCapabilityStatus:
> > --
> > 2.52.0
> >
> >
>
> With regards,
> Daniel
> --
> |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o- https://fstop138.berrange.com :|
> |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>
On Tue, Jan 27, 2026 at 02:25:23PM +0000, Daniel P. Berrangé wrote:
> On Tue, Jan 27, 2026 at 03:03:10PM +0100, Juraj Marcin wrote:
> > From: Juraj Marcin <jmarcin@redhat.com>
> >
> > During migration switchover both the source and the destination machines
> > are paused (compute downtime). During this period network still routes
> > network packets to the source machine, as this is the last place where
> > the recipient MAC address has been seen. Once the destination side
> > starts and sends network announcement, all subsequent frames are routed
> > correctly. However, frames delivered to the source machine are never
> > processed and lost. This causes also a network downtime with roughly the
> > same duration as compute downtime.
> >
> > This can cause problems not only for protocols that cannot handle packet
> > loss, but can also introduce delays in protocols that can handle them.
> >
> > To resolve this, this feature instantiates a network filter for each
> > network backend present during migration setup on both migration sides.
> > On the source side, this filter caches all packets received from the
> > backend during switchover. Once the destination machine starts, all
> > cached packets are sent through the migration channel and the respective
> > filter object on the destination side injects them to the NIC attached
> > to the backend.
>
> If the dest QEMU has started, I presume this means that the ARP
> announcement has been sent ?
For example, with virtio guest announcements, the announcement is sent
by the dest VM. Besides, ARP "announcements" are not necessary to
reprogram the network.

But if you want to absolutely avoid reordering, you can wait until
there's an attempt to transmit something, buffer that something,
process everything from the source (pass it to the VM), then send
whatever the VM wants to send.

Conceivably, QEMU-initiated packets can be handled the same way.
> IOW, the packets being forwarded
> over the migration stream are guaranteed to be delivered "out
> of order" wrt the sender. Should be safe for TCP, but may have
> an impact on other protocols. Though apps should be aware of
> that risk in general, they may not frequently encounter it, and
> it could still cause service disruption
>
> > diff --git a/qapi/migration.json b/qapi/migration.json
> > index f925e5541b..d637b22c80 100644
> > --- a/qapi/migration.json
> > +++ b/qapi/migration.json
> > @@ -520,6 +520,11 @@
> > # each RAM page. Requires a migration URI that supports seeking,
> > # such as a file. (since 9.0)
> > #
> > +# @netpass: Collect packets received by network backends after source
> > +# VM is paused and send them to the destination once it resumes.
> > +# This (almost) completely eliminates packet loss caused by
> > +# switchover. (since 11.0)
>
> Should mention they will be delivered "out of order".
>
> > +#
> > # Features:
> > #
> > # @unstable: Members @x-colo and @x-ignore-shared are experimental.
> > @@ -536,7 +541,7 @@
> > { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
> > 'validate-uuid', 'background-snapshot',
> > 'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
> > - 'dirty-limit', 'mapped-ram'] }
> > + 'dirty-limit', 'mapped-ram', 'netpass'] }
> >
> > ##
> > # @MigrationCapabilityStatus:
> > --
> > 2.52.0
> >
> >
>
> With regards,
> Daniel
> --
> |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o- https://fstop138.berrange.com :|
> |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|