contrib/libvhost-user/libvhost-user.c | 61 +++++++++++++++++++++++---- contrib/libvhost-user/libvhost-user.h | 3 ++ 2 files changed, 56 insertions(+), 8 deletions(-)
From: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
contrib/libvhost-user/libvhost-user.c | 61 +++++++++++++++++++++++----
contrib/libvhost-user/libvhost-user.h | 3 ++
2 files changed, 56 insertions(+), 8 deletions(-)
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index fba291c13db4..550f6416a211 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -136,6 +136,7 @@ vu_request_to_string(unsigned int req)
REQ(VHOST_USER_GET_INFLIGHT_FD),
REQ(VHOST_USER_SET_INFLIGHT_FD),
REQ(VHOST_USER_GPU_SET_SOCKET),
+ REQ(VHOST_USER_VRING_KICK),
REQ(VHOST_USER_MAX),
};
#undef REQ
@@ -920,6 +921,7 @@ static bool
vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
if (index >= dev->max_queues) {
vmsg_close_fds(vmsg);
@@ -927,8 +929,12 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
return false;
}
- if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK ||
- vmsg->fd_num != 1) {
+ if (nofd) {
+ vmsg_close_fds(vmsg);
+ return true;
+ }
+
+ if (vmsg->fd_num != 1) {
vmsg_close_fds(vmsg);
vu_panic(dev, "Invalid fds in request: %d", vmsg->request);
return false;
@@ -1025,6 +1031,7 @@ static bool
vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
@@ -1038,8 +1045,8 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->vq[index].kick_fd = -1;
}
- dev->vq[index].kick_fd = vmsg->fds[0];
- DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index);
+ dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0];
+ DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index);
dev->vq[index].started = true;
if (dev->iface->queue_set_started) {
@@ -1116,6 +1123,7 @@ static bool
vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
{
int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK;
DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
@@ -1128,14 +1136,14 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg)
dev->vq[index].call_fd = -1;
}
- dev->vq[index].call_fd = vmsg->fds[0];
+ dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0];
/* in case of I/O hang after reconnecting */
- if (eventfd_write(vmsg->fds[0], 1)) {
+ if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) {
return -1;
}
- DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index);
+ DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index);
return false;
}
@@ -1169,7 +1177,8 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ |
1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER |
1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD |
- 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
+ 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
+ 1ULL << VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS;
if (have_userfault()) {
features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
@@ -1456,6 +1465,25 @@ vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
return false;
}
+static bool
+vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg)
+{
+ unsigned int index = vmsg->payload.state.index;
+
+ if (index >= dev->max_queues) {
+ vu_panic(dev, "Invalid queue index: %u", index);
+ return false;
+ }
+
+ DPRINT("Got kick message: handler:%p idx:%u\n",
+ dev->vq[index].handler, index);
+ if (dev->vq[index].handler) {
+ dev->vq[index].handler(dev, index);
+ }
+
+ return false;
+}
+
static bool
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
{
@@ -1538,6 +1566,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
return vu_get_inflight_fd(dev, vmsg);
case VHOST_USER_SET_INFLIGHT_FD:
return vu_set_inflight_fd(dev, vmsg);
+ case VHOST_USER_VRING_KICK:
+ return vu_handle_vring_kick(dev, vmsg);
default:
vmsg_close_fds(vmsg);
vu_panic(dev, "Unhandled request: %d", vmsg->request);
@@ -2010,6 +2040,21 @@ vu_queue_notify(VuDev *dev, VuVirtq *vq)
return;
}
+ if (vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS) &&
+ vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
+ VhostUserMsg vmsg = {
+ .request = VHOST_USER_SLAVE_VRING_CALL,
+ .flags = VHOST_USER_VERSION,
+ .size = sizeof(vmsg.payload.state),
+ .payload.state = {
+ .index = vq - dev->vq,
+ },
+ };
+
+ vu_message_write(dev, dev->slave_fd, &vmsg);
+ return;
+ }
+
if (eventfd_write(vq->call_fd, 1) < 0) {
vu_panic(dev, "Error writing eventfd: %s", strerror(errno));
}
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 46b600799b2e..392dea306bb9 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
+ VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS = 13,
VHOST_USER_PROTOCOL_F_MAX
};
@@ -94,6 +95,7 @@ typedef enum VhostUserRequest {
VHOST_USER_GET_INFLIGHT_FD = 31,
VHOST_USER_SET_INFLIGHT_FD = 32,
VHOST_USER_GPU_SET_SOCKET = 33,
+ VHOST_USER_VRING_KICK = 34,
VHOST_USER_MAX
} VhostUserRequest;
@@ -102,6 +104,7 @@ typedef enum VhostUserSlaveRequest {
VHOST_USER_SLAVE_IOTLB_MSG = 1,
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_VRING_CALL = 4,
VHOST_USER_SLAVE_MAX
} VhostUserSlaveRequest;
--
2.23.0
On Fri, Sep 06, 2019 at 03:13:50PM +0300, Johannes Berg wrote: > From: Johannes Berg <johannes.berg@intel.com> > > Signed-off-by: Johannes Berg <johannes.berg@intel.com> a bit more content here about the motivation for this? > --- > contrib/libvhost-user/libvhost-user.c | 61 +++++++++++++++++++++++---- > contrib/libvhost-user/libvhost-user.h | 3 ++ > 2 files changed, 56 insertions(+), 8 deletions(-) > > diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c > index fba291c13db4..550f6416a211 100644 > --- a/contrib/libvhost-user/libvhost-user.c > +++ b/contrib/libvhost-user/libvhost-user.c > @@ -136,6 +136,7 @@ vu_request_to_string(unsigned int req) > REQ(VHOST_USER_GET_INFLIGHT_FD), > REQ(VHOST_USER_SET_INFLIGHT_FD), > REQ(VHOST_USER_GPU_SET_SOCKET), > + REQ(VHOST_USER_VRING_KICK), > REQ(VHOST_USER_MAX), > }; > #undef REQ > @@ -920,6 +921,7 @@ static bool > vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) > { > int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; > > if (index >= dev->max_queues) { > vmsg_close_fds(vmsg); > @@ -927,8 +929,12 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) > return false; > } > > - if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK || > - vmsg->fd_num != 1) { > + if (nofd) { > + vmsg_close_fds(vmsg); > + return true; > + } > + > + if (vmsg->fd_num != 1) { > vmsg_close_fds(vmsg); > vu_panic(dev, "Invalid fds in request: %d", vmsg->request); > return false; > @@ -1025,6 +1031,7 @@ static bool > vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) > { > int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + bool nofd = vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK; > > DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); > > @@ -1038,8 +1045,8 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) > dev->vq[index].kick_fd = -1; > } > > - dev->vq[index].kick_fd = vmsg->fds[0]; > - DPRINT("Got kick_fd: 
%d for vq: %d\n", vmsg->fds[0], index); > + dev->vq[index].kick_fd = nofd ? -1 : vmsg->fds[0]; > + DPRINT("Got kick_fd: %d for vq: %d\n", dev->vq[index].kick_fd, index); > > dev->vq[index].started = true; > if (dev->iface->queue_set_started) { > @@ -1116,6 +1123,7 @@ static bool > vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) > { > int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; > + bool nofd = VHOST_USER_VRING_NOFD_MASK; > > DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); > > @@ -1128,14 +1136,14 @@ vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) > dev->vq[index].call_fd = -1; > } > > - dev->vq[index].call_fd = vmsg->fds[0]; > + dev->vq[index].call_fd = nofd ? -1 : vmsg->fds[0]; > > /* in case of I/O hang after reconnecting */ > - if (eventfd_write(vmsg->fds[0], 1)) { > + if (dev->vq[index].call_fd != -1 && eventfd_write(vmsg->fds[0], 1)) { > return -1; > } > > - DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); > + DPRINT("Got call_fd: %d for vq: %d\n", dev->vq[index].call_fd, index); > > return false; > } > @@ -1169,7 +1177,8 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) > 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | > 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | > 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | > - 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK; > + 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | > + 1ULL << VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS; > > if (have_userfault()) { > features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT; > @@ -1456,6 +1465,25 @@ vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg) > return false; > } > > +static bool > +vu_handle_vring_kick(VuDev *dev, VhostUserMsg *vmsg) > +{ > + unsigned int index = vmsg->payload.state.index; > + > + if (index >= dev->max_queues) { > + vu_panic(dev, "Invalid queue index: %u", index); > + return false; > + } > + > + DPRINT("Got kick message: handler:%p idx:%d\n", > + dev->vq[index].handler, index); > + if (dev->vq[index].handler) { > + 
dev->vq[index].handler(dev, index); > + } > + > + return false; > +} > + > static bool > vu_process_message(VuDev *dev, VhostUserMsg *vmsg) > { > @@ -1538,6 +1566,8 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg) > return vu_get_inflight_fd(dev, vmsg); > case VHOST_USER_SET_INFLIGHT_FD: > return vu_set_inflight_fd(dev, vmsg); > + case VHOST_USER_VRING_KICK: > + return vu_handle_vring_kick(dev, vmsg); > default: > vmsg_close_fds(vmsg); > vu_panic(dev, "Unhandled request: %d", vmsg->request); > @@ -2010,6 +2040,21 @@ vu_queue_notify(VuDev *dev, VuVirtq *vq) > return; > } > > + if (vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS) && > + vu_has_protocol_feature(dev, VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { > + VhostUserMsg vmsg = { > + .request = VHOST_USER_SLAVE_VRING_CALL, > + .flags = VHOST_USER_VERSION, > + .size = sizeof(vmsg.payload.state), > + .payload.state = { > + .index = dev->vq - vq, > + }, > + }; > + > + vu_message_write(dev, dev->slave_fd, &vmsg); > + return; > + } > + > if (eventfd_write(vq->call_fd, 1) < 0) { > vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); > } > diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h > index 46b600799b2e..392dea306bb9 100644 > --- a/contrib/libvhost-user/libvhost-user.h > +++ b/contrib/libvhost-user/libvhost-user.h > @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature { > VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10, > VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11, > VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12, > + VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS = 13, > > VHOST_USER_PROTOCOL_F_MAX > }; > @@ -94,6 +95,7 @@ typedef enum VhostUserRequest { > VHOST_USER_GET_INFLIGHT_FD = 31, > VHOST_USER_SET_INFLIGHT_FD = 32, > VHOST_USER_GPU_SET_SOCKET = 33, > + VHOST_USER_VRING_KICK = 34, > VHOST_USER_MAX > } VhostUserRequest; > > @@ -102,6 +104,7 @@ typedef enum VhostUserSlaveRequest { > VHOST_USER_SLAVE_IOTLB_MSG = 1, > VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, > 
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, > + VHOST_USER_SLAVE_VRING_CALL = 4, > VHOST_USER_SLAVE_MAX > } VhostUserSlaveRequest; > > -- > 2.23.0 >
Hi, On Fri, 2019-09-06 at 10:22 -0400, Michael S. Tsirkin wrote: > On Fri, Sep 06, 2019 at 03:13:50PM +0300, Johannes Berg wrote: > > From: Johannes Berg <johannes.berg@intel.com> > > > > Signed-off-by: Johannes Berg <johannes.berg@intel.com> > > a bit more content here about the motivation for this? Heh, right, definitely needed. I was just sending it out as the corresponding patch to the spec change RFC, where I explained more, so didn't really bother here yet. However, I evidently forgot to CC you on that: https://lore.kernel.org/qemu-devel/20190902121233.13382-1-johannes@sipsolutions.net/ I'm still trying to implement it in User-Mode Linux (UML, ARCH=um), we've submitted patches for virtio/vhost-user support to that, but the simulation-bound IRQ handling is a bit complicated. I need to see how it turns out once I actually get it to work - I've gotten this extension, SLAVE_REQ and REPLY_ACK to work now, so need to "just" integrate with the time-travel mode I already have. In any case, if you think that this is a stupid extension and say you will never accept it, I'll probably just implement a slightly more hackish way, setting vhost-user to polling mode and using out-of-band signalling or so. This seems a bit cleaner though, and if it's properly spec'ed and with sample code and all then it'll possibly be far more useful to others. (**) I think I also forgot to CC you on these two: https://lore.kernel.org/qemu-devel/20190828083401.2342-1-johannes@sipsolutions.net/ https://lore.kernel.org/qemu-devel/20190903192505.10686-1-johannes@sipsolutions.net/ Again, sorry about that. Btw, at least one of these files doesn't even have an entry in the maintainers file. Don't remember if it was the spec though or the libvhost-user stuff. (**) For example, there's the VMSimInt paper (***) that shows a very similar thing with QEMU, but works only with CPU emulation. With UML's time-travel mode made to work over virtio we can do similar things without CPU emulation. 
I suspect it's also possible to emulate the HPET or so in a KVM-based system, but seems far more tricky (to me at least). (***) http://www.ikr.uni-stuttgart.de/Content/Publications/Archive/We_SIMUTools_2014_40209.pdf Thanks, johannes
On Fri, Sep 06, 2019 at 04:48:39PM +0200, Johannes Berg wrote: > Hi, > > On Fri, 2019-09-06 at 10:22 -0400, Michael S. Tsirkin wrote: > > On Fri, Sep 06, 2019 at 03:13:50PM +0300, Johannes Berg wrote: > > > From: Johannes Berg <johannes.berg@intel.com> > > > > > > Signed-off-by: Johannes Berg <johannes.berg@intel.com> > > > > a bit more content here about the motivation for this? > > Heh, right, definitely needed. > > I was just sending it out as the corresponding patch to the spec change > RFC, where I explained more, so didn't really bother here yet. However, > I evidently forgot to CC you on that: > > https://lore.kernel.org/qemu-devel/20190902121233.13382-1-johannes@sipsolutions.net/ Oh. Apparently qemu mailman chose this time to kick me out of list subscription (too many bounces or something?) so I didn't see it. > I'm still trying to implement it in User-Mode Linux (UML, ARCH=um), > we've submitted patches for virtio/vhost-user support to that, but the > simulation-bound IRQ handling is a bit complicated. I need to see how it > turns out once I actually get it to work - I've gotten this extension, > SLAVE_REQ and REPLY_ACK to work now, so need to "just" integrate with > the time-travel mode I already have. > > In any case, if you think that this is a stupid extension and say you > will never accept it, I'll probably just implement a slightly more > hackish way, setting vhost-user to polling mode and using out-of-band > signalling or so. This seems a bit cleaner though, and if it's properly > spec'ed and with sample code and all then it'll possibly be far more > useful to others. (**) What worries me is the load this places on the socket. ATM if socket buffer is full qemu locks up, so we need to be careful not to send too many messages. 
> > > I think I also forgot to CC you on these two: > https://lore.kernel.org/qemu-devel/20190828083401.2342-1-johannes@sipsolutions.net/ > https://lore.kernel.org/qemu-devel/20190903192505.10686-1-johannes@sipsolutions.net/ > > Again, sorry about that. > > Btw, at least one of these files doesn't even have an entry in the > maintainers file. Don't remember if it was the spec though or the > libvhost-user stuff. > > > (**) For example, there's the VMSimInt paper (***) that shows a very > similar thing with QEMU, but works only with CPU emulation. With UML's > time-travel mode made to work over virtio we can do similar things > without CPU emulation. I suspect it's also possible to emulate the HPET > or so in a KVM-based system, but seems far more tricky (to me at least). > > (***) http://www.ikr.uni-stuttgart.de/Content/Publications/Archive/We_SIMUTools_2014_40209.pdf > > Thanks, > johannes
Hi, > Oh. Apparently qemu mailman chose this time to kick me out > of list subscription (too many bounces or something?) > so I didn't see it. D'oh. Well, it's really my mistake, I should've CC'ed you. > What worries me is the load this places on the socket. > ATM if socket buffer is full qemu locks up, so we > need to be careful not to send too many messages. Right, sure. I really don't think you ever want to use this extension in a "normal VM" use case. :-) I think the only use for this extension would be for simulation purposes, and even then only combined with the REPLY_ACK and SLAVE_REQ extensions, i.e. you explicitly *want* your virtual machine to lock up / wait for a response to the KICK command (and respectively, the device to wait for a response to the CALL command). Note that this is basically its sole purpose: ensuring exactly this synchronisation! Yes, it's bad for speed, but it's needed in simulation when time isn't "real". Let me try to explain again, most likely my previous explanation was too long winded. WLOG, I'll focus on the "kick" use case, the "call" is the same, just the other way around. I'm sure you know that the call is asynchronous, i.e. the VM will increment the eventfd counter, and "eventually" it becomes readable to the device. Now the device does something (as fast as it can, presumably) and returns the buffer to the VM. Now, imagine you're running in simulation time, i.e. "time travel" mode. Briefly, this hacks the idle loop of the (UML) VM to just skip forward when there's nothing to do, i.e. if you have a timer firing in 100ms and get to idle, time is immediately incremented by 100ms and the timer fires. For a single VM/device this is already implemented in UML, and while it's already very useful that's only half the story to me. 
Once you have multiple devices and/or VMs, you basically have to keep a "simulation calendar" where each participant (VM/device) can put an entry, and then whenever they become idle they don't immediately move time forward, but instead ask the calendar what's next, and the calendar determines who runs. Now, for these simulation cases, consider vhost-user again. It's absolutely necessary that the calendar is updated all the time, and the asynchronous nature of the call breaks that - the device cannot update the calendar to put an event there to process the call message. With this extension, the device would work in the following way. Assume that the device is idle, and waiting for the simulation calendar to tell it to run. Now, 1) it has an incoming call (message) from VM (which waits for reply) 2) the device will now put a new event on the simulation scheduler for a time slot to process the message 3) return reply to VM 4) device goes back to sleep - this stuff was asynchronously handled outside of the simulation basically. In a sense, the code that just ran isn't considered part of the simulated device, it's just the transport protocol and part of the simulation environment. At this point, the device is still waiting for its calendar event to be triggered, but now it has a new one to process the message. Now, once the VM goes to sleep, the scheduler will check the calendar and presumably tell the device to run, which runs and processes the message. This repeats for as long as the simulation runs, going both ways (or multiple ways if there are more than 2 participants). Now, what if you didn't have this synchronisation, ie. we don't have this extension or we don't have REPLY_ACK or whatnot? In that case, after the step 1 above, the VM will immediately continue running. Let's say it'll wait for a response from the device for a few hundred milliseconds (of now simulated time). 
However, depending on the scheduling, the device has quite likely not yet put the new event on the simulation calendar (that happens in step 2 above). This means that the VM's calendar event to wake it up after a few hundred milliseconds will immediately trigger, and the simulation ends with the driver getting a timeout from the device. So - yes, while I understand your concern, I basically think this is not something anyone will want to use outside of such simulations. OTOH, there are various use cases (I'm doing device simulation, others are doing network simulation) that use such a behaviour, and it might be nice to support it in a more standard way, rather than everyone having their own local hacks for everything, like e.g. the VMSimInt paper(**). But again, like I said, no hard feelings if you think such simulation has no place in upstream vhost-user. (**) I put a copy of their qemu changes on top of 1.6.0 here: https://p.sipsolutions.net/af9a68ded948c07e.txt johannes
On Fri, Sep 06, 2019 at 05:32:02PM +0200, Johannes Berg wrote: > Hi, > > > Oh. Apparently qemu mailman chose this time to kick me out > > of list subscription (too many bounces or something?) > > so I didn't see it. > > D'oh. Well, it's really my mistake, I should've CC'ed you. > > > What worries me is the load this places on the socket. > > ATM if socket buffer is full qemu locks up, so we > > need to be careful not to send too many messages. > > Right, sure. I really don't think you ever want to use this extension in > a "normal VM" use case. :-) > > I think the only use for this extension would be for simulation > purposes, and even then only combined with the REPLY_ACK and SLAVE_REQ > extensions, i.e. you explicitly *want* your virtual machine to lock up / > wait for a response to the KICK command (and respectively, the device to > wait for a response to the CALL command). OK so when combined with these, it's OK I think. Do we want to force this restriction in code maybe then? > Note that this is basically its sole purpose: ensuring exactly this > synchronisation! Yes, it's bad for speed, but it's needed in simulation > when time isn't "real". > > Let me try to explain again, most likely my previous explanation was too > long winded. WLOG, I'll focus on the "kick" use case, the "call" is the > same, just the other way around. I'm sure you know that the call is > asynchronous, i.e. the VM will increment the eventfd counter, and > "eventually" it becomes readable to the device. Now the device does > something (as fast as it can, presumably) and returns the buffer to the > VM. > > Now, imagine you're running in simulation time, i.e. "time travel" mode. > Briefly, this hacks the idle loop of the (UML) VM to just skip forward > when there's nothing to do, i.e. if you have a timer firing in 100ms and > get to idle, time is immediately incremented by 100ms and the timer > fires. 
For a single VM/device this is already implemented in UML, and > while it's already very useful that's only half the story to me. > > Once you have multiple devices and/or VMs, you basically have to keep a > "simulation calendar" where each participant (VM/device) can put an > entry, and then whenever they become idle they don't immediately move > time forward, but instead ask the calendar what's next, and the calendar > determines who runs. > > Now, for these simulation cases, consider vhost-user again. It's > absolutely necessary that the calendar is updated all the time, and the > asynchronous nature of the call breaks that - the device cannot update > the calendar to put an event there to process the call message. > > With this extension, the device would work in the following way. Assume > that the device is idle, and waiting for the simulation calendar to tell > it to run. Now, > > 1) it has an incoming call (message) from VM (which waits for reply) > 2) the device will now put a new event on the simulation scheduler for > a time slot to process the message > 3) return reply to VM > 4) device goes back to sleep - this stuff was asynchronously handled > outside of the simulation basically. > > In a sense, the code that just ran isn't considered part of the > simulated device, it's just the transport protocol and part of the > simulation environment. > > At this point, the device is still waiting for its calendar event to be > triggered, but now it has a new one to process the message. Now, once > the VM goes to sleep, the scheduler will check the calendar and > presumably tell the device to run, which runs and processes the message. > This repeats for as long as the simulation runs, going both ways (or > multiple ways if there are more than 2 participants). > > > Now, what if you didn't have this synchronisation, ie. we don't have > this extension or we don't have REPLY_ACK or whatnot? 
> > In that case, after the step 1 above, the VM will immediately continue > running. Let's say it'll wait for a response from the device for a few > hundred milliseconds (of now simulated time). However, depending on the > scheduling, the device has quite likely not yet put the new event on the > simulation calendar (that happens in step 2 above). This means that the > VM's calendar event to wake it up after a few hundred milliseconds will > immediately trigger, and the simulation ends with the driver getting a > timeout from the device. > > > So - yes, while I understand your concern, I basically think this is not > something anyone will want to use outside of such simulations. OTOH, > there are various use cases (I'm doing device simulation, others are > doing network simulation) that use such a behaviour, and it might be > nice to support it in a more standard way, rather than everyone having > their own local hacks for everything, like e.g. the VMSimInt paper(**). > > > But again, like I said, no hard feelings if you think such simulation > has no place in upstream vhost-user. > > > (**) I put a copy of their qemu changes on top of 1.6.0 here: > https://p.sipsolutions.net/af9a68ded948c07e.txt > > johannes
On Sun, 2019-09-08 at 09:13 -0400, Michael S. Tsirkin wrote: > > I think the only use for this extension would be for simulation > > purposes, and even then only combined with the REPLY_ACK and SLAVE_REQ > > extensions, i.e. you explicitly *want* your virtual machine to lock up / > > wait for a response to the KICK command (and respectively, the device to > > wait for a response to the CALL command). > > OK so when combined with these, it's OK I think. OK. > Do we want to force this restriction in code maybe then? Unlike in this patch, I was planning to not actually advertise VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS, and do that only if the user of the library wants to advertise it, since devices used for simulation also have to be careful about how they use this. However, if I understand correctly we cannot enforce that all of them are used at the same time - we're the device side, so we only advertise our features and the master actually sets the ones it wants to use, no? The only thing we could do is crash if it wants to use this feature without the others, but would that really be helpful? I'm starting to actually get this working, so I'll post new patches in a few days or so. johannes
On Mon, Sep 09, 2019 at 01:35:19PM +0200, Johannes Berg wrote: > On Sun, 2019-09-08 at 09:13 -0400, Michael S. Tsirkin wrote: > > > > I think the only use for this extension would be for simulation > > > purposes, and even then only combined with the REPLY_ACK and SLAVE_REQ > > > extensions, i.e. you explicitly *want* your virtual machine to lock up / > > > wait for a response to the KICK command (and respectively, the device to > > > wait for a response to the CALL command). > > > > OK so when combined with these, it's OK I think. > > OK. > > > Do we want to force this restriction in code maybe then? > > Unlike in this patch, I was planning to not actually advertise > VHOST_USER_PROTOCOL_F_KICK_CALL_MSGS, and do that only if the user of > the library wants to advertise it, since devices used for simulation > also have to be careful about how they use this. > > However, if I understand correctly we cannot enforce that all of them > are used at the same time - we're the device side, so we only advertise > our features and the master actually sets the ones it wants to use, no? > > The only thing we could do is crash if it wants to use this feature > without the others, but would that really be helpful? We can return failure from SET_PROTOCOL_FEATURES. We can also fail all following messages. > I'm starting to actually get this working, so I'll post new patches in a > few days or so. > > johannes
On Mon, 2019-09-09 at 08:41 -0400, Michael S. Tsirkin wrote: > > The only thing we could do is crash if it wants to use this feature > > without the others, but would that really be helpful? > > We can return failure from SET_PROTOCOL_FEATURES. > We can also fail all following messages. Only if we have F_REPLY_ACK, I think? I suppose we could fail a later command that already needs a reply without REPLY_ACK, but that seems difficult to debug? Anyway, if you feel that we should have this as some sort of safeguard I can try to do that; to me it feels rather pointless as libvhost-user is more of a sample implementation than anything else. Unless you also wanted to write into the spec that F_KICK_CALL_MSGS absolutely *requires* F_REPLY_ACK, and add some text that tells a device how to behave when F_KICK_CALL_MSGS is negotiated without F_REPLY_ACK? johannes
On Mon, Sep 09, 2019 at 03:05:08PM +0200, Johannes Berg wrote: > On Mon, 2019-09-09 at 08:41 -0400, Michael S. Tsirkin wrote: > > > > The only thing we could do is crash if it wants to use this feature > > > without the others, but would that really be helpful? > > > > We can return failure from SET_PROTOCOL_FEATURES. > > We can also fail all following messages. > > Only if we have F_REPLY_ACK, I think? > > I suppose we could fail a later command that already needs a reply > without REPLY_ACK, but that seems difficult to debug? Next command is GET_FEATURES. Return an error response from that and device init will fail. > Anyway, if you feel that we should have this as some sort of safeguard I > can try to do that; to me it feels rather pointless as libvhost-user is > more of a sample implementation than anything else. Exactly for this reason :) > Unless you also wanted to write into the spec that F_KICK_CALL_MSGS > absolutely *requires* F_REPLY_ACK, yep > and add some text that tells a device > how to behave when F_KICK_CALL_MSGS is negotiated without F_REPLY_ACK? > > johannes We can document how to behave in case of inconsistent protocol features, yes.
On Mon, 2019-09-09 at 09:48 -0400, Michael S. Tsirkin wrote: > > I suppose we could fail a later command that already needs a reply > > without REPLY_ACK, but that seems difficult to debug? > > Next command is GET_FEATURES. Return an error response from that > and device init will fail. Hmm, what's an error here though? You can only return a value, no? > > Anyway, if you feel that we should have this as some sort of safeguard I > > can try to do that; to me it feels rather pointless as libvhost-user is > > more of a sample implementation than anything else. > > Exactly for this reason :) :) > > Unless you also wanted to write into the spec that F_KICK_CALL_MSGS > > absolutely *requires* F_REPLY_ACK, > > yep Sure, I'm fine with that. > We can document how to behave in case of inconsistent protocol features, > yes. OK. johannes
On Mon, Sep 09, 2019 at 03:50:48PM +0200, Johannes Berg wrote: > On Mon, 2019-09-09 at 09:48 -0400, Michael S. Tsirkin wrote: > > > > I suppose we could fail a later command that already needs a reply > > > without REPLY_ACK, but that seems difficult to debug? > > > > Next command is GET_FEATURES. Return an error response from that > > and device init will fail. > > Hmm, what's an error here though? You can only return a value, no? Either returning id that does not match GET_FEATURES or returning size != 8 bytes will work. Using 0 size payload has precedent in VHOST_USER_GET_CONFIG: vhost-user slave uses zero length of payload to indicate an error to vhost-user master. It's annoying that we don't get an error indicator in that case though. Worth returning e.g. a 4 byte code? And let's generalize for all other messages with response? > > > Anyway, if you feel that we should have this as some sort of safeguard I > > > can try to do that; to me it feels rather pointless as libvhost-user is > > > more of a sample implementation than anything else. > > > > Exactly for this reason :) > > :) > > > > Unless you also wanted to write into the spec that F_KICK_CALL_MSGS > > > absolutely *requires* F_REPLY_ACK, > > > > yep > > Sure, I'm fine with that. > > > We can document how to behave in case of inconsistent protocol features, > > yes. > > OK. > > johannes
On Mon, 2019-09-09 at 10:59 -0400, Michael S. Tsirkin wrote: > > > Next command is GET_FEATURES. Return an error response from that > > > and device init will fail. > > > > Hmm, what's an error here though? You can only return a value, no? > > Either returning id that does not match GET_FEATURES or > returning size != 8 bytes will work. Hmm, right. > Using 0 size payload has precedent in VHOST_USER_GET_CONFIG: > vhost-user slave uses zero length of > payload to indicate an error to vhost-user master. OK, I think zero-len makes sense, that's certainly something that must be checked by the receiver already. > It's annoying that we don't get an error indicator in that case though. > Worth returning e.g. a 4 byte code? That also would need to be checked by the receiver, but 1) Then we have to start defining an error code enum, as there's no natural space. Is that worth it for what's basically a corner case? 2) It'd also be annoying that we now have a 4-byte error code, but with F_REPLY_ACK / need_reply you get an 8-byte status code, which would be incompatible in a way. > And let's generalize for all other messages with response? We could theoretically have a message in the future that wants a 4-byte response, though by convention basically everything uses u64 anyway ... OTOH, 0-byte as error doesn't generalize, VHOST_USER_POSTCOPY_ADVISE has only a file descriptor as slave payload AFAICT... But then possibly in those cases the master also wouldn't check the response size at all, since it never uses it. Qemu does though, and is probably the currently relevant implementation? Our UML implementation doesn't use this. Maybe instead we should just add a "VHOST_USER_REPLY_ERROR" bit (e.g. bit 4 after NEED_REPLY). Qemu in vhost_user_read_header() validates that it received REPLY_MASK | VERSION, so it would reject the message at that point. 
Another possibility would be to define the highest bit of the 'request' field to indicate an error, so for GET_FEATURES we'd return the value 0x80000000 | GET_FEATURES. For even more safety we could make a 4-byte response which is never valid for anything right now, but it feels overly cautious given that we currently do check both the request and flag fields. If you think a response status is needed and want to define an enum with possible values, then I'd probably prefer an 8-byte status since that's the way everything works now, but I don't really care much either way. johannes
On Mon, 2019-09-09 at 17:26 +0200, Johannes Berg wrote: > > Maybe instead we should just add a "VHOST_USER_REPLY_ERROR" bit (e.g. > bit 4 after NEED_REPLY). Qemu in vhost_user_read_header() validates that > it received REPLY_MASK | VERSION, so it would reject the message at that > point. > > Another possibility would be to define the highest bit of the 'request' > field to indicate an error, so for GET_FEATURES we'd return the value > 0x80000000 | GET_FEATURES. However, one way or another, that basically leaves us with three different ways of indicating an error: 1) already defined errors in existing messages - we can't change them since those are handled at runtime now, e.g. VHOST_USER_POSTCOPY_END returns a u64 value with an error status, and current code cannot deal with an error flag in the 'request' or 'flags' field 2) F_REPLY_ACK errors to messages that do not specify a response at all 3) this new way of indicating an error back from messages that specify a response, but the response has no inherent way of returning an error To me that really feels a bit too complex from the spec POV. But I don't see a way to generalize this without another extension, and again the device cannot choose which extensions it supports since the master chooses them and just sets them. Perhaps I really should just stick a "g_assert()" into the code at that point, and have it crash, since it's likely that F_KICK_CALL_MSGS isn't even going to be implemented in qemu (unless it grows simulation support and then it'd all be conditional on some simulation command-line option) And actually ... you got the order wrong: > > Next command is GET_FEATURES. Return an error response from that > > and device init will fail. That's not the case. We *start* with GET_FEATURES, if that includes protocol features then we do GET_PROTOCOL_FEATURES next, and then we get the # of queues next ... Though the whole discussion pretty much applies equivalently to GET_QUEUE_NUM instead of GET_FEATURES. johannes
On Mon, Sep 09, 2019 at 05:34:13PM +0200, Johannes Berg wrote: > On Mon, 2019-09-09 at 17:26 +0200, Johannes Berg wrote: > > > > Maybe instead we should just add a "VHOST_USER_REPLY_ERROR" bit (e.g. > > bit 4 after NEED_REPLY). Qemu in vhost_user_read_header() validates that > > it received REPLY_MASK | VERSION, so it would reject the message at that > > point. > > > > Another possibility would be to define the highest bit of the 'request' > > field to indicate an error, so for GET_FEATURES we'd return the value > > 0x80000000 | GET_FEATURES. > > However, one way or another, that basically leaves us with three > different ways of indicating an error: > > 1) already defined errors in existing messages - we can't change them > since those are handled at runtime now, e.g. VHOST_USER_POSTCOPY_END > returns a u64 value with an error status, and current code cannot > deal with an error flag in the 'request' or 'flags' field > 2) F_REPLY_ACK errors to messages that do not specify a response at all > 3) this new way of indicating an error back from messages that specify > a response, but the response has no inherent way of returning an > error > > To me that really feels a bit too complex from the spec POV. But I don't > see a way to generalize this without another extension, and again the > device cannot choose which extensions it supports since the master > chooses them and just sets them. > > Perhaps I really should just stick a "g_assert()" into the code at that > point, There's the old way: close the socket. This will make reads fail gracefully. If we don't want complexity right now, I'd go with that. > and have it crash, since it's likely that F_KICK_CALL_MSGS isn't > even going to be implemented in qemu (unless it grows simulation support > and then it'd all be conditional on some simulation command-line option) > > > > And actually ... you got the order wrong: > > > > Next command is GET_FEATURES. Return an error response from that > > > and device init will fail. 
> > That's not the case. We *start* with GET_FEATURES, if that includes > protocol features then we do GET_PROTOCOL_FEATURES next, and then we get > the # of queues next ... > > Though the whole discussion pretty much applies equivalently to > GET_QUEUE_NUM instead of GET_FEATURES. > > johannes
On Mon, 2019-09-09 at 11:45 -0400, Michael S. Tsirkin wrote: > On Mon, Sep 09, 2019 at 05:34:13PM +0200, Johannes Berg wrote: > > On Mon, 2019-09-09 at 17:26 +0200, Johannes Berg wrote: > > > Maybe instead we should just add a "VHOST_USER_REPLY_ERROR" bit (e.g. > > > bit 4 after NEED_REPLY). Qemu in vhost_user_read_header() validates that > > > it received REPLY_MASK | VERSION, so it would reject the message at that > > > point. > > > > > > Another possibility would be to define the highest bit of the 'request' > > > field to indicate an error, so for GET_FEATURES we'd return the value > > > 0x80000000 | GET_FEATURES. > > > > However, one way or another, that basically leaves us with three > > different ways of indicating an error: > > > > 1) already defined errors in existing messages - we can't change them > > since those are handled at runtime now, e.g. VHOST_USER_POSTCOPY_END > > returns a u64 value with an error status, and current code cannot > > deal with an error flag in the 'request' or 'flags' field > > 2) F_REPLY_ACK errors to messages that do not specify a response at all > > 3) this new way of indicating an error back from messages that specify > > a response, but the response has no inherent way of returning an > > error > > > > To me that really feels a bit too complex from the spec POV. But I don't > > see a way to generalize this without another extension, and again the > > device cannot choose which extensions it supports since the master > > chooses them and just sets them. > > > > Perhaps I really should just stick a "g_assert()" into the code at that > > point, > > There's the old way: close the socket. > This will make reads fail gracefully. > If we don't want complexity right now, I'd go with that. D'oh, good point. OK, I'll do that. Though it's almost equivalent in libvhost-user to just asserting, since it's mostly set up to just handle a single connection and then quit. Alright, thanks. 
Like I said, I'll send some more patches all around once I get it working; right now I'm crashing in some weird ways that I need to debug :) johannes
On Mon, 2019-09-09 at 15:50 +0200, Johannes Berg wrote: > > We can document how to behave in case of inconsistent protocol features, > > yes. > > OK. Coming back to this, I was just looking at it. How/where would you like to see this done? There isn't really any section that lists and explains the various protocol features, there's only a list. I could add a new section for "Simulation" or something like that that explains it, but then most people would probably skip that and not ever read the text about how you shouldn't implement F_KICK_CALL_MSGS :-) Any thoughts? johannes
On Tue, Sep 10, 2019 at 05:52:36PM +0200, Johannes Berg wrote: > On Mon, 2019-09-09 at 15:50 +0200, Johannes Berg wrote: > > > > We can document how to behave in case of inconsistent protocol features, > > > yes. > > > > OK. > > Coming back to this, I was just looking at it. > > How/where would you like to see this done? > > There isn't really any section that lists and explains the various > protocol features, there's only a list. I could add a new section for > "Simulation" or something like that that explains it, but then most > people would probably skip that and not ever read the text about how you > shouldn't implement F_KICK_CALL_MSGS :-) > > Any thoughts? > > johannes Each feature is documented near the description of the functionality it enables, that can work for this. I don't much like F_KICK_CALL_MSGS as not generic enough but it's not simulation as such: IN_BAND_NOTIFICATIONS? As for how to handle errors, that probably belongs near "Communication". Or maybe add a new "Error handling" section.
> Each feature is documented near the description of the functionality it > enables, that can work for this. Hmm, so you mean I should add a section on in-band notifications, and document things there? > I don't much like F_KICK_CALL_MSGS as > not generic enough but it's not simulation as such: > IN_BAND_NOTIFICATIONS? Sure, sounds good to me, I guess I'm not good at naming things :) > As for how to handle errors, that probably belongs near > "Communication". > > Or maybe add a new "Error handling" section. OK. Btw, I tried this yesterday in libvhost-user, but if I just do vu_panic() it just aborts that message handling and hangs, if I forcefully close the FD then it ends up crashing later ... I'm tempted to go with vu_panic() only for now as that seems to be the normal way to handle unexpected protocol errors there, many such other errors probably should also close the FD? johannes
On Wed, Sep 11, 2019 at 11:20:40AM +0200, Johannes Berg wrote: > > > Each feature is documented near the description of the functionality it > > enables, that can work for this. > > Hmm, so you mean I should add a section on in-band notifications, and > document things there? Like other messages - look at e.g. inflight description as an example. There's also a bunch of work finding all places that deal with kick/call FDs and updating that they only make sense without the new feature flag. > > I don't much like F_KICK_CALL_MSGS as > > not generic enough but it's not simulation as such: > > IN_BAND_NOTIFICATIONS? > > Sure, sounds good to me, I guess I'm not good at naming things :) > > > As for how to handle errors, that probably belongs near > > "Communication". > > > > Or maybe add a new "Error handling" section. > > OK. > > Btw, I tried this yesterday in libvhost-user, but if I just do > vu_panic() it just aborts that message handling and hangs, if I > forcefully close the FD then it ends up crashing later ... > > I'm tempted to go with vu_panic() only for now as that seems to be the > normal way to handle unexpected protocol errors there, many such other > errors probably should also close the FD? > > johannes I'm fine with a TODO for now. -- MST
© 2016 - 2024 Red Hat, Inc.