From: Bobby Eshleman <bobbyeshleman@meta.com>
Add netns support to loopback and vhost. Keep netns disabled for
virtio-vsock, but add necessary changes to comply with common API
updates.
This is the patch in the series where vhost-vsock namespaces actually
come online. Hence, vhost_transport_supports_local_mode() is switched
to return true.
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
Changes in v12:
- change seqpacket_allow() and stream_allow() to return true for
loopback and vhost (Stefano)
Changes in v11:
- reorder with the skb ownership patch for loopback (Stefano)
- toggle vhost_transport_supports_local_mode() to true
Changes in v10:
- Splitting patches complicates the series with meaningless placeholder
values that eventually get replaced anyway, so to avoid that, this
patch combines them into one. Links to previous patches here:
- Link: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-3-852787a37bed@meta.com/
- Link: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-6-852787a37bed@meta.com/
- Link: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-7-852787a37bed@meta.com/
- remove placeholder values (Stefano)
- update comment describing net/net_mode for
virtio_transport_reset_no_sock()
---
drivers/vhost/vsock.c | 56 +++++++++++++++++++++--------
include/linux/virtio_vsock.h | 8 +++--
net/vmw_vsock/virtio_transport.c | 10 ++++--
net/vmw_vsock/virtio_transport_common.c | 63 ++++++++++++++++++++++++---------
net/vmw_vsock/vsock_loopback.c | 19 +++++++---
5 files changed, 118 insertions(+), 38 deletions(-)
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 83937e1d63fa..82cb9ec09e78 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -46,6 +46,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
struct vhost_vsock {
struct vhost_dev dev;
struct vhost_virtqueue vqs[2];
+ struct net *net;
+ netns_tracker ns_tracker;
+
+ /* The ns mode at the time vhost_vsock was created */
+ enum vsock_net_mode net_mode;
/* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
struct hlist_node hash;
@@ -67,7 +72,8 @@ static u32 vhost_transport_get_local_cid(void)
/* Callers that dereference the return value must hold vhost_vsock_mutex or the
* RCU read lock.
*/
-static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
+static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net,
+ enum vsock_net_mode mode)
{
struct vhost_vsock *vsock;
@@ -78,9 +84,10 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
if (other_cid == 0)
continue;
- if (other_cid == guest_cid)
+ if (other_cid == guest_cid &&
+ vsock_net_check_mode(net, mode, vsock->net,
+ vsock->net_mode))
return vsock;
-
}
return NULL;
@@ -269,7 +276,8 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
}
static int
-vhost_transport_send_pkt(struct sk_buff *skb)
+vhost_transport_send_pkt(struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
struct vhost_vsock *vsock;
@@ -278,7 +286,7 @@ vhost_transport_send_pkt(struct sk_buff *skb)
rcu_read_lock();
/* Find the vhost_vsock according to guest context id */
- vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
+ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid), net, net_mode);
if (!vsock) {
rcu_read_unlock();
kfree_skb(skb);
@@ -305,7 +313,8 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
rcu_read_lock();
/* Find the vhost_vsock according to guest context id */
- vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid,
+ sock_net(sk_vsock(vsk)), vsk->net_mode);
if (!vsock)
goto out;
@@ -407,6 +416,12 @@ static bool vhost_transport_msgzerocopy_allow(void)
static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
u32 remote_cid);
+static bool
+vhost_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
+{
+ return true;
+}
+
static struct virtio_transport vhost_transport = {
.transport = {
.module = THIS_MODULE,
@@ -431,7 +446,7 @@ static struct virtio_transport vhost_transport = {
.stream_has_space = virtio_transport_stream_has_space,
.stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
.stream_is_active = virtio_transport_stream_is_active,
- .stream_allow = virtio_transport_stream_allow,
+ .stream_allow = vhost_transport_stream_allow,
.seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
.seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
@@ -464,14 +479,12 @@ static struct virtio_transport vhost_transport = {
static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
u32 remote_cid)
{
+ struct net *net = sock_net(sk_vsock(vsk));
struct vhost_vsock *vsock;
bool seqpacket_allow = false;
- if (vsk->net_mode != VSOCK_NET_MODE_GLOBAL)
- return false;
-
rcu_read_lock();
- vsock = vhost_vsock_get(remote_cid);
+ vsock = vhost_vsock_get(remote_cid, net, vsk->net_mode);
if (vsock)
seqpacket_allow = vsock->seqpacket_allow;
@@ -542,7 +555,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
le64_to_cpu(hdr->dst_cid) ==
vhost_transport_get_local_cid())
- virtio_transport_recv_pkt(&vhost_transport, skb);
+ virtio_transport_recv_pkt(&vhost_transport, skb,
+ vsock->net, vsock->net_mode);
else
kfree_skb(skb);
@@ -659,6 +673,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
{
struct vhost_virtqueue **vqs;
struct vhost_vsock *vsock;
+ struct net *net;
int ret;
/* This struct is large and allocation could fail, fall back to vmalloc
@@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
goto out;
}
+ net = current->nsproxy->net_ns;
+ vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL);
+
+ /* Store the mode of the namespace at the time of creation. If this
+ * namespace later changes from "global" to "local", we want this vsock
+ * to continue operating normally and not suddenly break. For that
+ * reason, we save the mode here and later use it when performing
+ * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()).
+ */
+ vsock->net_mode = vsock_net_mode(net);
+
vsock->guest_cid = 0; /* no CID assigned yet */
vsock->seqpacket_allow = false;
@@ -713,7 +739,8 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
*/
/* If the peer is still valid, no need to reset connection */
- if (vhost_vsock_get(vsk->remote_addr.svm_cid))
+ if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk),
+ vsk->net_mode))
return;
/* If the close timeout is pending, let it expire. This avoids races
@@ -758,6 +785,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
vhost_dev_cleanup(&vsock->dev);
+ put_net_track(vsock->net, &vsock->ns_tracker);
kfree(vsock->dev.vqs);
vhost_vsock_free(vsock);
return 0;
@@ -784,7 +812,7 @@ static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
/* Refuse if CID is already in use */
mutex_lock(&vhost_vsock_mutex);
- other = vhost_vsock_get(guest_cid);
+ other = vhost_vsock_get(guest_cid, vsock->net, vsock->net_mode);
if (other && other != vsock) {
mutex_unlock(&vhost_vsock_mutex);
return -EADDRINUSE;
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 1845e8d4f78d..7ea264dcfff7 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -173,6 +173,8 @@ struct virtio_vsock_pkt_info {
u32 remote_cid, remote_port;
struct vsock_sock *vsk;
struct msghdr *msg;
+ struct net *net;
+ enum vsock_net_mode net_mode;
u32 pkt_len;
u16 type;
u16 op;
@@ -185,7 +187,8 @@ struct virtio_transport {
struct vsock_transport transport;
/* Takes ownership of the packet */
- int (*send_pkt)(struct sk_buff *skb);
+ int (*send_pkt)(struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode);
/* Used in MSG_ZEROCOPY mode. Checks, that provided data
* (number of buffers) could be transmitted with zerocopy
@@ -280,7 +283,8 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
void virtio_transport_destruct(struct vsock_sock *vsk);
void virtio_transport_recv_pkt(struct virtio_transport *t,
- struct sk_buff *skb);
+ struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode);
void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb);
u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index f5123810192d..3ff695740108 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -231,7 +231,8 @@ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struc
}
static int
-virtio_transport_send_pkt(struct sk_buff *skb)
+virtio_transport_send_pkt(struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode)
{
struct virtio_vsock_hdr *hdr;
struct virtio_vsock *vsock;
@@ -665,7 +666,12 @@ static void virtio_transport_rx_work(struct work_struct *work)
virtio_vsock_skb_put(skb, payload_len);
virtio_transport_deliver_tap_pkt(skb);
- virtio_transport_recv_pkt(&virtio_transport, skb);
+
+ /* Force virtio-transport into global mode since it
+ * does not yet support local-mode namespacing.
+ */
+ virtio_transport_recv_pkt(&virtio_transport, skb,
+ NULL, VSOCK_NET_MODE_GLOBAL);
}
} while (!virtqueue_enable_cb(vq));
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index de71e2b3f77e..a818152d8b79 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -413,7 +413,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
virtio_transport_inc_tx_pkt(vvs, skb);
- ret = t_ops->send_pkt(skb);
+ ret = t_ops->send_pkt(skb, info->net, info->net_mode);
if (ret < 0)
break;
@@ -527,6 +527,8 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
+ .net_mode = vsk->net_mode,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1067,6 +1069,8 @@ int virtio_transport_connect(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_REQUEST,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
+ .net_mode = vsk->net_mode,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1082,6 +1086,8 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
(mode & SEND_SHUTDOWN ?
VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
+ .net_mode = vsk->net_mode,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1108,6 +1114,8 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
.msg = msg,
.pkt_len = len,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
+ .net_mode = vsk->net_mode,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1145,6 +1153,8 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
.op = VIRTIO_VSOCK_OP_RST,
.reply = !!skb,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
+ .net_mode = vsk->net_mode,
};
/* Send RST only if the original pkt is not a RST pkt */
@@ -1156,9 +1166,14 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
/* Normally packets are associated with a socket. There may be no socket if an
* attempt was made to connect to a socket that does not exist.
+ *
+ * net and net_mode refer to the namespace of whoever sent the invalid message.
+ * For loopback, this is the namespace of the socket. For vhost, this is the
+ * namespace of the VM (i.e., vhost_vsock).
*/
static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
struct virtio_vsock_pkt_info info = {
@@ -1171,6 +1186,13 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
* sock_net(sk) until the reply skb is freed.
*/
.vsk = vsock_sk(skb->sk),
+
+ /* net or net_mode are not defined here because we pass
+ * net and net_mode directly to t->send_pkt(), instead of
+ * relying on virtio_transport_send_pkt_info() to pass them to
+ * t->send_pkt(). They are not needed by
+ * virtio_transport_alloc_skb().
+ */
};
struct sk_buff *reply;
@@ -1189,7 +1211,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
if (!reply)
return -ENOMEM;
- return t->send_pkt(reply);
+ return t->send_pkt(reply, net, net_mode);
}
/* This function should be called with sk_lock held and SOCK_DONE set */
@@ -1471,6 +1493,8 @@ virtio_transport_send_response(struct vsock_sock *vsk,
.remote_port = le32_to_cpu(hdr->src_port),
.reply = true,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
+ .net_mode = vsk->net_mode,
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1513,12 +1537,14 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
int ret;
if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
+ vsk->net_mode);
return -EINVAL;
}
if (sk_acceptq_is_full(sk)) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
+ vsk->net_mode);
return -ENOMEM;
}
@@ -1526,13 +1552,15 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
* Subsequent enqueues would lead to a memory leak.
*/
if (sk->sk_shutdown == SHUTDOWN_MASK) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
+ vsk->net_mode);
return -ESHUTDOWN;
}
child = vsock_create_connected(sk);
if (!child) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
+ vsk->net_mode);
return -ENOMEM;
}
@@ -1554,7 +1582,8 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
*/
if (ret || vchild->transport != &t->transport) {
release_sock(child);
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
+ vsk->net_mode);
sock_put(child);
return ret;
}
@@ -1582,7 +1611,8 @@ static bool virtio_transport_valid_type(u16 type)
* lock.
*/
void virtio_transport_recv_pkt(struct virtio_transport *t,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
struct sockaddr_vm src, dst;
@@ -1605,24 +1635,25 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
le32_to_cpu(hdr->fwd_cnt));
if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
goto free_pkt;
}
/* The socket must be in connected or bound table
* otherwise send reset back
*/
- sk = vsock_find_connected_socket(&src, &dst);
+ sk = vsock_find_connected_socket_net(&src, &dst, net, net_mode);
if (!sk) {
- sk = vsock_find_bound_socket(&dst);
+ sk = vsock_find_bound_socket_net(&dst, net, net_mode);
if (!sk) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net,
+ net_mode);
goto free_pkt;
}
}
if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
sock_put(sk);
goto free_pkt;
}
@@ -1641,7 +1672,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
*/
if (sock_flag(sk, SOCK_DONE) ||
(sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
release_sock(sk);
sock_put(sk);
goto free_pkt;
@@ -1673,7 +1704,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
kfree_skb(skb);
break;
default:
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
kfree_skb(skb);
break;
}
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index afad27cf533a..aef44d1631c3 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -26,7 +26,8 @@ static u32 vsock_loopback_get_local_cid(void)
return VMADDR_CID_LOCAL;
}
-static int vsock_loopback_send_pkt(struct sk_buff *skb)
+static int vsock_loopback_send_pkt(struct sk_buff *skb, struct net *net,
+ enum vsock_net_mode net_mode)
{
struct vsock_loopback *vsock = &the_vsock_loopback;
int len = skb->len;
@@ -48,6 +49,13 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk,
u32 remote_cid);
+
+static bool vsock_loopback_stream_allow(struct vsock_sock *vsk, u32 cid,
+ u32 port)
+{
+ return true;
+}
+
static bool vsock_loopback_msgzerocopy_allow(void)
{
return true;
@@ -77,7 +85,7 @@ static struct virtio_transport loopback_transport = {
.stream_has_space = virtio_transport_stream_has_space,
.stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
.stream_is_active = virtio_transport_stream_is_active,
- .stream_allow = virtio_transport_stream_allow,
+ .stream_allow = vsock_loopback_stream_allow,
.seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
.seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
@@ -110,7 +118,7 @@ static struct virtio_transport loopback_transport = {
static bool
vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
{
- return vsk->net_mode == VSOCK_NET_MODE_GLOBAL;
+ return true;
}
static void vsock_loopback_work(struct work_struct *work)
@@ -132,7 +140,10 @@ static void vsock_loopback_work(struct work_struct *work)
*/
virtio_transport_consume_skb_sent(skb, false);
virtio_transport_deliver_tap_pkt(skb);
- virtio_transport_recv_pkt(&loopback_transport, skb);
+
+ virtio_transport_recv_pkt(&loopback_transport, skb,
+ sock_net(skb->sk),
+ vsock_sk(skb->sk)->net_mode);
}
}
--
2.47.3
On 11/27/25 8:47 AM, Bobby Eshleman wrote: > @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > goto out; > } > > + net = current->nsproxy->net_ns; > + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > + > + /* Store the mode of the namespace at the time of creation. If this > + * namespace later changes from "global" to "local", we want this vsock > + * to continue operating normally and not suddenly break. For that > + * reason, we save the mode here and later use it when performing > + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > + */ > + vsock->net_mode = vsock_net_mode(net); I'm sorry for the very late feedback. I think that, at the very least, user-space needs a way to query whether the given transport is in local or global mode, as AFAICS there is no way to tell that when socket creation races with a mode change. Also, I'm a bit uneasy with the model implemented here, as a 'local' socket may cross netns boundaries and connect to a 'local' socket in another netns (if I read patch 2/12 correctly). That in turn, AFAICS, breaks the netns isolation. Have you considered instead a slightly different model, where the local/global model is set in stone at netns creation time - like what /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and inter-netns connectivity is explicitly granted by the admin (I guess you will need new transport operations for that)? /P [1] TCP allows using per-netns established socket lookup tables - as opposed to the default global lookup table (even if the match always takes the netns into account, obviously). The mentioned sysctl specifies that configuration for the children namespaces, if any.
On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > On 11/27/25 8:47 AM, Bobby Eshleman wrote: > > @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > > goto out; > > } > > > > + net = current->nsproxy->net_ns; > > + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > > + > > + /* Store the mode of the namespace at the time of creation. If this > > + * namespace later changes from "global" to "local", we want this vsock > > + * to continue operating normally and not suddenly break. For that > > + * reason, we save the mode here and later use it when performing > > + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > > + */ > > + vsock->net_mode = vsock_net_mode(net); > > I'm sorry for the very late feedback. I think that at very least the > user-space needs a way to query if the given transport is in local or > global mode, as AFAICS there is no way to tell that when socket creation > races with mode change. Are you thinking something along the lines of sockopt? > > Also I'm a bit uneasy with the model implemented here, as 'local' socket > may cross netns boundaris and connect to 'local' socket in other netns > (if I read correctly patch 2/12). That in turns AFAICS break the netns > isolation. Local mode sockets are unable to communicate with local mode (and global mode too) sockets that are in other namespaces. The key piece of code for that is vsock_net_check_mode(), where if either modes is local the namespaces must be the same. > > Have you considered instead a slightly different model, where the > local/global model is set in stone at netns creation time - alike what > /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > inter-netns connectivity is explicitly granted by the admin (I guess > you will need new transport operations for that)? 
> > /P > > [1] tcp allows using per-netns established socket lookup tables - as > opposed to the default global lookup table (even if match always takes > in account the netns obviously). The mentioned sysctl specify such > configuration for the children namespaces, if any. > I'll save this discussion if the above doesn't resolve your concerns. Best, Bobby
On 12/2/25 6:56 PM, Bobby Eshleman wrote: > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) >>> goto out; >>> } >>> >>> + net = current->nsproxy->net_ns; >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); >>> + >>> + /* Store the mode of the namespace at the time of creation. If this >>> + * namespace later changes from "global" to "local", we want this vsock >>> + * to continue operating normally and not suddenly break. For that >>> + * reason, we save the mode here and later use it when performing >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). >>> + */ >>> + vsock->net_mode = vsock_net_mode(net); >> >> I'm sorry for the very late feedback. I think that at very least the >> user-space needs a way to query if the given transport is in local or >> global mode, as AFAICS there is no way to tell that when socket creation >> races with mode change. > > Are you thinking something along the lines of sockopt? I'd like to see a way for the user-space to query the socket 'namespace mode'. sockopt could be an option; a possibly better one could be sock_diag. Or you could do both using dumping the info with a shared helper invoked by both code paths, alike what TCP is doing. >> Also I'm a bit uneasy with the model implemented here, as 'local' socket >> may cross netns boundaris and connect to 'local' socket in other netns >> (if I read correctly patch 2/12). That in turns AFAICS break the netns >> isolation. > > Local mode sockets are unable to communicate with local mode (and global > mode too) sockets that are in other namespaces. The key piece of code > for that is vsock_net_check_mode(), where if either modes is local the > namespaces must be the same. 
Sorry, I likely misread the large comment in patch 2: https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ >> Have you considered instead a slightly different model, where the >> local/global model is set in stone at netns creation time - alike what >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and >> inter-netns connectivity is explicitly granted by the admin (I guess >> you will need new transport operations for that)? >> >> /P >> >> [1] tcp allows using per-netns established socket lookup tables - as >> opposed to the default global lookup table (even if match always takes >> in account the netns obviously). The mentioned sysctl specify such >> configuration for the children namespaces, if any. > > I'll save this discussion if the above doesn't resolve your concerns. I still have some concern WRT the dynamic mode change after netns creation. I fear some 'unsolvable' (or very hard to solve) race I can't see now. A tcp_child_ehash_entries-like model will avoid completely the issue, but I understand it would be a significant change over the current status. "Luckily" the merge window is on us and we have some time to discuss. Do you have a specific use-case for the ability to change the netns mode after creation? /P
On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: > On 12/2/25 6:56 PM, Bobby Eshleman wrote: > > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: > >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > >>> goto out; > >>> } > >>> > >>> + net = current->nsproxy->net_ns; > >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > >>> + > >>> + /* Store the mode of the namespace at the time of creation. If this > >>> + * namespace later changes from "global" to "local", we want this vsock > >>> + * to continue operating normally and not suddenly break. For that > >>> + * reason, we save the mode here and later use it when performing > >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > >>> + */ > >>> + vsock->net_mode = vsock_net_mode(net); > >> > >> I'm sorry for the very late feedback. I think that at very least the > >> user-space needs a way to query if the given transport is in local or > >> global mode, as AFAICS there is no way to tell that when socket creation > >> races with mode change. > > > > Are you thinking something along the lines of sockopt? > > I'd like to see a way for the user-space to query the socket 'namespace > mode'. > > sockopt could be an option; a possibly better one could be sock_diag. Or > you could do both using dumping the info with a shared helper invoked by > both code paths, alike what TCP is doing. > >> Also I'm a bit uneasy with the model implemented here, as 'local' socket > >> may cross netns boundaris and connect to 'local' socket in other netns > >> (if I read correctly patch 2/12). That in turns AFAICS break the netns > >> isolation. > > > > Local mode sockets are unable to communicate with local mode (and global > > mode too) sockets that are in other namespaces. 
The key piece of code > > for that is vsock_net_check_mode(), where if either modes is local the > > namespaces must be the same. > > Sorry, I likely misread the large comment in patch 2: > > https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ > > >> Have you considered instead a slightly different model, where the > >> local/global model is set in stone at netns creation time - alike what > >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > >> inter-netns connectivity is explicitly granted by the admin (I guess > >> you will need new transport operations for that)? > >> > >> /P > >> > >> [1] tcp allows using per-netns established socket lookup tables - as > >> opposed to the default global lookup table (even if match always takes > >> in account the netns obviously). The mentioned sysctl specify such > >> configuration for the children namespaces, if any. > > > > I'll save this discussion if the above doesn't resolve your concerns. > I still have some concern WRT the dynamic mode change after netns > creation. I fear some 'unsolvable' (or very hard to solve) race I can't > see now. A tcp_child_ehash_entries-like model will avoid completely the > issue, but I understand it would be a significant change over the > current status. > > "Luckily" the merge window is on us and we have some time to discuss. Do > you have a specific use-case for the ability to change the netns mode > after creation? > > /P I don't think there is a hard requirement that the mode be change-able after creation. Though I'd love to avoid such a big change... or at least leave unchanged as much of what we've already reviewed as possible. 
In the scheme of defining the mode at creation and following the tcp_child_ehash_entries-ish model, what I'm imagining is: - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" - /proc/sys/net/vsock/child_ns_mode is mutable and can be changed any number of times - when a netns is created, the new netns mode is inherited from the creating namespace's child_ns_mode, being assigned using something like: net->vsock.ns_mode = get_net_ns_by_pid(current->pid)->child_ns_mode - /proc/sys/net/vsock/ns_mode reports the current mode, i.e. the value of net->vsock.ns_mode, as "local" or "global" - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and reject writes Does that align with what you have in mind? Stefano, what are your thoughts? Best, Bobby
Hi, On 12/2/25 11:01 PM, Bobby Eshleman wrote: > On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: >> I still have some concern WRT the dynamic mode change after netns >> creation. I fear some 'unsolvable' (or very hard to solve) race I can't >> see now. A tcp_child_ehash_entries-like model will avoid completely the >> issue, but I understand it would be a significant change over the >> current status. >> >> "Luckily" the merge window is on us and we have some time to discuss. Do >> you have a specific use-case for the ability to change the netns mode >> after creation? >> >> /P > > I don't think there is a hard requirement that the mode be change-able > after creation. Though I'd love to avoid such a big change... or at > least leave unchanged as much of what we've already reviewed as > possible. > > In the scheme of defining the mode at creation and following the > tcp_child_ehash_entries-ish model, what I'm imagining is: > - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" > - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any > number of times > > - when a netns is created, the new netns mode is inherited from > child_ns_mode, being assigned using something like: > > net->vsock.ns_mode = > get_net_ns_by_pid(current->pid)->child_ns_mode > > - /proc/sys/net/vsock/ns_mode queries the current mode, returning > "local" or "global", returning value of net->vsock.ns_mode > - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and > reject writes > > Does that align with what you have in mind? Sorry for the latency. This fell off my radar while I was still processing PW before EoY, and afterwards I took a break. Yes, the above aligns with what I suggested, and I think it should solve the possible race-related concerns (but I haven't looked at the RFC). /P
On Wed, Jan 07, 2026 at 10:47:56AM +0100, Paolo Abeni wrote: > Hi, > > On 12/2/25 11:01 PM, Bobby Eshleman wrote: > > On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: > >> I still have some concern WRT the dynamic mode change after netns > >> creation. I fear some 'unsolvable' (or very hard to solve) race I can't > >> see now. A tcp_child_ehash_entries-like model will avoid completely the > >> issue, but I understand it would be a significant change over the > >> current status. > >> > >> "Luckily" the merge window is on us and we have some time to discuss. Do > >> you have a specific use-case for the ability to change the netns mode > >> after creation? > >> > >> /P > > > > I don't think there is a hard requirement that the mode be change-able > > after creation. Though I'd love to avoid such a big change... or at > > least leave unchanged as much of what we've already reviewed as > > possible. > > > > In the scheme of defining the mode at creation and following the > > tcp_child_ehash_entries-ish model, what I'm imagining is: > > - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" > > - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any > > number of times > > > > - when a netns is created, the new netns mode is inherited from > > child_ns_mode, being assigned using something like: > > > > net->vsock.ns_mode = > > get_net_ns_by_pid(current->pid)->child_ns_mode > > > > - /proc/sys/net/vsock/ns_mode queries the current mode, returning > > "local" or "global", returning value of net->vsock.ns_mode > > - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and > > reject writes > > > > Does that align with what you have in mind? > Sorry for the latency. This fell of my radar while I still processed PW > before EoY and afterwards I had some break. > > Yes, the above aligns with what I suggested, and I think it should solve > possible race-related concerns (but I haven't looked at the RFC). 
> > /P > > No worries, understandable! Thanks for the confirmation. Best, Bobby
On Tue, Dec 02, 2025 at 02:01:04PM -0800, Bobby Eshleman wrote: > On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: > > On 12/2/25 6:56 PM, Bobby Eshleman wrote: > > > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > > >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: > > >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > > >>> goto out; > > >>> } > > >>> > > >>> + net = current->nsproxy->net_ns; > > >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > > >>> + > > >>> + /* Store the mode of the namespace at the time of creation. If this > > >>> + * namespace later changes from "global" to "local", we want this vsock > > >>> + * to continue operating normally and not suddenly break. For that > > >>> + * reason, we save the mode here and later use it when performing > > >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > > >>> + */ > > >>> + vsock->net_mode = vsock_net_mode(net); > > >> > > >> I'm sorry for the very late feedback. I think that at very least the > > >> user-space needs a way to query if the given transport is in local or > > >> global mode, as AFAICS there is no way to tell that when socket creation > > >> races with mode change. > > > > > > Are you thinking something along the lines of sockopt? > > > > I'd like to see a way for the user-space to query the socket 'namespace > > mode'. > > > > sockopt could be an option; a possibly better one could be sock_diag. Or > > you could do both using dumping the info with a shared helper invoked by > > both code paths, alike what TCP is doing. > > >> Also I'm a bit uneasy with the model implemented here, as 'local' socket > > >> may cross netns boundaris and connect to 'local' socket in other netns > > >> (if I read correctly patch 2/12). That in turns AFAICS break the netns > > >> isolation. 
> > > > > > Local mode sockets are unable to communicate with local mode (and global > > > mode too) sockets that are in other namespaces. The key piece of code > > > for that is vsock_net_check_mode(), where if either modes is local the > > > namespaces must be the same. > > > > Sorry, I likely misread the large comment in patch 2: > > > > https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ > > > > >> Have you considered instead a slightly different model, where the > > >> local/global model is set in stone at netns creation time - alike what > > >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > > >> inter-netns connectivity is explicitly granted by the admin (I guess > > >> you will need new transport operations for that)? > > >> > > >> /P > > >> > > >> [1] tcp allows using per-netns established socket lookup tables - as > > >> opposed to the default global lookup table (even if match always takes > > >> in account the netns obviously). The mentioned sysctl specify such > > >> configuration for the children namespaces, if any. > > > > > > I'll save this discussion if the above doesn't resolve your concerns. > > I still have some concern WRT the dynamic mode change after netns > > creation. I fear some 'unsolvable' (or very hard to solve) race I can't > > see now. A tcp_child_ehash_entries-like model will avoid completely the > > issue, but I understand it would be a significant change over the > > current status. > > > > "Luckily" the merge window is on us and we have some time to discuss. Do > > you have a specific use-case for the ability to change the netns mode > > after creation? > > > > /P > > I don't think there is a hard requirement that the mode be change-able > after creation. Though I'd love to avoid such a big change... or at > least leave unchanged as much of what we've already reviewed as > possible. 
> > In the scheme of defining the mode at creation and following the > tcp_child_ehash_entries-ish model, what I'm imagining is: > - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" > - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any > number of times > > - when a netns is created, the new netns mode is inherited from > child_ns_mode, being assigned using something like: > > net->vsock.ns_mode = > get_net_ns_by_pid(current->pid)->child_ns_mode > > - /proc/sys/net/vsock/ns_mode queries the current mode, returning > "local" or "global", returning value of net->vsock.ns_mode > - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and > reject writes > > Does that align with what you have in mind? Hey Paolo, I just wanted to sync up on this one. Does the above align with what you envision? Best, Bobby
On Fri, Dec 12, 2025 at 07:26:15AM -0800, Bobby Eshleman wrote: >On Tue, Dec 02, 2025 at 02:01:04PM -0800, Bobby Eshleman wrote: >> On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: >> > On 12/2/25 6:56 PM, Bobby Eshleman wrote: >> > > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: >> > >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: >> > >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) >> > >>> goto out; >> > >>> } >> > >>> >> > >>> + net = current->nsproxy->net_ns; >> > >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); >> > >>> + >> > >>> + /* Store the mode of the namespace at the time of creation. If this >> > >>> + * namespace later changes from "global" to "local", we want this vsock >> > >>> + * to continue operating normally and not suddenly break. For that >> > >>> + * reason, we save the mode here and later use it when performing >> > >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). >> > >>> + */ >> > >>> + vsock->net_mode = vsock_net_mode(net); >> > >> >> > >> I'm sorry for the very late feedback. I think that at very least the >> > >> user-space needs a way to query if the given transport is in local or >> > >> global mode, as AFAICS there is no way to tell that when socket creation >> > >> races with mode change. >> > > >> > > Are you thinking something along the lines of sockopt? >> > >> > I'd like to see a way for the user-space to query the socket 'namespace >> > mode'. >> > >> > sockopt could be an option; a possibly better one could be sock_diag. Or >> > you could do both using dumping the info with a shared helper invoked by >> > both code paths, alike what TCP is doing. >> > >> Also I'm a bit uneasy with the model implemented here, as 'local' socket >> > >> may cross netns boundaris and connect to 'local' socket in other netns >> > >> (if I read correctly patch 2/12). That in turns AFAICS break the netns >> > >> isolation. 
>> > > >> > > Local mode sockets are unable to communicate with local mode (and global >> > > mode too) sockets that are in other namespaces. The key piece of code >> > > for that is vsock_net_check_mode(), where if either modes is local the >> > > namespaces must be the same. >> > >> > Sorry, I likely misread the large comment in patch 2: >> > >> > https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ >> > >> > >> Have you considered instead a slightly different model, where the >> > >> local/global model is set in stone at netns creation time - alike what >> > >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and >> > >> inter-netns connectivity is explicitly granted by the admin (I guess >> > >> you will need new transport operations for that)? >> > >> >> > >> /P >> > >> >> > >> [1] tcp allows using per-netns established socket lookup tables - as >> > >> opposed to the default global lookup table (even if match always takes >> > >> in account the netns obviously). The mentioned sysctl specify such >> > >> configuration for the children namespaces, if any. >> > > >> > > I'll save this discussion if the above doesn't resolve your concerns. >> > I still have some concern WRT the dynamic mode change after netns >> > creation. I fear some 'unsolvable' (or very hard to solve) race I can't >> > see now. A tcp_child_ehash_entries-like model will avoid completely the >> > issue, but I understand it would be a significant change over the >> > current status. >> > >> > "Luckily" the merge window is on us and we have some time to discuss. Do >> > you have a specific use-case for the ability to change the netns >> > mode >> > after creation? >> > >> > /P >> >> I don't think there is a hard requirement that the mode be change-able >> after creation. Though I'd love to avoid such a big change... or at >> least leave unchanged as much of what we've already reviewed as >> possible. 
>> >> In the scheme of defining the mode at creation and following the >> tcp_child_ehash_entries-ish model, what I'm imagining is: >> - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" >> - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any >> number of times >> >> - when a netns is created, the new netns mode is inherited from >> child_ns_mode, being assigned using something like: >> >> net->vsock.ns_mode = >> get_net_ns_by_pid(current->pid)->child_ns_mode >> >> - /proc/sys/net/vsock/ns_mode queries the current mode, returning >> "local" or "global", returning value of net->vsock.ns_mode >> - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and >> reject writes >> >> Does that align with what you have in mind? > >Hey Paolo, I just wanted to sync up on this one. Does the above align >with what you envision? Hi Bobby, AFAIK Paolo was at LPC, so there could be some delay. FYI I'll be off from Dec 25 to Jan 6, so if we want to do an RFC in the middle, I'll do my best to take a look before my time off. Thanks, Stefano
On Mon, Dec 15, 2025 at 03:11:22PM +0100, Stefano Garzarella wrote: > On Fri, Dec 12, 2025 at 07:26:15AM -0800, Bobby Eshleman wrote: > > On Tue, Dec 02, 2025 at 02:01:04PM -0800, Bobby Eshleman wrote: > > > On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: > > > > On 12/2/25 6:56 PM, Bobby Eshleman wrote: > > > > > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > > > > >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: > > > > >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > > > > >>> goto out; > > > > >>> } > > > > >>> > > > > >>> + net = current->nsproxy->net_ns; > > > > >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > > > > >>> + > > > > >>> + /* Store the mode of the namespace at the time of creation. If this > > > > >>> + * namespace later changes from "global" to "local", we want this vsock > > > > >>> + * to continue operating normally and not suddenly break. For that > > > > >>> + * reason, we save the mode here and later use it when performing > > > > >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > > > > >>> + */ > > > > >>> + vsock->net_mode = vsock_net_mode(net); > > > > >> > > > > >> I'm sorry for the very late feedback. I think that at very least the > > > > >> user-space needs a way to query if the given transport is in local or > > > > >> global mode, as AFAICS there is no way to tell that when socket creation > > > > >> races with mode change. > > > > > > > > > > Are you thinking something along the lines of sockopt? > > > > > > > > I'd like to see a way for the user-space to query the socket 'namespace > > > > mode'. > > > > > > > > sockopt could be an option; a possibly better one could be sock_diag. Or > > > > you could do both using dumping the info with a shared helper invoked by > > > > both code paths, alike what TCP is doing. 
> > > > >> Also I'm a bit uneasy with the model implemented here, as 'local' socket > > > > >> may cross netns boundaris and connect to 'local' socket in other netns > > > > >> (if I read correctly patch 2/12). That in turns AFAICS break the netns > > > > >> isolation. > > > > > > > > > > Local mode sockets are unable to communicate with local mode (and global > > > > > mode too) sockets that are in other namespaces. The key piece of code > > > > > for that is vsock_net_check_mode(), where if either modes is local the > > > > > namespaces must be the same. > > > > > > > > Sorry, I likely misread the large comment in patch 2: > > > > > > > > https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ > > > > > > > > >> Have you considered instead a slightly different model, where the > > > > >> local/global model is set in stone at netns creation time - alike what > > > > >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > > > > >> inter-netns connectivity is explicitly granted by the admin (I guess > > > > >> you will need new transport operations for that)? > > > > >> > > > > >> /P > > > > >> > > > > >> [1] tcp allows using per-netns established socket lookup tables - as > > > > >> opposed to the default global lookup table (even if match always takes > > > > >> in account the netns obviously). The mentioned sysctl specify such > > > > >> configuration for the children namespaces, if any. > > > > > > > > > > I'll save this discussion if the above doesn't resolve your concerns. > > > > I still have some concern WRT the dynamic mode change after netns > > > > creation. I fear some 'unsolvable' (or very hard to solve) race I can't > > > > see now. A tcp_child_ehash_entries-like model will avoid completely the > > > > issue, but I understand it would be a significant change over the > > > > current status. > > > > > > > > "Luckily" the merge window is on us and we have some time to discuss. 
Do > > > > you have a specific use-case for the ability to change the netns > > > > mode > > > > after creation? > > > > > > > > /P > > > > > > I don't think there is a hard requirement that the mode be change-able > > > after creation. Though I'd love to avoid such a big change... or at > > > least leave unchanged as much of what we've already reviewed as > > > possible. > > > > > > In the scheme of defining the mode at creation and following the > > > tcp_child_ehash_entries-ish model, what I'm imagining is: > > > - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" > > > - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any > > > number of times > > > > > > - when a netns is created, the new netns mode is inherited from > > > child_ns_mode, being assigned using something like: > > > > > > net->vsock.ns_mode = > > > get_net_ns_by_pid(current->pid)->child_ns_mode > > > > > > - /proc/sys/net/vsock/ns_mode queries the current mode, returning > > > "local" or "global", returning value of net->vsock.ns_mode > > > - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and > > > reject writes > > > > > > Does that align with what you have in mind? > > > > Hey Paolo, I just wanted to sync up on this one. Does the above align > > with what you envision? > > Hi Bobby, AFAIK Paolo was at LPC, so there could be some delay. > > FYI I'll be off from Dec 25 to Jan 6, so if we want to do an RFC in the > middle, I'll do my best to take a look before my time off. > > Thanks, > Stefano > Sounds like a plan, thanks! Best, Bobby
On Mon, Dec 15, 2025 at 05:22:02PM -0800, Bobby Eshleman wrote: > On Mon, Dec 15, 2025 at 03:11:22PM +0100, Stefano Garzarella wrote: > > On Fri, Dec 12, 2025 at 07:26:15AM -0800, Bobby Eshleman wrote: > > > On Tue, Dec 02, 2025 at 02:01:04PM -0800, Bobby Eshleman wrote: > > > > On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: > > > > > On 12/2/25 6:56 PM, Bobby Eshleman wrote: > > > > > > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > > > > > >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: > > > > > >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > > > > > >>> goto out; > > > > > >>> } > > > > > >>> > > > > > >>> + net = current->nsproxy->net_ns; > > > > > >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > > > > > >>> + > > > > > >>> + /* Store the mode of the namespace at the time of creation. If this > > > > > >>> + * namespace later changes from "global" to "local", we want this vsock > > > > > >>> + * to continue operating normally and not suddenly break. For that > > > > > >>> + * reason, we save the mode here and later use it when performing > > > > > >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > > > > > >>> + */ > > > > > >>> + vsock->net_mode = vsock_net_mode(net); > > > > > >> > > > > > >> I'm sorry for the very late feedback. I think that at very least the > > > > > >> user-space needs a way to query if the given transport is in local or > > > > > >> global mode, as AFAICS there is no way to tell that when socket creation > > > > > >> races with mode change. > > > > > > > > > > > > Are you thinking something along the lines of sockopt? > > > > > > > > > > I'd like to see a way for the user-space to query the socket 'namespace > > > > > mode'. > > > > > > > > > > sockopt could be an option; a possibly better one could be sock_diag. 
Or > > > > > you could do both using dumping the info with a shared helper invoked by > > > > > both code paths, alike what TCP is doing. > > > > > >> Also I'm a bit uneasy with the model implemented here, as 'local' socket > > > > > >> may cross netns boundaris and connect to 'local' socket in other netns > > > > > >> (if I read correctly patch 2/12). That in turns AFAICS break the netns > > > > > >> isolation. > > > > > > > > > > > > Local mode sockets are unable to communicate with local mode (and global > > > > > > mode too) sockets that are in other namespaces. The key piece of code > > > > > > for that is vsock_net_check_mode(), where if either modes is local the > > > > > > namespaces must be the same. > > > > > > > > > > Sorry, I likely misread the large comment in patch 2: > > > > > > > > > > https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ > > > > > > > > > > >> Have you considered instead a slightly different model, where the > > > > > >> local/global model is set in stone at netns creation time - alike what > > > > > >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > > > > > >> inter-netns connectivity is explicitly granted by the admin (I guess > > > > > >> you will need new transport operations for that)? > > > > > >> > > > > > >> /P > > > > > >> > > > > > >> [1] tcp allows using per-netns established socket lookup tables - as > > > > > >> opposed to the default global lookup table (even if match always takes > > > > > >> in account the netns obviously). The mentioned sysctl specify such > > > > > >> configuration for the children namespaces, if any. > > > > > > > > > > > > I'll save this discussion if the above doesn't resolve your concerns. > > > > > I still have some concern WRT the dynamic mode change after netns > > > > > creation. I fear some 'unsolvable' (or very hard to solve) race I can't > > > > > see now. 
A tcp_child_ehash_entries-like model will avoid completely the > > > > > issue, but I understand it would be a significant change over the > > > > > current status. > > > > > > > > > > "Luckily" the merge window is on us and we have some time to discuss. Do > > > > > you have a specific use-case for the ability to change the netns > > > > > mode > > > > > after creation? > > > > > > > > > > /P > > > > > > > > I don't think there is a hard requirement that the mode be change-able > > > > after creation. Though I'd love to avoid such a big change... or at > > > > least leave unchanged as much of what we've already reviewed as > > > > possible. > > > > > > > > In the scheme of defining the mode at creation and following the > > > > tcp_child_ehash_entries-ish model, what I'm imagining is: > > > > - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" > > > > - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any > > > > number of times > > > > > > > > - when a netns is created, the new netns mode is inherited from > > > > child_ns_mode, being assigned using something like: > > > > > > > > net->vsock.ns_mode = > > > > get_net_ns_by_pid(current->pid)->child_ns_mode > > > > > > > > - /proc/sys/net/vsock/ns_mode queries the current mode, returning > > > > "local" or "global", returning value of net->vsock.ns_mode > > > > - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and > > > > reject writes > > > > > > > > Does that align with what you have in mind? > > > > > > Hey Paolo, I just wanted to sync up on this one. Does the above align > > > with what you envision? > > > > Hi Bobby, AFAIK Paolo was at LPC, so there could be some delay. > > > > FYI I'll be off from Dec 25 to Jan 6, so if we want to do an RFC in the > > middle, I'll do my best to take a look before my time off. > > > > Thanks, > > Stefano Just sent this out, though I acknowledge its pretty last minute WRT your time off. 
If I don't hear from you before then, have a good holiday! Best, Bobby
On Tue, Dec 23, 2025 at 04:32:30PM -0800, Bobby Eshleman wrote: >On Mon, Dec 15, 2025 at 05:22:02PM -0800, Bobby Eshleman wrote: >> On Mon, Dec 15, 2025 at 03:11:22PM +0100, Stefano Garzarella wrote: [...] >> > >> > FYI I'll be off from Dec 25 to Jan 6, so if we want to do an RFC in the >> > middle, I'll do my best to take a look before my time off. >> > >> > Thanks, >> > Stefano > >Just sent this out, though I acknowledge its pretty last minute WRT >your time off. Thanks for that, but yeah I didn't have time to take a closer look :-( I'll do so as soon as I'm back! > >If I don't hear from you before then, have a good holiday! Thanks, you too, if you get the opportunity! Thanks, Stefano
On Tue, 2 Dec 2025 at 23:01, Bobby Eshleman <bobbyeshleman@gmail.com> wrote: > > On Tue, Dec 02, 2025 at 09:47:19PM +0100, Paolo Abeni wrote: > > On 12/2/25 6:56 PM, Bobby Eshleman wrote: > > > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > > >> On 11/27/25 8:47 AM, Bobby Eshleman wrote: > > >>> @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > > >>> goto out; > > >>> } > > >>> > > >>> + net = current->nsproxy->net_ns; > > >>> + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > > >>> + > > >>> + /* Store the mode of the namespace at the time of creation. If this > > >>> + * namespace later changes from "global" to "local", we want this vsock > > >>> + * to continue operating normally and not suddenly break. For that > > >>> + * reason, we save the mode here and later use it when performing > > >>> + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > > >>> + */ > > >>> + vsock->net_mode = vsock_net_mode(net); > > >> > > >> I'm sorry for the very late feedback. I think that at very least the > > >> user-space needs a way to query if the given transport is in local or > > >> global mode, as AFAICS there is no way to tell that when socket creation > > >> races with mode change. > > > > > > Are you thinking something along the lines of sockopt? > > > > I'd like to see a way for the user-space to query the socket 'namespace > > mode'. > > > > sockopt could be an option; a possibly better one could be sock_diag. Or > > you could do both using dumping the info with a shared helper invoked by > > both code paths, alike what TCP is doing. > > >> Also I'm a bit uneasy with the model implemented here, as 'local' socket > > >> may cross netns boundaris and connect to 'local' socket in other netns > > >> (if I read correctly patch 2/12). That in turns AFAICS break the netns > > >> isolation. 
> > > > > > Local mode sockets are unable to communicate with local mode (and global > > > mode too) sockets that are in other namespaces. The key piece of code > > > for that is vsock_net_check_mode(), where if either modes is local the > > > namespaces must be the same. > > > > Sorry, I likely misread the large comment in patch 2: > > > > https://lore.kernel.org/netdev/20251126-vsock-vmtest-v12-2-257ee21cd5de@meta.com/ > > > > >> Have you considered instead a slightly different model, where the > > >> local/global model is set in stone at netns creation time - alike what > > >> /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > > >> inter-netns connectivity is explicitly granted by the admin (I guess > > >> you will need new transport operations for that)? > > >> > > >> /P > > >> > > >> [1] tcp allows using per-netns established socket lookup tables - as > > >> opposed to the default global lookup table (even if match always takes > > >> in account the netns obviously). The mentioned sysctl specify such > > >> configuration for the children namespaces, if any. > > > > > > I'll save this discussion if the above doesn't resolve your concerns. > > I still have some concern WRT the dynamic mode change after netns > > creation. I fear some 'unsolvable' (or very hard to solve) race I can't > > see now. A tcp_child_ehash_entries-like model will avoid completely the > > issue, but I understand it would be a significant change over the > > current status. > > > > "Luckily" the merge window is on us and we have some time to discuss. Do > > you have a specific use-case for the ability to change the netns mode > > after creation? > > > > /P > > I don't think there is a hard requirement that the mode be change-able > after creation. Though I'd love to avoid such a big change... or at > least leave unchanged as much of what we've already reviewed as > possible. 
I think the big part is done, IIUC this should just be a change to the uAPI and maybe simplify what we have a little (e.g., avoid saving the mode each socket had when it was created). > > In the scheme of defining the mode at creation and following the > tcp_child_ehash_entries-ish model, what I'm imagining is: > - /proc/sys/net/vsock/child_ns_mode can be set to "local" or "global" > - /proc/sys/net/vsock/child_ns_mode is not immutable, can change any > number of times > > - when a netns is created, the new netns mode is inherited from > child_ns_mode, being assigned using something like: > > net->vsock.ns_mode = > get_net_ns_by_pid(current->pid)->child_ns_mode > > - /proc/sys/net/vsock/ns_mode queries the current mode, returning > "local" or "global", returning value of net->vsock.ns_mode > - /proc/sys/net/vsock/ns_mode and net->vsock.ns_mode are immutable and > reject writes > > Does that align with what you have in mind? > > Stefano, what are your thoughts? If we can avoid having sockets in a namespace that can be both global and local, perhaps it makes a lot of sense to make this change. My only concern is that there is still a small window where the mode can change, but we are sure that only one is picked during creation and then within the namespace this can be easily checked and give us the assurance that all sockets comply with it, right? Thanks, Stefano
On Tue, Dec 02, 2025 at 09:56:02AM -0800, Bobby Eshleman wrote: > On Tue, Dec 02, 2025 at 11:18:14AM +0100, Paolo Abeni wrote: > > On 11/27/25 8:47 AM, Bobby Eshleman wrote: > > > @@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) > > > goto out; > > > } > > > > > > + net = current->nsproxy->net_ns; > > > + vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL); > > > + > > > + /* Store the mode of the namespace at the time of creation. If this > > > + * namespace later changes from "global" to "local", we want this vsock > > > + * to continue operating normally and not suddenly break. For that > > > + * reason, we save the mode here and later use it when performing > > > + * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()). > > > + */ > > > + vsock->net_mode = vsock_net_mode(net); > > > > I'm sorry for the very late feedback. I think that at very least the > > user-space needs a way to query if the given transport is in local or > > global mode, as AFAICS there is no way to tell that when socket creation > > races with mode change. > > Are you thinking something along the lines of sockopt? > To clarify... do we want the user to be able to query the socket for which namespace mode it is in (so the results of the race can be queried), or are you looking for a way for the user to query if the transport supports local mode (maybe via /dev/vsock ioctl). I'm not sure we can attach a namespace to a transport per-se, as different namespaces in different modes can use the same transport. Best, Bobby > > > > Also I'm a bit uneasy with the model implemented here, as 'local' socket > > may cross netns boundaris and connect to 'local' socket in other netns > > (if I read correctly patch 2/12). That in turns AFAICS break the netns > > isolation. > > Local mode sockets are unable to communicate with local mode (and global > mode too) sockets that are in other namespaces. 
The key piece of code > for that is vsock_net_check_mode(), where if either modes is local the > namespaces must be the same. > > > > > Have you considered instead a slightly different model, where the > > local/global model is set in stone at netns creation time - alike what > > /proc/sys/net/ipv4/tcp_child_ehash_entries is doing[1] - and > > inter-netns connectivity is explicitly granted by the admin (I guess > > you will need new transport operations for that)? > > > > /P > > > > [1] tcp allows using per-netns established socket lookup tables - as > > opposed to the default global lookup table (even if match always takes > > in account the netns obviously). The mentioned sysctl specify such > > configuration for the children namespaces, if any. > > > > I'll save this discussion if the above doesn't resolve your concerns. > > Best, > Bobby
On Wed, Nov 26, 2025 at 11:47:33PM -0800, Bobby Eshleman wrote:
>From: Bobby Eshleman <bobbyeshleman@meta.com>
>
>Add netns support to loopback and vhost. Keep netns disabled for
>virtio-vsock, but add necessary changes to comply with common API
>updates.
>
>This is the patch in the series when vhost-vsock namespaces actually
>come online. Hence, vhost_transport_supports_local_mode() is switched
>to return true.
>
>Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
>---
>Changes in v12:
>- change seqpacket_allow() and stream_allow() to return true for
> loopback and vhost (Stefano)
>
>Changes in v11:
>- reorder with the skb ownership patch for loopback (Stefano)
>- toggle vhost_transport_supports_local_mode() to true
>
>Changes in v10:
>- Splitting patches complicates the series with meaningless placeholder
> values that eventually get replaced anyway, so to avoid that this
> patch combines into one. Links to previous patches here:
> - Link: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-3-852787a37bed@meta.com/
> - Link: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-6-852787a37bed@meta.com/
> - Link: https://lore.kernel.org/all/20251111-vsock-vmtest-v9-7-852787a37bed@meta.com/
>- remove placeholder values (Stefano)
>- update comment describe net/net_mode for
> virtio_transport_reset_no_sock()
>---
> drivers/vhost/vsock.c | 56 +++++++++++++++++++++--------
> include/linux/virtio_vsock.h | 8 +++--
> net/vmw_vsock/virtio_transport.c | 10 ++++--
> net/vmw_vsock/virtio_transport_common.c | 63 ++++++++++++++++++++++++---------
> net/vmw_vsock/vsock_loopback.c | 19 +++++++---
> 5 files changed, 118 insertions(+), 38 deletions(-)
>
>diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>index 83937e1d63fa..82cb9ec09e78 100644
>--- a/drivers/vhost/vsock.c
>+++ b/drivers/vhost/vsock.c
>@@ -46,6 +46,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);
> struct vhost_vsock {
> struct vhost_dev dev;
> struct vhost_virtqueue vqs[2];
>+ struct net *net;
>+ netns_tracker ns_tracker;
>+
>+ /* The ns mode at the time vhost_vsock was created */
>+ enum vsock_net_mode net_mode;
>
> /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
> struct hlist_node hash;
>@@ -67,7 +72,8 @@ static u32 vhost_transport_get_local_cid(void)
> /* Callers that dereference the return value must hold vhost_vsock_mutex or the
> * RCU read lock.
> */
>-static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
>+static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net,
>+ enum vsock_net_mode mode)
> {
> struct vhost_vsock *vsock;
>
>@@ -78,9 +84,10 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid)
> if (other_cid == 0)
> continue;
>
>- if (other_cid == guest_cid)
>+ if (other_cid == guest_cid &&
>+ vsock_net_check_mode(net, mode, vsock->net,
>+ vsock->net_mode))
> return vsock;
>-
> }
>
> return NULL;
>@@ -269,7 +276,8 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
> }
>
> static int
>-vhost_transport_send_pkt(struct sk_buff *skb)
>+vhost_transport_send_pkt(struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode)
> {
> struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
> struct vhost_vsock *vsock;
>@@ -278,7 +286,7 @@ vhost_transport_send_pkt(struct sk_buff *skb)
> rcu_read_lock();
>
> /* Find the vhost_vsock according to guest context id */
>- vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
>+ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid), net, net_mode);
> if (!vsock) {
> rcu_read_unlock();
> kfree_skb(skb);
>@@ -305,7 +313,8 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
> rcu_read_lock();
>
> /* Find the vhost_vsock according to guest context id */
>- vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
>+ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid,
>+ sock_net(sk_vsock(vsk)), vsk->net_mode);
> if (!vsock)
> goto out;
>
>@@ -407,6 +416,12 @@ static bool vhost_transport_msgzerocopy_allow(void)
> static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
> u32 remote_cid);
>
>+static bool
>+vhost_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
>+{
>+ return true;
>+}
>+
> static struct virtio_transport vhost_transport = {
> .transport = {
> .module = THIS_MODULE,
>@@ -431,7 +446,7 @@ static struct virtio_transport vhost_transport = {
> .stream_has_space = virtio_transport_stream_has_space,
> .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
> .stream_is_active = virtio_transport_stream_is_active,
>- .stream_allow = virtio_transport_stream_allow,
>+ .stream_allow = vhost_transport_stream_allow,
>
> .seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
> .seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
>@@ -464,14 +479,12 @@ static struct virtio_transport vhost_transport = {
> static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
> u32 remote_cid)
> {
>+ struct net *net = sock_net(sk_vsock(vsk));
> struct vhost_vsock *vsock;
> bool seqpacket_allow = false;
>
>- if (vsk->net_mode != VSOCK_NET_MODE_GLOBAL)
>- return false;
>-
> rcu_read_lock();
>- vsock = vhost_vsock_get(remote_cid);
>+ vsock = vhost_vsock_get(remote_cid, net, vsk->net_mode);
>
> if (vsock)
> seqpacket_allow = vsock->seqpacket_allow;
>@@ -542,7 +555,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
> if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
> le64_to_cpu(hdr->dst_cid) ==
> vhost_transport_get_local_cid())
>- virtio_transport_recv_pkt(&vhost_transport, skb);
>+ virtio_transport_recv_pkt(&vhost_transport, skb,
>+ vsock->net, vsock->net_mode);
> else
> kfree_skb(skb);
>
>@@ -659,6 +673,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> {
> struct vhost_virtqueue **vqs;
> struct vhost_vsock *vsock;
>+ struct net *net;
> int ret;
>
> /* This struct is large and allocation could fail, fall back to vmalloc
>@@ -674,6 +689,17 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
> goto out;
> }
>
>+ net = current->nsproxy->net_ns;
>+ vsock->net = get_net_track(net, &vsock->ns_tracker, GFP_KERNEL);
>+
>+ /* Store the mode of the namespace at the time of creation. If this
>+ * namespace later changes from "global" to "local", we want this vsock
>+ * to continue operating normally and not suddenly break. For that
>+ * reason, we save the mode here and later use it when performing
>+ * socket lookups with vsock_net_check_mode() (see vhost_vsock_get()).
>+ */
>+ vsock->net_mode = vsock_net_mode(net);
>+
> vsock->guest_cid = 0; /* no CID assigned yet */
> vsock->seqpacket_allow = false;
>
>@@ -713,7 +739,8 @@ static void vhost_vsock_reset_orphans(struct sock *sk)
> */
>
> /* If the peer is still valid, no need to reset connection */
>- if (vhost_vsock_get(vsk->remote_addr.svm_cid))
>+ if (vhost_vsock_get(vsk->remote_addr.svm_cid, sock_net(sk),
>+ vsk->net_mode))
> return;
>
> /* If the close timeout is pending, let it expire. This avoids races
>@@ -758,6 +785,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
> virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
>
> vhost_dev_cleanup(&vsock->dev);
>+ put_net_track(vsock->net, &vsock->ns_tracker);
> kfree(vsock->dev.vqs);
> vhost_vsock_free(vsock);
> return 0;
>@@ -784,7 +812,7 @@ static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid)
>
> /* Refuse if CID is already in use */
> mutex_lock(&vhost_vsock_mutex);
>- other = vhost_vsock_get(guest_cid);
>+ other = vhost_vsock_get(guest_cid, vsock->net, vsock->net_mode);
> if (other && other != vsock) {
> mutex_unlock(&vhost_vsock_mutex);
> return -EADDRINUSE;
>diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>index 1845e8d4f78d..7ea264dcfff7 100644
>--- a/include/linux/virtio_vsock.h
>+++ b/include/linux/virtio_vsock.h
>@@ -173,6 +173,8 @@ struct virtio_vsock_pkt_info {
> u32 remote_cid, remote_port;
> struct vsock_sock *vsk;
> struct msghdr *msg;
>+ struct net *net;
>+ enum vsock_net_mode net_mode;
> u32 pkt_len;
> u16 type;
> u16 op;
>@@ -185,7 +187,8 @@ struct virtio_transport {
> struct vsock_transport transport;
>
> /* Takes ownership of the packet */
>- int (*send_pkt)(struct sk_buff *skb);
>+ int (*send_pkt)(struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode);
>
> /* Used in MSG_ZEROCOPY mode. Checks, that provided data
> * (number of buffers) could be transmitted with zerocopy
>@@ -280,7 +283,8 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
> void virtio_transport_destruct(struct vsock_sock *vsk);
>
> void virtio_transport_recv_pkt(struct virtio_transport *t,
>- struct sk_buff *skb);
>+ struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode);
> void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb);
> u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
> void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
>diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>index f5123810192d..3ff695740108 100644
>--- a/net/vmw_vsock/virtio_transport.c
>+++ b/net/vmw_vsock/virtio_transport.c
>@@ -231,7 +231,8 @@ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struc
> }
>
> static int
>-virtio_transport_send_pkt(struct sk_buff *skb)
>+virtio_transport_send_pkt(struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode)
> {
> struct virtio_vsock_hdr *hdr;
> struct virtio_vsock *vsock;
>@@ -665,7 +666,12 @@ static void virtio_transport_rx_work(struct work_struct *work)
> virtio_vsock_skb_put(skb, payload_len);
>
> virtio_transport_deliver_tap_pkt(skb);
>- virtio_transport_recv_pkt(&virtio_transport, skb);
>+
>+ /* Force virtio-transport into global mode since it
>+ * does not yet support local-mode namespacing.
>+ */
>+ virtio_transport_recv_pkt(&virtio_transport, skb,
>+ NULL, VSOCK_NET_MODE_GLOBAL);
I guess this is related to the discussion on the previous patch.
So if I understood it correctly, this LGTM!
> }
> } while (!virtqueue_enable_cb(vq));
>
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index de71e2b3f77e..a818152d8b79 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -413,7 +413,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>
> virtio_transport_inc_tx_pkt(vvs, skb);
>
>- ret = t_ops->send_pkt(skb);
>+ ret = t_ops->send_pkt(skb, info->net, info->net_mode);
> if (ret < 0)
> break;
>
>@@ -527,6 +527,8 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
> struct virtio_vsock_pkt_info info = {
> .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
>+ .net_mode = vsk->net_mode,
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1067,6 +1069,8 @@ int virtio_transport_connect(struct vsock_sock *vsk)
> struct virtio_vsock_pkt_info info = {
> .op = VIRTIO_VSOCK_OP_REQUEST,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
>+ .net_mode = vsk->net_mode,
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1082,6 +1086,8 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
> (mode & SEND_SHUTDOWN ?
> VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
>+ .net_mode = vsk->net_mode,
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1108,6 +1114,8 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
> .msg = msg,
> .pkt_len = len,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
>+ .net_mode = vsk->net_mode,
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1145,6 +1153,8 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
> .op = VIRTIO_VSOCK_OP_RST,
> .reply = !!skb,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
>+ .net_mode = vsk->net_mode,
> };
>
> /* Send RST only if the original pkt is not a RST pkt */
>@@ -1156,9 +1166,14 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
>
> /* Normally packets are associated with a socket. There may be no socket if an
> * attempt was made to connect to a socket that does not exist.
>+ *
>+ * net and net_mode refer to the namespace of whoever sent the invalid message.
>+ * For loopback, this is the namespace of the socket. For vhost, this is the
>+ * namespace of the VM (i.e., vhost_vsock).
> */
> static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
>- struct sk_buff *skb)
>+ struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode)
> {
> struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
> struct virtio_vsock_pkt_info info = {
>@@ -1171,6 +1186,13 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
> * sock_net(sk) until the reply skb is freed.
> */
> .vsk = vsock_sk(skb->sk),
>+
>+ /* net or net_mode are not defined here because we pass
>+ * net and net_mode directly to t->send_pkt(), instead of
>+ * relying on virtio_transport_send_pkt_info() to pass them to
>+ * t->send_pkt(). They are not needed by
>+ * virtio_transport_alloc_skb().
>+ */
> };
> struct sk_buff *reply;
>
>@@ -1189,7 +1211,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
> if (!reply)
> return -ENOMEM;
>
>- return t->send_pkt(reply);
>+ return t->send_pkt(reply, net, net_mode);
> }
>
> /* This function should be called with sk_lock held and SOCK_DONE set */
>@@ -1471,6 +1493,8 @@ virtio_transport_send_response(struct vsock_sock *vsk,
> .remote_port = le32_to_cpu(hdr->src_port),
> .reply = true,
> .vsk = vsk,
>+ .net = sock_net(sk_vsock(vsk)),
>+ .net_mode = vsk->net_mode,
> };
>
> return virtio_transport_send_pkt_info(vsk, &info);
>@@ -1513,12 +1537,14 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
> int ret;
>
> if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) {
>- virtio_transport_reset_no_sock(t, skb);
>+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
>+ vsk->net_mode);
> return -EINVAL;
> }
>
> if (sk_acceptq_is_full(sk)) {
>- virtio_transport_reset_no_sock(t, skb);
>+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
>+ vsk->net_mode);
> return -ENOMEM;
> }
>
>@@ -1526,13 +1552,15 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
> * Subsequent enqueues would lead to a memory leak.
> */
> if (sk->sk_shutdown == SHUTDOWN_MASK) {
>- virtio_transport_reset_no_sock(t, skb);
>+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
>+ vsk->net_mode);
> return -ESHUTDOWN;
> }
>
> child = vsock_create_connected(sk);
> if (!child) {
>- virtio_transport_reset_no_sock(t, skb);
>+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
>+ vsk->net_mode);
> return -ENOMEM;
> }
>
>@@ -1554,7 +1582,8 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
> */
> if (ret || vchild->transport != &t->transport) {
> release_sock(child);
>- virtio_transport_reset_no_sock(t, skb);
>+ virtio_transport_reset_no_sock(t, skb, sock_net(sk),
>+ vsk->net_mode);
> sock_put(child);
> return ret;
> }
>@@ -1582,7 +1611,8 @@ static bool virtio_transport_valid_type(u16 type)
> * lock.
> */
> void virtio_transport_recv_pkt(struct virtio_transport *t,
>- struct sk_buff *skb)
>+ struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode)
> {
> struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
> struct sockaddr_vm src, dst;
>@@ -1605,24 +1635,25 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> le32_to_cpu(hdr->fwd_cnt));
>
> if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) {
>- (void)virtio_transport_reset_no_sock(t, skb);
>+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
> goto free_pkt;
> }
>
> /* The socket must be in connected or bound table
> * otherwise send reset back
> */
>- sk = vsock_find_connected_socket(&src, &dst);
>+ sk = vsock_find_connected_socket_net(&src, &dst, net, net_mode);
> if (!sk) {
>- sk = vsock_find_bound_socket(&dst);
>+ sk = vsock_find_bound_socket_net(&dst, net, net_mode);
> if (!sk) {
>- (void)virtio_transport_reset_no_sock(t, skb);
>+ (void)virtio_transport_reset_no_sock(t, skb, net,
>+ net_mode);
> goto free_pkt;
> }
> }
>
> if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) {
>- (void)virtio_transport_reset_no_sock(t, skb);
>+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
> sock_put(sk);
> goto free_pkt;
> }
>@@ -1641,7 +1672,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> */
> if (sock_flag(sk, SOCK_DONE) ||
> (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) {
>- (void)virtio_transport_reset_no_sock(t, skb);
>+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
> release_sock(sk);
> sock_put(sk);
> goto free_pkt;
>@@ -1673,7 +1704,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
> kfree_skb(skb);
> break;
> default:
>- (void)virtio_transport_reset_no_sock(t, skb);
>+ (void)virtio_transport_reset_no_sock(t, skb, net, net_mode);
> kfree_skb(skb);
> break;
> }
>diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
>index afad27cf533a..aef44d1631c3 100644
>--- a/net/vmw_vsock/vsock_loopback.c
>+++ b/net/vmw_vsock/vsock_loopback.c
>@@ -26,7 +26,8 @@ static u32 vsock_loopback_get_local_cid(void)
> return VMADDR_CID_LOCAL;
> }
>
>-static int vsock_loopback_send_pkt(struct sk_buff *skb)
>+static int vsock_loopback_send_pkt(struct sk_buff *skb, struct net *net,
>+ enum vsock_net_mode net_mode)
> {
> struct vsock_loopback *vsock = &the_vsock_loopback;
> int len = skb->len;
>@@ -48,6 +49,13 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
>
> static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk,
> u32 remote_cid);
>+
>+static bool vsock_loopback_stream_allow(struct vsock_sock *vsk, u32 cid,
>+ u32 port)
>+{
>+ return true;
>+}
>+
> static bool vsock_loopback_msgzerocopy_allow(void)
> {
> return true;
>@@ -77,7 +85,7 @@ static struct virtio_transport loopback_transport = {
> .stream_has_space = virtio_transport_stream_has_space,
> .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
> .stream_is_active = virtio_transport_stream_is_active,
>- .stream_allow = virtio_transport_stream_allow,
>+ .stream_allow = vsock_loopback_stream_allow,
So after this change, only virtio_transport.c uses the
virtio_transport_stream_allow() defined in virtio_transport_common.c,
right?
At that point, we should move it into virtio_transport.c IMO.
That said, we can do it in a follow-up patch, since the behaviour is
unchanged, so:
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
>
> .seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
> .seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
>@@ -110,7 +118,7 @@ static struct virtio_transport loopback_transport = {
> static bool
> vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
> {
>- return vsk->net_mode == VSOCK_NET_MODE_GLOBAL;
>+ return true;
> }
>
> static void vsock_loopback_work(struct work_struct *work)
>@@ -132,7 +140,10 @@ static void vsock_loopback_work(struct work_struct *work)
> */
> virtio_transport_consume_skb_sent(skb, false);
> virtio_transport_deliver_tap_pkt(skb);
>- virtio_transport_recv_pkt(&loopback_transport, skb);
>+
>+ virtio_transport_recv_pkt(&loopback_transport, skb,
>+ sock_net(skb->sk),
>+ vsock_sk(skb->sk)->net_mode);
> }
> }
>
>
>--
>2.47.3
>
© 2016 - 2026 Red Hat, Inc.