Implement a paravirt scheduling framework for linux kernel.
The framework allows for pvsched driver to register to the kernel and
receive callbacks from hypervisor(eg: kvm) for interested vcpu events
like VMENTER, VMEXIT etc.
The framework also allows hypervisor to select a pvsched driver (from
the available list of registered drivers) for each guest.
Also implement a sysctl for listing the available pvsched drivers.
Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
Kconfig | 2 +
include/linux/pvsched.h | 102 +++++++++++++++++++
kernel/sysctl.c | 27 +++++
virt/Makefile | 2 +-
virt/pvsched/Kconfig | 12 +++
virt/pvsched/Makefile | 2 +
virt/pvsched/pvsched.c | 215 ++++++++++++++++++++++++++++++++++++++++
7 files changed, 361 insertions(+), 1 deletion(-)
create mode 100644 include/linux/pvsched.h
create mode 100644 virt/pvsched/Kconfig
create mode 100644 virt/pvsched/Makefile
create mode 100644 virt/pvsched/pvsched.c
diff --git a/Kconfig b/Kconfig
index 745bc773f567..4a52eaa21166 100644
--- a/Kconfig
+++ b/Kconfig
@@ -29,4 +29,6 @@ source "lib/Kconfig"
source "lib/Kconfig.debug"
+source "virt/pvsched/Kconfig"
+
source "Documentation/Kconfig"
diff --git a/include/linux/pvsched.h b/include/linux/pvsched.h
new file mode 100644
index 000000000000..59df6b44aacb
--- /dev/null
+++ b/include/linux/pvsched.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Google */
+
+#ifndef _LINUX_PVSCHED_H
+#define _LINUX_PVSCHED_H 1
+
+/*
+ * List of events for which hypervisor calls back into pvsched driver.
+ * Driver can specify the events it is interested in.
+ */
+enum pvsched_vcpu_events {
+ PVSCHED_VCPU_VMENTER = 0x1,
+ PVSCHED_VCPU_VMEXIT = 0x2,
+ PVSCHED_VCPU_HALT = 0x4,
+ PVSCHED_VCPU_INTR_INJ = 0x8,
+};
+
+#define PVSCHED_NAME_MAX 32
+#define PVSCHED_MAX 8
+#define PVSCHED_DRV_BUF_MAX (PVSCHED_NAME_MAX * PVSCHED_MAX + PVSCHED_MAX)
+
+/*
+ * pvsched driver callbacks.
+ * TODO: versioning support for better compatibility with the guest
+ * component implementing this feature.
+ */
+struct pvsched_vcpu_ops {
+ /*
+ * pvsched_vcpu_register() - Register the vcpu with pvsched driver.
+ * @pid: pid of the vcpu task.
+ *
+ * pvsched driver can store the pid internally and initialize
+ * itself to prepare for receiving callbacks from thsi vcpu.
+ */
+ int (*pvsched_vcpu_register)(struct pid *pid);
+
+ /*
+ * pvsched_vcpu_unregister() - Un-register the vcpu with pvsched driver.
+ * @pid: pid of the vcpu task.
+ */
+ void (*pvsched_vcpu_unregister)(struct pid *pid);
+
+ /*
+ * pvsched_vcpu_notify_event() - Callback for pvsched events
+ * @addr: Address of the memory region shared with guest
+ * @pid: pid of the vcpu task.
+ * @events: bit mask of the events that hypervisor wants to notify.
+ */
+ void (*pvsched_vcpu_notify_event)(void *addr, struct pid *pid, u32 event);
+
+ char name[PVSCHED_NAME_MAX];
+ struct module *owner;
+ struct list_head list;
+ u32 events;
+ u32 key;
+};
+
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+int pvsched_get_available_drivers(char *buf, size_t maxlen);
+
+int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops);
+void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops);
+
+struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name);
+void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops);
+
+static inline int pvsched_validate_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+ /*
+ * All callbacks are mandatory.
+ */
+ if (!ops->pvsched_vcpu_register || !ops->pvsched_vcpu_unregister ||
+ !ops->pvsched_vcpu_notify_event)
+ return -EINVAL;
+
+ return 0;
+}
+#else
+static inline void pvsched_get_available_drivers(char *buf, size_t maxlen)
+{
+}
+
+static inline int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+ return -ENOTSUPP;
+}
+
+static inline void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+}
+
+static inline struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
+{
+ return NULL;
+}
+
+static inline void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+}
+#endif
+
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 157f7ce2942d..10a18a791b4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
#include <linux/pid.h>
+#include <linux/pvsched.h>
#include "../lib/kstrtox.h"
@@ -1615,6 +1616,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
return ret;
}
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+static int proc_pvsched_available_drivers(struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = PVSCHED_DRV_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+ pvsched_get_available_drivers(tbl.data, PVSCHED_DRV_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+ return ret;
+}
+#endif
+
static struct ctl_table kern_table[] = {
{
.procname = "panic",
@@ -2033,6 +2052,14 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_INT_MAX,
},
+#endif
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+ {
+ .procname = "pvsched_available_drivers",
+ .maxlen = PVSCHED_DRV_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_pvsched_available_drivers,
+ },
#endif
{ }
};
diff --git a/virt/Makefile b/virt/Makefile
index 1cfea9436af9..9d0f32d775a1 100644
--- a/virt/Makefile
+++ b/virt/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += lib/
+obj-y += lib/ pvsched/
diff --git a/virt/pvsched/Kconfig b/virt/pvsched/Kconfig
new file mode 100644
index 000000000000..5ca2669060cb
--- /dev/null
+++ b/virt/pvsched/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config PARAVIRT_SCHED_HOST
+ bool "Paravirt scheduling framework in the host kernel"
+ default n
+ help
+ Paravirtualized scheduling facilitates the exchange of scheduling
+ related information between the host and guest through shared memory,
+ enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+ An illustrative use case involves dynamically boosting the priority of
+ a vCPU thread when the guest is executing a latency-sensitive workload
+ on that specific vCPU.
+ This config enables paravirt scheduling framework in the host kernel.
diff --git a/virt/pvsched/Makefile b/virt/pvsched/Makefile
new file mode 100644
index 000000000000..4ca38e30479b
--- /dev/null
+++ b/virt/pvsched/Makefile
@@ -0,0 +1,2 @@
+
+obj-$(CONFIG_PARAVIRT_SCHED_HOST) += pvsched.o
diff --git a/virt/pvsched/pvsched.c b/virt/pvsched/pvsched.c
new file mode 100644
index 000000000000..610c85cf90d2
--- /dev/null
+++ b/virt/pvsched/pvsched.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+
+/*
+ * Paravirt scheduling framework
+ *
+ */
+
+/*
+ * Heavily inspired from tcp congestion avoidance implementation.
+ * (net/ipv4/tcp_cong.c)
+ */
+
+#define pr_fmt(fmt) "PVSCHED: " fmt
+
+#include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/pvsched.h>
+
+static DEFINE_SPINLOCK(pvsched_drv_list_lock);
+static int nr_pvsched_drivers = 0;
+static LIST_HEAD(pvsched_drv_list);
+
+/*
+ * Retrieve pvsched_vcpu_ops given the name.
+ */
+static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_name(char *name)
+{
+ struct pvsched_vcpu_ops *ops;
+
+ list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+ if (strcmp(ops->name, name) == 0)
+ return ops;
+ }
+
+ return NULL;
+}
+
+/*
+ * Retrieve pvsched_vcpu_ops given the hash key.
+ */
+static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_key(u32 key)
+{
+ struct pvsched_vcpu_ops *ops;
+
+ list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+ if (ops->key == key)
+ return ops;
+ }
+
+ return NULL;
+}
+
+/*
+ * pvsched_get_available_drivers() - Copy space separated list of pvsched
+ * driver names.
+ * @buf: buffer to store the list of driver names
+ * @maxlen: size of the buffer
+ *
+ * Return: 0 on success, negative value on error.
+ */
+int pvsched_get_available_drivers(char *buf, size_t maxlen)
+{
+ struct pvsched_vcpu_ops *ops;
+ size_t offs = 0;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (maxlen > PVSCHED_DRV_BUF_MAX)
+ maxlen = PVSCHED_DRV_BUF_MAX;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pvsched_get_available_drivers);
+
+/*
+ * pvsched_register_vcpu_ops() - Register the driver in the kernel.
+ * @ops: Driver data(callbacks)
+ *
+ * After the registration, driver will be exposed to the hypervisor
+ * for assignment to the guest VMs.
+ *
+ * Return: 0 on success, negative value on error.
+ */
+int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+ int ret = 0;
+
+ ops->key = jhash(ops->name, sizeof(ops->name), strlen(ops->name));
+ spin_lock(&pvsched_drv_list_lock);
+ if (nr_pvsched_drivers > PVSCHED_MAX) {
+ ret = -ENOSPC;
+ } if (pvsched_find_vcpu_ops_key(ops->key)) {
+ ret = -EEXIST;
+ } else if (!(ret = pvsched_validate_vcpu_ops(ops))) {
+ list_add_tail_rcu(&ops->list, &pvsched_drv_list);
+ nr_pvsched_drivers++;
+ }
+ spin_unlock(&pvsched_drv_list_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pvsched_register_vcpu_ops);
+
+/*
+ * pvsched_register_vcpu_ops() - Un-register the driver from the kernel.
+ * @ops: Driver data(callbacks)
+ *
+ * After un-registration, driver will not be visible to hypervisor.
+ */
+void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+ spin_lock(&pvsched_drv_list_lock);
+ list_del_rcu(&ops->list);
+ nr_pvsched_drivers--;
+ spin_unlock(&pvsched_drv_list_lock);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(pvsched_unregister_vcpu_ops);
+
+/*
+ * pvsched_get_vcpu_ops: Acquire the driver.
+ * @name: Name of the driver to be acquired.
+ *
+ * Hypervisor can use this API to get the driver structure for
+ * assigning it to guest VMs. This API takes a reference on the
+ * module/bpf program so that driver doesn't vanish under the
+ * hypervisor.
+ *
+ * Return: driver structure if found, else NULL.
+ */
+struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
+{
+ struct pvsched_vcpu_ops *ops;
+
+ if (!name || (strlen(name) >= PVSCHED_NAME_MAX))
+ return NULL;
+
+ rcu_read_lock();
+ ops = pvsched_find_vcpu_ops_name(name);
+ if (!ops)
+ goto out;
+
+ if (unlikely(!bpf_try_module_get(ops, ops->owner))) {
+ ops = NULL;
+ goto out;
+ }
+
+out:
+ rcu_read_unlock();
+ return ops;
+}
+EXPORT_SYMBOL_GPL(pvsched_get_vcpu_ops);
+
+/*
+ * pvsched_put_vcpu_ops: Release the driver.
+ * @name: Name of the driver to be releases.
+ *
+ * Hypervisor can use this API to release the driver.
+ */
+void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+ bpf_module_put(ops, ops->owner);
+}
+EXPORT_SYMBOL_GPL(pvsched_put_vcpu_ops);
+
+/*
+ * NOP vm_ops Sample implementation.
+ * This driver doesn't do anything other than registering itself.
+ * Placeholder for adding some default logic when the feature is
+ * complete.
+ */
+static int nop_pvsched_vcpu_register(struct pid *pid)
+{
+ return 0;
+}
+static void nop_pvsched_vcpu_unregister(struct pid *pid)
+{
+}
+static void nop_pvsched_notify_event(void *addr, struct pid *pid, u32 event)
+{
+}
+
+struct pvsched_vcpu_ops nop_vcpu_ops = {
+ .events = PVSCHED_VCPU_VMENTER | PVSCHED_VCPU_VMEXIT | PVSCHED_VCPU_HALT,
+ .pvsched_vcpu_register = nop_pvsched_vcpu_register,
+ .pvsched_vcpu_unregister = nop_pvsched_vcpu_unregister,
+ .pvsched_vcpu_notify_event = nop_pvsched_notify_event,
+ .name = "pvsched_nop",
+ .owner = THIS_MODULE,
+};
+
+static int __init pvsched_init(void)
+{
+ return WARN_ON(pvsched_register_vcpu_ops(&nop_vcpu_ops));
+}
+
+late_initcall(pvsched_init);
--
2.40.1
Adding sched_ext folks
On Wed, Apr 3, 2024 at 10:01 AM Vineeth Pillai (Google)
<vineeth@bitbyteword.org> wrote:
>
> Implement a paravirt scheduling framework for linux kernel.
>
> The framework allows for pvsched driver to register to the kernel and
> receive callbacks from hypervisor(eg: kvm) for interested vcpu events
> like VMENTER, VMEXIT etc.
>
> The framework also allows hypervisor to select a pvsched driver (from
> the available list of registered drivers) for each guest.
>
> Also implement a sysctl for listing the available pvsched drivers.
>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
> Kconfig | 2 +
> include/linux/pvsched.h | 102 +++++++++++++++++++
> kernel/sysctl.c | 27 +++++
> virt/Makefile | 2 +-
> virt/pvsched/Kconfig | 12 +++
> virt/pvsched/Makefile | 2 +
> virt/pvsched/pvsched.c | 215 ++++++++++++++++++++++++++++++++++++++++
> 7 files changed, 361 insertions(+), 1 deletion(-)
> create mode 100644 include/linux/pvsched.h
> create mode 100644 virt/pvsched/Kconfig
> create mode 100644 virt/pvsched/Makefile
> create mode 100644 virt/pvsched/pvsched.c
>
> diff --git a/Kconfig b/Kconfig
> index 745bc773f567..4a52eaa21166 100644
> --- a/Kconfig
> +++ b/Kconfig
> @@ -29,4 +29,6 @@ source "lib/Kconfig"
>
> source "lib/Kconfig.debug"
>
> +source "virt/pvsched/Kconfig"
> +
> source "Documentation/Kconfig"
> diff --git a/include/linux/pvsched.h b/include/linux/pvsched.h
> new file mode 100644
> index 000000000000..59df6b44aacb
> --- /dev/null
> +++ b/include/linux/pvsched.h
> @@ -0,0 +1,102 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/* Copyright (c) 2024 Google */
> +
> +#ifndef _LINUX_PVSCHED_H
> +#define _LINUX_PVSCHED_H 1
> +
> +/*
> + * List of events for which hypervisor calls back into pvsched driver.
> + * Driver can specify the events it is interested in.
> + */
> +enum pvsched_vcpu_events {
> + PVSCHED_VCPU_VMENTER = 0x1,
> + PVSCHED_VCPU_VMEXIT = 0x2,
> + PVSCHED_VCPU_HALT = 0x4,
> + PVSCHED_VCPU_INTR_INJ = 0x8,
> +};
> +
> +#define PVSCHED_NAME_MAX 32
> +#define PVSCHED_MAX 8
> +#define PVSCHED_DRV_BUF_MAX (PVSCHED_NAME_MAX * PVSCHED_MAX + PVSCHED_MAX)
> +
> +/*
> + * pvsched driver callbacks.
> + * TODO: versioning support for better compatibility with the guest
> + * component implementing this feature.
> + */
> +struct pvsched_vcpu_ops {
> + /*
> + * pvsched_vcpu_register() - Register the vcpu with pvsched driver.
> + * @pid: pid of the vcpu task.
> + *
> + * pvsched driver can store the pid internally and initialize
> + * itself to prepare for receiving callbacks from thsi vcpu.
> + */
> + int (*pvsched_vcpu_register)(struct pid *pid);
> +
> + /*
> + * pvsched_vcpu_unregister() - Un-register the vcpu with pvsched driver.
> + * @pid: pid of the vcpu task.
> + */
> + void (*pvsched_vcpu_unregister)(struct pid *pid);
> +
> + /*
> + * pvsched_vcpu_notify_event() - Callback for pvsched events
> + * @addr: Address of the memory region shared with guest
> + * @pid: pid of the vcpu task.
> + * @events: bit mask of the events that hypervisor wants to notify.
> + */
> + void (*pvsched_vcpu_notify_event)(void *addr, struct pid *pid, u32 event);
> +
> + char name[PVSCHED_NAME_MAX];
> + struct module *owner;
> + struct list_head list;
> + u32 events;
> + u32 key;
> +};
> +
> +#ifdef CONFIG_PARAVIRT_SCHED_HOST
> +int pvsched_get_available_drivers(char *buf, size_t maxlen);
> +
> +int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops);
> +void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops);
> +
> +struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name);
> +void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops);
> +
> +static inline int pvsched_validate_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> + /*
> + * All callbacks are mandatory.
> + */
> + if (!ops->pvsched_vcpu_register || !ops->pvsched_vcpu_unregister ||
> + !ops->pvsched_vcpu_notify_event)
> + return -EINVAL;
> +
> + return 0;
> +}
> +#else
> +static inline void pvsched_get_available_drivers(char *buf, size_t maxlen)
> +{
> +}
> +
> +static inline int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> + return -ENOTSUPP;
> +}
> +
> +static inline void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> +}
> +
> +static inline struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
> +{
> + return NULL;
> +}
> +
> +static inline void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> +}
> +#endif
> +
> +#endif
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 157f7ce2942d..10a18a791b4f 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -63,6 +63,7 @@
> #include <linux/mount.h>
> #include <linux/userfaultfd_k.h>
> #include <linux/pid.h>
> +#include <linux/pvsched.h>
>
> #include "../lib/kstrtox.h"
>
> @@ -1615,6 +1616,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
> return ret;
> }
>
> +#ifdef CONFIG_PARAVIRT_SCHED_HOST
> +static int proc_pvsched_available_drivers(struct ctl_table *ctl,
> + int write, void *buffer,
> + size_t *lenp, loff_t *ppos)
> +{
> + struct ctl_table tbl = { .maxlen = PVSCHED_DRV_BUF_MAX, };
> + int ret;
> +
> + tbl.data = kmalloc(tbl.maxlen, GFP_USER);
> + if (!tbl.data)
> + return -ENOMEM;
> + pvsched_get_available_drivers(tbl.data, PVSCHED_DRV_BUF_MAX);
> + ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
> + kfree(tbl.data);
> + return ret;
> +}
> +#endif
> +
> static struct ctl_table kern_table[] = {
> {
> .procname = "panic",
> @@ -2033,6 +2052,14 @@ static struct ctl_table kern_table[] = {
> .extra1 = SYSCTL_ONE,
> .extra2 = SYSCTL_INT_MAX,
> },
> +#endif
> +#ifdef CONFIG_PARAVIRT_SCHED_HOST
> + {
> + .procname = "pvsched_available_drivers",
> + .maxlen = PVSCHED_DRV_BUF_MAX,
> + .mode = 0444,
> + .proc_handler = proc_pvsched_available_drivers,
> + },
> #endif
> { }
> };
> diff --git a/virt/Makefile b/virt/Makefile
> index 1cfea9436af9..9d0f32d775a1 100644
> --- a/virt/Makefile
> +++ b/virt/Makefile
> @@ -1,2 +1,2 @@
> # SPDX-License-Identifier: GPL-2.0-only
> -obj-y += lib/
> +obj-y += lib/ pvsched/
> diff --git a/virt/pvsched/Kconfig b/virt/pvsched/Kconfig
> new file mode 100644
> index 000000000000..5ca2669060cb
> --- /dev/null
> +++ b/virt/pvsched/Kconfig
> @@ -0,0 +1,12 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config PARAVIRT_SCHED_HOST
> + bool "Paravirt scheduling framework in the host kernel"
> + default n
> + help
> + Paravirtualized scheduling facilitates the exchange of scheduling
> + related information between the host and guest through shared memory,
> + enhancing the efficiency of vCPU thread scheduling by the hypervisor.
> + An illustrative use case involves dynamically boosting the priority of
> + a vCPU thread when the guest is executing a latency-sensitive workload
> + on that specific vCPU.
> + This config enables paravirt scheduling framework in the host kernel.
> diff --git a/virt/pvsched/Makefile b/virt/pvsched/Makefile
> new file mode 100644
> index 000000000000..4ca38e30479b
> --- /dev/null
> +++ b/virt/pvsched/Makefile
> @@ -0,0 +1,2 @@
> +
> +obj-$(CONFIG_PARAVIRT_SCHED_HOST) += pvsched.o
> diff --git a/virt/pvsched/pvsched.c b/virt/pvsched/pvsched.c
> new file mode 100644
> index 000000000000..610c85cf90d2
> --- /dev/null
> +++ b/virt/pvsched/pvsched.c
> @@ -0,0 +1,215 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024 Google */
> +
> +/*
> + * Paravirt scheduling framework
> + *
> + */
> +
> +/*
> + * Heavily inspired from tcp congestion avoidance implementation.
> + * (net/ipv4/tcp_cong.c)
> + */
> +
> +#define pr_fmt(fmt) "PVSCHED: " fmt
> +
> +#include <linux/module.h>
> +#include <linux/bpf.h>
> +#include <linux/gfp.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/jhash.h>
> +#include <linux/pvsched.h>
> +
> +static DEFINE_SPINLOCK(pvsched_drv_list_lock);
> +static int nr_pvsched_drivers = 0;
> +static LIST_HEAD(pvsched_drv_list);
> +
> +/*
> + * Retrieve pvsched_vcpu_ops given the name.
> + */
> +static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_name(char *name)
> +{
> + struct pvsched_vcpu_ops *ops;
> +
> + list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
> + if (strcmp(ops->name, name) == 0)
> + return ops;
> + }
> +
> + return NULL;
> +}
> +
> +/*
> + * Retrieve pvsched_vcpu_ops given the hash key.
> + */
> +static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_key(u32 key)
> +{
> + struct pvsched_vcpu_ops *ops;
> +
> + list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
> + if (ops->key == key)
> + return ops;
> + }
> +
> + return NULL;
> +}
> +
> +/*
> + * pvsched_get_available_drivers() - Copy space separated list of pvsched
> + * driver names.
> + * @buf: buffer to store the list of driver names
> + * @maxlen: size of the buffer
> + *
> + * Return: 0 on success, negative value on error.
> + */
> +int pvsched_get_available_drivers(char *buf, size_t maxlen)
> +{
> + struct pvsched_vcpu_ops *ops;
> + size_t offs = 0;
> +
> + if (!buf)
> + return -EINVAL;
> +
> + if (maxlen > PVSCHED_DRV_BUF_MAX)
> + maxlen = PVSCHED_DRV_BUF_MAX;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
> + offs += snprintf(buf + offs, maxlen - offs,
> + "%s%s",
> + offs == 0 ? "" : " ", ops->name);
> +
> + if (WARN_ON_ONCE(offs >= maxlen))
> + break;
> + }
> + rcu_read_unlock();
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(pvsched_get_available_drivers);
> +
> +/*
> + * pvsched_register_vcpu_ops() - Register the driver in the kernel.
> + * @ops: Driver data(callbacks)
> + *
> + * After the registration, driver will be exposed to the hypervisor
> + * for assignment to the guest VMs.
> + *
> + * Return: 0 on success, negative value on error.
> + */
> +int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> + int ret = 0;
> +
> + ops->key = jhash(ops->name, sizeof(ops->name), strlen(ops->name));
> + spin_lock(&pvsched_drv_list_lock);
> + if (nr_pvsched_drivers > PVSCHED_MAX) {
> + ret = -ENOSPC;
> + } if (pvsched_find_vcpu_ops_key(ops->key)) {
> + ret = -EEXIST;
> + } else if (!(ret = pvsched_validate_vcpu_ops(ops))) {
> + list_add_tail_rcu(&ops->list, &pvsched_drv_list);
> + nr_pvsched_drivers++;
> + }
> + spin_unlock(&pvsched_drv_list_lock);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(pvsched_register_vcpu_ops);
> +
> +/*
> + * pvsched_register_vcpu_ops() - Un-register the driver from the kernel.
> + * @ops: Driver data(callbacks)
> + *
> + * After un-registration, driver will not be visible to hypervisor.
> + */
> +void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> + spin_lock(&pvsched_drv_list_lock);
> + list_del_rcu(&ops->list);
> + nr_pvsched_drivers--;
> + spin_unlock(&pvsched_drv_list_lock);
> +
> + synchronize_rcu();
> +}
> +EXPORT_SYMBOL_GPL(pvsched_unregister_vcpu_ops);
> +
> +/*
> + * pvsched_get_vcpu_ops: Acquire the driver.
> + * @name: Name of the driver to be acquired.
> + *
> + * Hypervisor can use this API to get the driver structure for
> + * assigning it to guest VMs. This API takes a reference on the
> + * module/bpf program so that driver doesn't vanish under the
> + * hypervisor.
> + *
> + * Return: driver structure if found, else NULL.
> + */
> +struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
> +{
> + struct pvsched_vcpu_ops *ops;
> +
> + if (!name || (strlen(name) >= PVSCHED_NAME_MAX))
> + return NULL;
> +
> + rcu_read_lock();
> + ops = pvsched_find_vcpu_ops_name(name);
> + if (!ops)
> + goto out;
> +
> + if (unlikely(!bpf_try_module_get(ops, ops->owner))) {
> + ops = NULL;
> + goto out;
> + }
> +
> +out:
> + rcu_read_unlock();
> + return ops;
> +}
> +EXPORT_SYMBOL_GPL(pvsched_get_vcpu_ops);
> +
> +/*
> + * pvsched_put_vcpu_ops: Release the driver.
> + * @name: Name of the driver to be releases.
> + *
> + * Hypervisor can use this API to release the driver.
> + */
> +void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
> +{
> + bpf_module_put(ops, ops->owner);
> +}
> +EXPORT_SYMBOL_GPL(pvsched_put_vcpu_ops);
> +
> +/*
> + * NOP vm_ops Sample implementation.
> + * This driver doesn't do anything other than registering itself.
> + * Placeholder for adding some default logic when the feature is
> + * complete.
> + */
> +static int nop_pvsched_vcpu_register(struct pid *pid)
> +{
> + return 0;
> +}
> +static void nop_pvsched_vcpu_unregister(struct pid *pid)
> +{
> +}
> +static void nop_pvsched_notify_event(void *addr, struct pid *pid, u32 event)
> +{
> +}
> +
> +struct pvsched_vcpu_ops nop_vcpu_ops = {
> + .events = PVSCHED_VCPU_VMENTER | PVSCHED_VCPU_VMEXIT | PVSCHED_VCPU_HALT,
> + .pvsched_vcpu_register = nop_pvsched_vcpu_register,
> + .pvsched_vcpu_unregister = nop_pvsched_vcpu_unregister,
> + .pvsched_vcpu_notify_event = nop_pvsched_notify_event,
> + .name = "pvsched_nop",
> + .owner = THIS_MODULE,
> +};
> +
> +static int __init pvsched_init(void)
> +{
> + return WARN_ON(pvsched_register_vcpu_ops(&nop_vcpu_ops));
> +}
> +
> +late_initcall(pvsched_init);
> --
> 2.40.1
>
© 2016 - 2026 Red Hat, Inc.