[PATCH RFC 5/6] nfsd: adjust number of running nfsd threads based on activity

Jeff Layton posted 6 patches 1 day, 7 hours ago
[PATCH RFC 5/6] nfsd: adjust number of running nfsd threads based on activity
Posted by Jeff Layton 1 day, 7 hours ago
This patch is based on a draft patch by Neil:

svc_recv() is changed to return a status.  This can be:

 -ETIMEDOUT - waited for 5 seconds and found nothing to do.  This is
          boring.  Also there are more actual threads than really
          needed.
 -EBUSY - I did something, but there is more stuff to do and no one
          idle who I can wake up to do it.
          BTW I successful set a flag: SP_TASK_STARTING.  You better
          clear it.
 0 - just minding my own business, nothing to see here.

nfsd() is changed to pay attention to this status.  In the case of
-ETIMEDOUT, if the service mutex can be taken (trylock), the thread
becomes and RQ_VICTIM so that it will exit.  In the case of -EBUSY, if
the actual number of threads is below the calculated maximum, a new
thread is started.  SP_TASK_STARTING is cleared.

To support the above, some code is split out of svc_start_kthreads()
into svc_new_thread().

I think we want memory pressure to be able to push a thread into
returning -ETIMEDOUT.  That can come later.

Signed-off-by: NeilBrown <neil@brown.name>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfssvc.c               | 35 ++++++++++++++++++++-
 fs/nfsd/trace.h                | 35 +++++++++++++++++++++
 include/linux/sunrpc/svc.h     |  2 ++
 include/linux/sunrpc/svcsock.h |  2 +-
 net/sunrpc/svc.c               | 69 ++++++++++++++++++++++++------------------
 net/sunrpc/svc_xprt.c          | 45 ++++++++++++++++++++++-----
 6 files changed, 148 insertions(+), 40 deletions(-)

diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 993ed338764b0ccd7bdfb76bd6fbb5dc6ab4022d..26c3a6cb1f400f1b757d26f6ba77e27deb7e8ee2 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -896,9 +896,11 @@ static int
 nfsd(void *vrqstp)
 {
 	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+	struct svc_pool *pool = rqstp->rq_pool;
 	struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
 	struct net *net = perm_sock->xpt_net;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool have_mutex = false;
 
 	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with the
@@ -916,7 +918,36 @@ nfsd(void *vrqstp)
 	 * The main request loop
 	 */
 	while (!svc_thread_should_stop(rqstp)) {
-		svc_recv(rqstp);
+		switch (svc_recv(rqstp)) {
+		case -ETIMEDOUT: /* Nothing to do */
+			if (mutex_trylock(&nfsd_mutex)) {
+				if (pool->sp_nrthreads > pool->sp_nrthrmin) {
+					trace_nfsd_dynthread_kill(net, pool);
+					set_bit(RQ_VICTIM, &rqstp->rq_flags);
+					have_mutex = true;
+				} else
+					mutex_unlock(&nfsd_mutex);
+			} else {
+				trace_nfsd_dynthread_trylock_fail(net, pool);
+			}
+			break;
+		case -EBUSY: /* Too much to do */
+			if (pool->sp_nrthreads < pool->sp_nrthrmax &&
+			    mutex_trylock(&nfsd_mutex)) {
+				// check no idle threads?
+				if (pool->sp_nrthreads < pool->sp_nrthrmax) {
+					trace_nfsd_dynthread_start(net, pool);
+					svc_new_thread(rqstp->rq_server, pool);
+				}
+				mutex_unlock(&nfsd_mutex);
+			} else {
+				trace_nfsd_dynthread_trylock_fail(net, pool);
+			}
+			clear_bit(SP_TASK_STARTING, &pool->sp_flags);
+			break;
+		default:
+			break;
+		}
 		nfsd_file_net_dispose(nn);
 	}
 
@@ -924,6 +955,8 @@ nfsd(void *vrqstp)
 
 	/* Release the thread */
 	svc_exit_thread(rqstp);
+	if (have_mutex)
+		mutex_unlock(&nfsd_mutex);
 	return 0;
 }
 
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 5ae2a611e57f4b4e51a4d9eb6e0fccb66ad8d288..8885fd9bead98ebf55379d68ab9c3701981a5150 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -91,6 +91,41 @@ DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \
 DEFINE_NFSD_XDR_ERR_EVENT(garbage_args);
 DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
 
+DECLARE_EVENT_CLASS(nfsd_dynthread_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct svc_pool *pool
+	),
+	TP_ARGS(net, pool),
+	TP_STRUCT__entry(
+		__field(unsigned int, netns_ino)
+		__field(unsigned int, pool_id)
+		__field(unsigned int, nrthreads)
+		__field(unsigned int, nrthrmin)
+		__field(unsigned int, nrthrmax)
+	),
+	TP_fast_assign(
+		__entry->netns_ino = net->ns.inum;
+		__entry->pool_id = pool->sp_id;
+		__entry->nrthreads = pool->sp_nrthreads;
+		__entry->nrthrmin = pool->sp_nrthrmin;
+		__entry->nrthrmax = pool->sp_nrthrmax;
+	),
+	TP_printk("pool=%u nrthreads=%u nrthrmin=%u nrthrmax=%u",
+		__entry->pool_id, __entry->nrthreads,
+		__entry->nrthrmin, __entry->nrthrmax
+	)
+);
+
+#define DEFINE_NFSD_DYNTHREAD_EVENT(name) \
+DEFINE_EVENT(nfsd_dynthread_class, nfsd_dynthread_##name, \
+	TP_PROTO(const struct net *net, const struct svc_pool *pool), \
+	TP_ARGS(net, pool))
+
+DEFINE_NFSD_DYNTHREAD_EVENT(start);
+DEFINE_NFSD_DYNTHREAD_EVENT(kill);
+DEFINE_NFSD_DYNTHREAD_EVENT(trylock_fail);
+
 #define show_nfsd_may_flags(x)						\
 	__print_flags(x, "|",						\
 		{ NFSD_MAY_EXEC,		"EXEC" },		\
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 35bd3247764ae8dc5dcdfffeea36f7cfefd13372..f47e19c9bd9466986438766e9ab7b4c71cda1ba6 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -55,6 +55,7 @@ enum {
 	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
 	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
 	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
+	SP_TASK_STARTING,	/* Task has started but not added to idle yet */
 };
 
 
@@ -442,6 +443,7 @@ struct svc_serv *svc_create(struct svc_program *, unsigned int,
 bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
 					 struct page *page);
 void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
+int		   svc_new_thread(struct svc_serv *serv, struct svc_pool *pool);
 void		   svc_exit_thread(struct svc_rqst *);
 struct svc_serv *  svc_create_pooled(struct svc_program *prog,
 				     unsigned int nprog,
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index de37069aba90899be19b1090e6e90e509a3cf530..5c87d3fedd33e7edf5ade32e60523cae7e9ebaba 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -61,7 +61,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk)
 /*
  * Function prototypes.
  */
-void		svc_recv(struct svc_rqst *rqstp);
+int		svc_recv(struct svc_rqst *rqstp);
 void		svc_send(struct svc_rqst *rqstp);
 int		svc_addsock(struct svc_serv *serv, struct net *net,
 			    const int fd, char *name_return, const size_t len,
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index dc818158f8529b62dcf96c91bd9a9d4ab21df91f..9fca2dd340037f82baa4936766ebe0e38c3f0d85 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -714,9 +714,6 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
 
 	rqstp->rq_err = -EAGAIN; /* No error yet */
 
-	serv->sv_nrthreads += 1;
-	pool->sp_nrthreads += 1;
-
 	/* Protected by whatever lock the service uses when calling
 	 * svc_set_num_threads()
 	 */
@@ -763,45 +760,57 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
 }
 EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
 
-static int
-svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool)
 {
 	struct svc_rqst	*rqstp;
 	struct task_struct *task;
 	int node;
 	int err;
 
-	do {
-		nrservs--;
-		node = svc_pool_map_get_node(pool->sp_id);
-
-		rqstp = svc_prepare_thread(serv, pool, node);
-		if (!rqstp)
-			return -ENOMEM;
-		task = kthread_create_on_node(serv->sv_threadfn, rqstp,
-					      node, "%s", serv->sv_name);
-		if (IS_ERR(task)) {
-			svc_exit_thread(rqstp);
-			return PTR_ERR(task);
-		}
+	node = svc_pool_map_get_node(pool->sp_id);
 
-		rqstp->rq_task = task;
-		if (serv->sv_nrpools > 1)
-			svc_pool_map_set_cpumask(task, pool->sp_id);
+	rqstp = svc_prepare_thread(serv, pool, node);
+	if (!rqstp)
+		return -ENOMEM;
+	set_bit(SP_TASK_STARTING, &pool->sp_flags);
+	task = kthread_create_on_node(serv->sv_threadfn, rqstp,
+				      node, "%s", serv->sv_name);
+	if (IS_ERR(task)) {
+		clear_bit(SP_TASK_STARTING, &pool->sp_flags);
+		svc_exit_thread(rqstp);
+		return PTR_ERR(task);
+	}
 
-		svc_sock_update_bufs(serv);
-		wake_up_process(task);
+	serv->sv_nrthreads += 1;
+	pool->sp_nrthreads += 1;
 
-		wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
-		err = rqstp->rq_err;
-		if (err) {
-			svc_exit_thread(rqstp);
-			return err;
-		}
-	} while (nrservs > 0);
+	rqstp->rq_task = task;
+	if (serv->sv_nrpools > 1)
+		svc_pool_map_set_cpumask(task, pool->sp_id);
 
+	svc_sock_update_bufs(serv);
+	wake_up_process(task);
+
+	wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
+	err = rqstp->rq_err;
+	if (err) {
+		svc_exit_thread(rqstp);
+		return err;
+	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(svc_new_thread);
+
+static int
+svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+	int err = 0;
+
+	while (!err && nrservs--)
+		err = svc_new_thread(serv, pool);
+
+	return err;
+}
 
 static int
 svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 6973184ff6675211b4338fac80105894e9c8d4df..9612334300c8dae38720a0f5c61c0f505432ec2f 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -714,15 +714,22 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
 	return true;
 }
 
-static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
+static bool nfsd_schedule_timeout(long timeout)
+{
+	return schedule_timeout(timeout) == 0;
+}
+
+static bool svc_thread_wait_for_work(struct svc_rqst *rqstp)
 {
 	struct svc_pool *pool = rqstp->rq_pool;
+	bool did_timeout = false;
 
 	if (svc_thread_should_sleep(rqstp)) {
 		set_current_state(TASK_IDLE | TASK_FREEZABLE);
 		llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
+		clear_bit(SP_TASK_STARTING, &pool->sp_flags);
 		if (likely(svc_thread_should_sleep(rqstp)))
-			schedule();
+			did_timeout = nfsd_schedule_timeout(5 * HZ);
 
 		while (!llist_del_first_this(&pool->sp_idle_threads,
 					     &rqstp->rq_idle)) {
@@ -734,7 +741,10 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
 			 * for this new work.  This thread can safely sleep
 			 * until woken again.
 			 */
-			schedule();
+			if (did_timeout)
+				did_timeout = nfsd_schedule_timeout(HZ);
+			else
+				did_timeout = nfsd_schedule_timeout(5 * HZ);
 			set_current_state(TASK_IDLE | TASK_FREEZABLE);
 		}
 		__set_current_state(TASK_RUNNING);
@@ -742,6 +752,7 @@ static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
 		cond_resched();
 	}
 	try_to_freeze();
+	return did_timeout;
 }
 
 static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
@@ -825,6 +836,8 @@ static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 
 static void svc_thread_wake_next(struct svc_rqst *rqstp)
 {
+	clear_bit(SP_TASK_STARTING, &rqstp->rq_pool->sp_flags);
+
 	if (!svc_thread_should_sleep(rqstp))
 		/* More work pending after I dequeued some,
 		 * wake another worker
@@ -839,21 +852,31 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp)
  * This code is carefully organised not to touch any cachelines in
  * the shared svc_serv structure, only cachelines in the local
  * svc_pool.
+ *
+ * Returns -ETIMEDOUT if idle for an extended period
+ *         -EBUSY is there is more work to do than available threads
+ *         0 otherwise.
  */
-void svc_recv(struct svc_rqst *rqstp)
+int svc_recv(struct svc_rqst *rqstp)
 {
 	struct svc_pool *pool = rqstp->rq_pool;
+	bool did_wait;
+	int ret = 0;
 
 	if (!svc_alloc_arg(rqstp))
-		return;
+		return ret;
+
+	did_wait = svc_thread_wait_for_work(rqstp);
 
-	svc_thread_wait_for_work(rqstp);
+	if (did_wait && svc_thread_should_sleep(rqstp) &&
+	    pool->sp_nrthrmin && (pool->sp_nrthreads > pool->sp_nrthrmin))
+		ret = -ETIMEDOUT;
 
 	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
 
 	if (svc_thread_should_stop(rqstp)) {
 		svc_thread_wake_next(rqstp);
-		return;
+		return ret;
 	}
 
 	rqstp->rq_xprt = svc_xprt_dequeue(pool);
@@ -867,8 +890,13 @@ void svc_recv(struct svc_rqst *rqstp)
 		 */
 		if (pool->sp_idle_threads.first)
 			rqstp->rq_chandle.thread_wait = 5 * HZ;
-		else
+		else {
 			rqstp->rq_chandle.thread_wait = 1 * HZ;
+			if (!did_wait &&
+			    !test_and_set_bit(SP_TASK_STARTING,
+					      &pool->sp_flags))
+				ret = -EBUSY;
+		}
 
 		trace_svc_xprt_dequeue(rqstp);
 		svc_handle_xprt(rqstp, xprt);
@@ -887,6 +915,7 @@ void svc_recv(struct svc_rqst *rqstp)
 		}
 	}
 #endif
+	return ret;
 }
 EXPORT_SYMBOL_GPL(svc_recv);
 

-- 
2.52.0
Re: [PATCH RFC 5/6] nfsd: adjust number of running nfsd threads based on activity
Posted by Chuck Lever 9 hours ago

On Fri, Dec 12, 2025, at 5:39 PM, Jeff Layton wrote:
> This patch is based on a draft patch by Neil:
>
> svc_recv() is changed to return a status.  This can be:
>
>  -ETIMEDOUT - waited for 5 seconds and found nothing to do.  This is
>           boring.  Also there are more actual threads than really
>           needed.
>  -EBUSY - I did something, but there is more stuff to do and no one
>           idle who I can wake up to do it.
>           BTW I successful set a flag: SP_TASK_STARTING.  You better
>           clear it.
>  0 - just minding my own business, nothing to see here.
>
> nfsd() is changed to pay attention to this status.  In the case of
> -ETIMEDOUT, if the service mutex can be taken (trylock), the thread
> becomes and RQ_VICTIM so that it will exit.  In the case of -EBUSY, if
> the actual number of threads is below the calculated maximum, a new
> thread is started.  SP_TASK_STARTING is cleared.

Jeff, since you reworked things to be based on a minimum rather
than a maximum count, is this paragraph now stale?


> To support the above, some code is split out of svc_start_kthreads()
> into svc_new_thread().
>
> I think we want memory pressure to be able to push a thread into
> returning -ETIMEDOUT.  That can come later.
>
> Signed-off-by: NeilBrown <neil@brown.name>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfssvc.c               | 35 ++++++++++++++++++++-
>  fs/nfsd/trace.h                | 35 +++++++++++++++++++++
>  include/linux/sunrpc/svc.h     |  2 ++
>  include/linux/sunrpc/svcsock.h |  2 +-
>  net/sunrpc/svc.c               | 69 ++++++++++++++++++++++++------------------
>  net/sunrpc/svc_xprt.c          | 45 ++++++++++++++++++++++-----
>  6 files changed, 148 insertions(+), 40 deletions(-)
>
> diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> index 
> 993ed338764b0ccd7bdfb76bd6fbb5dc6ab4022d..26c3a6cb1f400f1b757d26f6ba77e27deb7e8ee2 
> 100644
> --- a/fs/nfsd/nfssvc.c
> +++ b/fs/nfsd/nfssvc.c
> @@ -896,9 +896,11 @@ static int
>  nfsd(void *vrqstp)
>  {
>  	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
> +	struct svc_pool *pool = rqstp->rq_pool;
>  	struct svc_xprt *perm_sock = 
> list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct 
> svc_xprt), xpt_list);
>  	struct net *net = perm_sock->xpt_net;
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool have_mutex = false;
> 
>  	/* At this point, the thread shares current->fs
>  	 * with the init process. We need to create files with the
> @@ -916,7 +918,36 @@ nfsd(void *vrqstp)
>  	 * The main request loop
>  	 */
>  	while (!svc_thread_should_stop(rqstp)) {
> -		svc_recv(rqstp);
> +		switch (svc_recv(rqstp)) {
> +		case -ETIMEDOUT: /* Nothing to do */
> +			if (mutex_trylock(&nfsd_mutex)) {
> +				if (pool->sp_nrthreads > pool->sp_nrthrmin) {
> +					trace_nfsd_dynthread_kill(net, pool);
> +					set_bit(RQ_VICTIM, &rqstp->rq_flags);
> +					have_mutex = true;
> +				} else
> +					mutex_unlock(&nfsd_mutex);
> +			} else {
> +				trace_nfsd_dynthread_trylock_fail(net, pool);
> +			}
> +			break;
> +		case -EBUSY: /* Too much to do */
> +			if (pool->sp_nrthreads < pool->sp_nrthrmax &&
> +			    mutex_trylock(&nfsd_mutex)) {
> +				// check no idle threads?

Can this comment be clarified? It looks like a note-to-self, that maybe
something is unfinished.


> +				if (pool->sp_nrthreads < pool->sp_nrthrmax) {
> +					trace_nfsd_dynthread_start(net, pool);
> +					svc_new_thread(rqstp->rq_server, pool);
> +				}
> +				mutex_unlock(&nfsd_mutex);
> +			} else {
> +				trace_nfsd_dynthread_trylock_fail(net, pool);
> +			}
> +			clear_bit(SP_TASK_STARTING, &pool->sp_flags);
> +			break;
> +		default:
> +			break;
> +		}
>  		nfsd_file_net_dispose(nn);
>  	}
> 
> @@ -924,6 +955,8 @@ nfsd(void *vrqstp)
> 
>  	/* Release the thread */
>  	svc_exit_thread(rqstp);
> +	if (have_mutex)
> +		mutex_unlock(&nfsd_mutex);
>  	return 0;
>  }
> 
> diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> index 
> 5ae2a611e57f4b4e51a4d9eb6e0fccb66ad8d288..8885fd9bead98ebf55379d68ab9c3701981a5150 
> 100644
> --- a/fs/nfsd/trace.h
> +++ b/fs/nfsd/trace.h
> @@ -91,6 +91,41 @@ DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \
>  DEFINE_NFSD_XDR_ERR_EVENT(garbage_args);
>  DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
> 
> +DECLARE_EVENT_CLASS(nfsd_dynthread_class,
> +	TP_PROTO(
> +		const struct net *net,
> +		const struct svc_pool *pool
> +	),
> +	TP_ARGS(net, pool),
> +	TP_STRUCT__entry(
> +		__field(unsigned int, netns_ino)
> +		__field(unsigned int, pool_id)
> +		__field(unsigned int, nrthreads)
> +		__field(unsigned int, nrthrmin)
> +		__field(unsigned int, nrthrmax)
> +	),
> +	TP_fast_assign(
> +		__entry->netns_ino = net->ns.inum;
> +		__entry->pool_id = pool->sp_id;
> +		__entry->nrthreads = pool->sp_nrthreads;
> +		__entry->nrthrmin = pool->sp_nrthrmin;
> +		__entry->nrthrmax = pool->sp_nrthrmax;
> +	),
> +	TP_printk("pool=%u nrthreads=%u nrthrmin=%u nrthrmax=%u",
> +		__entry->pool_id, __entry->nrthreads,
> +		__entry->nrthrmin, __entry->nrthrmax
> +	)
> +);
> +
> +#define DEFINE_NFSD_DYNTHREAD_EVENT(name) \
> +DEFINE_EVENT(nfsd_dynthread_class, nfsd_dynthread_##name, \
> +	TP_PROTO(const struct net *net, const struct svc_pool *pool), \
> +	TP_ARGS(net, pool))
> +
> +DEFINE_NFSD_DYNTHREAD_EVENT(start);
> +DEFINE_NFSD_DYNTHREAD_EVENT(kill);
> +DEFINE_NFSD_DYNTHREAD_EVENT(trylock_fail);
> +
>  #define show_nfsd_may_flags(x)						\
>  	__print_flags(x, "|",						\
>  		{ NFSD_MAY_EXEC,		"EXEC" },		\
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 
> 35bd3247764ae8dc5dcdfffeea36f7cfefd13372..f47e19c9bd9466986438766e9ab7b4c71cda1ba6 
> 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -55,6 +55,7 @@ enum {
>  	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
>  	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
>  	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
> +	SP_TASK_STARTING,	/* Task has started but not added to idle yet */
>  };
> 
> 
> @@ -442,6 +443,7 @@ struct svc_serv *svc_create(struct svc_program *, 
> unsigned int,
>  bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
>  					 struct page *page);
>  void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
> +int		   svc_new_thread(struct svc_serv *serv, struct svc_pool *pool);
>  void		   svc_exit_thread(struct svc_rqst *);
>  struct svc_serv *  svc_create_pooled(struct svc_program *prog,
>  				     unsigned int nprog,
> diff --git a/include/linux/sunrpc/svcsock.h 
> b/include/linux/sunrpc/svcsock.h
> index 
> de37069aba90899be19b1090e6e90e509a3cf530..5c87d3fedd33e7edf5ade32e60523cae7e9ebaba 
> 100644
> --- a/include/linux/sunrpc/svcsock.h
> +++ b/include/linux/sunrpc/svcsock.h
> @@ -61,7 +61,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock 
> *svsk)
>  /*
>   * Function prototypes.
>   */
> -void		svc_recv(struct svc_rqst *rqstp);
> +int		svc_recv(struct svc_rqst *rqstp);
>  void		svc_send(struct svc_rqst *rqstp);
>  int		svc_addsock(struct svc_serv *serv, struct net *net,
>  			    const int fd, char *name_return, const size_t len,
> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index 
> dc818158f8529b62dcf96c91bd9a9d4ab21df91f..9fca2dd340037f82baa4936766ebe0e38c3f0d85 
> 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c
> @@ -714,9 +714,6 @@ svc_prepare_thread(struct svc_serv *serv, struct 
> svc_pool *pool, int node)
> 
>  	rqstp->rq_err = -EAGAIN; /* No error yet */
> 
> -	serv->sv_nrthreads += 1;
> -	pool->sp_nrthreads += 1;
> -
>  	/* Protected by whatever lock the service uses when calling
>  	 * svc_set_num_threads()
>  	 */
> @@ -763,45 +760,57 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
>  }
>  EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
> 
> -static int
> -svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
> +int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool)

Is now an exported function, should get a kdoc comment.


>  {
>  	struct svc_rqst	*rqstp;
>  	struct task_struct *task;
>  	int node;
>  	int err;
> 
> -	do {
> -		nrservs--;
> -		node = svc_pool_map_get_node(pool->sp_id);
> -
> -		rqstp = svc_prepare_thread(serv, pool, node);
> -		if (!rqstp)
> -			return -ENOMEM;
> -		task = kthread_create_on_node(serv->sv_threadfn, rqstp,
> -					      node, "%s", serv->sv_name);
> -		if (IS_ERR(task)) {
> -			svc_exit_thread(rqstp);
> -			return PTR_ERR(task);
> -		}
> +	node = svc_pool_map_get_node(pool->sp_id);
> 
> -		rqstp->rq_task = task;
> -		if (serv->sv_nrpools > 1)
> -			svc_pool_map_set_cpumask(task, pool->sp_id);
> +	rqstp = svc_prepare_thread(serv, pool, node);
> +	if (!rqstp)
> +		return -ENOMEM;
> +	set_bit(SP_TASK_STARTING, &pool->sp_flags);
> +	task = kthread_create_on_node(serv->sv_threadfn, rqstp,
> +				      node, "%s", serv->sv_name);
> +	if (IS_ERR(task)) {
> +		clear_bit(SP_TASK_STARTING, &pool->sp_flags);
> +		svc_exit_thread(rqstp);

svc_exit_thread() decrements serv->sv_nrthreads and pool->sp_nrthreads
but this call site hasn't incremented them yet. Perhaps this error
flow needs a simpler clean-up than calling svc_exit_thread().


> +		return PTR_ERR(task);
> +	}
> 
> -		svc_sock_update_bufs(serv);
> -		wake_up_process(task);
> +	serv->sv_nrthreads += 1;
> +	pool->sp_nrthreads += 1;
> 
> -		wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
> -		err = rqstp->rq_err;
> -		if (err) {
> -			svc_exit_thread(rqstp);
> -			return err;
> -		}
> -	} while (nrservs > 0);
> +	rqstp->rq_task = task;
> +	if (serv->sv_nrpools > 1)
> +		svc_pool_map_set_cpumask(task, pool->sp_id);
> 
> +	svc_sock_update_bufs(serv);
> +	wake_up_process(task);
> +
> +	wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
> +	err = rqstp->rq_err;
> +	if (err) {
> +		svc_exit_thread(rqstp);
> +		return err;
> +	}
>  	return 0;
>  }
> +EXPORT_SYMBOL_GPL(svc_new_thread);
> +
> +static int
> +svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
> +{
> +	int err = 0;
> +
> +	while (!err && nrservs--)
> +		err = svc_new_thread(serv, pool);
> +
> +	return err;
> +}
> 
>  static int
>  svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int 
> nrservs)
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index 
> 6973184ff6675211b4338fac80105894e9c8d4df..9612334300c8dae38720a0f5c61c0f505432ec2f 
> 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -714,15 +714,22 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
>  	return true;
>  }
> 
> -static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
> +static bool nfsd_schedule_timeout(long timeout)

Perhaps svc_schedule_timeout() is a more appropriate name for
a function that resides in net/sunrpc/svc_xprt.c.


> +{
> +	return schedule_timeout(timeout) == 0;
> +}
> +
> +static bool svc_thread_wait_for_work(struct svc_rqst *rqstp)
>  {
>  	struct svc_pool *pool = rqstp->rq_pool;
> +	bool did_timeout = false;
> 
>  	if (svc_thread_should_sleep(rqstp)) {
>  		set_current_state(TASK_IDLE | TASK_FREEZABLE);
>  		llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
> +		clear_bit(SP_TASK_STARTING, &pool->sp_flags);
>  		if (likely(svc_thread_should_sleep(rqstp)))
> -			schedule();
> +			did_timeout = nfsd_schedule_timeout(5 * HZ);
> 
>  		while (!llist_del_first_this(&pool->sp_idle_threads,
>  					     &rqstp->rq_idle)) {
> @@ -734,7 +741,10 @@ static void svc_thread_wait_for_work(struct 
> svc_rqst *rqstp)
>  			 * for this new work.  This thread can safely sleep
>  			 * until woken again.
>  			 */
> -			schedule();
> +			if (did_timeout)
> +				did_timeout = nfsd_schedule_timeout(HZ);
> +			else
> +				did_timeout = nfsd_schedule_timeout(5 * HZ);
>  			set_current_state(TASK_IDLE | TASK_FREEZABLE);
>  		}
>  		__set_current_state(TASK_RUNNING);
> @@ -742,6 +752,7 @@ static void svc_thread_wait_for_work(struct 
> svc_rqst *rqstp)
>  		cond_resched();
>  	}
>  	try_to_freeze();
> +	return did_timeout;
>  }
> 
>  static void svc_add_new_temp_xprt(struct svc_serv *serv, struct 
> svc_xprt *newxpt)
> @@ -825,6 +836,8 @@ static void svc_handle_xprt(struct svc_rqst *rqstp, 
> struct svc_xprt *xprt)
> 
>  static void svc_thread_wake_next(struct svc_rqst *rqstp)
>  {
> +	clear_bit(SP_TASK_STARTING, &rqstp->rq_pool->sp_flags);
> +
>  	if (!svc_thread_should_sleep(rqstp))
>  		/* More work pending after I dequeued some,
>  		 * wake another worker
> @@ -839,21 +852,31 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp)
>   * This code is carefully organised not to touch any cachelines in
>   * the shared svc_serv structure, only cachelines in the local
>   * svc_pool.
> + *
> + * Returns -ETIMEDOUT if idle for an extended period
> + *         -EBUSY is there is more work to do than available threads
> + *         0 otherwise.
>   */
> -void svc_recv(struct svc_rqst *rqstp)
> +int svc_recv(struct svc_rqst *rqstp)
>  {
>  	struct svc_pool *pool = rqstp->rq_pool;
> +	bool did_wait;
> +	int ret = 0;
> 
>  	if (!svc_alloc_arg(rqstp))
> -		return;
> +		return ret;
> +
> +	did_wait = svc_thread_wait_for_work(rqstp);
> 
> -	svc_thread_wait_for_work(rqstp);
> +	if (did_wait && svc_thread_should_sleep(rqstp) &&
> +	    pool->sp_nrthrmin && (pool->sp_nrthreads > pool->sp_nrthrmin))
> +		ret = -ETIMEDOUT;
> 
>  	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
> 
>  	if (svc_thread_should_stop(rqstp)) {
>  		svc_thread_wake_next(rqstp);
> -		return;
> +		return ret;
>  	}
> 
>  	rqstp->rq_xprt = svc_xprt_dequeue(pool);
> @@ -867,8 +890,13 @@ void svc_recv(struct svc_rqst *rqstp)
>  		 */
>  		if (pool->sp_idle_threads.first)
>  			rqstp->rq_chandle.thread_wait = 5 * HZ;
> -		else
> +		else {
>  			rqstp->rq_chandle.thread_wait = 1 * HZ;
> +			if (!did_wait &&
> +			    !test_and_set_bit(SP_TASK_STARTING,
> +					      &pool->sp_flags))
> +				ret = -EBUSY;
> +		}
> 
>  		trace_svc_xprt_dequeue(rqstp);
>  		svc_handle_xprt(rqstp, xprt);
> @@ -887,6 +915,7 @@ void svc_recv(struct svc_rqst *rqstp)
>  		}
>  	}
>  #endif
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(svc_recv);
> 
>
> -- 
> 2.52.0

The extensive use of atomic bit ops here is a little worrying.
Those can be costly -- and the sp_flags field is going to get
poked at by more and more threads as the pool's thread count
increases.


-- 
Chuck Lever
Re: [PATCH RFC 5/6] nfsd: adjust number of running nfsd threads based on activity
Posted by Jeff Layton 8 hours ago
On Sat, 2025-12-13 at 15:54 -0500, Chuck Lever wrote:
> 
> On Fri, Dec 12, 2025, at 5:39 PM, Jeff Layton wrote:
> > This patch is based on a draft patch by Neil:
> > 
> > svc_recv() is changed to return a status.  This can be:
> > 
> >  -ETIMEDOUT - waited for 5 seconds and found nothing to do.  This is
> >           boring.  Also there are more actual threads than really
> >           needed.
> >  -EBUSY - I did something, but there is more stuff to do and no one
> >           idle who I can wake up to do it.
> >           BTW I successful set a flag: SP_TASK_STARTING.  You better
> >           clear it.
> >  0 - just minding my own business, nothing to see here.
> > 
> > nfsd() is changed to pay attention to this status.  In the case of
> > -ETIMEDOUT, if the service mutex can be taken (trylock), the thread
> > becomes and RQ_VICTIM so that it will exit.  In the case of -EBUSY, if
> > the actual number of threads is below the calculated maximum, a new
> > thread is started.  SP_TASK_STARTING is cleared.
> 
> Jeff, since you reworked things to be based on a minimum rather
> than a maximum count, is this paragraph now stale?
> 
> 

Yes, it is. Will fix.

> > To support the above, some code is split out of svc_start_kthreads()
> > into svc_new_thread().
> > 
> > I think we want memory pressure to be able to push a thread into
> > returning -ETIMEDOUT.  That can come later.
> > 
> > Signed-off-by: NeilBrown <neil@brown.name>
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/nfsd/nfssvc.c               | 35 ++++++++++++++++++++-
> >  fs/nfsd/trace.h                | 35 +++++++++++++++++++++
> >  include/linux/sunrpc/svc.h     |  2 ++
> >  include/linux/sunrpc/svcsock.h |  2 +-
> >  net/sunrpc/svc.c               | 69 ++++++++++++++++++++++++------------------
> >  net/sunrpc/svc_xprt.c          | 45 ++++++++++++++++++++++-----
> >  6 files changed, 148 insertions(+), 40 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
> > index 
> > 993ed338764b0ccd7bdfb76bd6fbb5dc6ab4022d..26c3a6cb1f400f1b757d26f6ba77e27deb7e8ee2 
> > 100644
> > --- a/fs/nfsd/nfssvc.c
> > +++ b/fs/nfsd/nfssvc.c
> > @@ -896,9 +896,11 @@ static int
> >  nfsd(void *vrqstp)
> >  {
> >  	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
> > +	struct svc_pool *pool = rqstp->rq_pool;
> >  	struct svc_xprt *perm_sock = 
> > list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct 
> > svc_xprt), xpt_list);
> >  	struct net *net = perm_sock->xpt_net;
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	bool have_mutex = false;
> > 
> >  	/* At this point, the thread shares current->fs
> >  	 * with the init process. We need to create files with the
> > @@ -916,7 +918,36 @@ nfsd(void *vrqstp)
> >  	 * The main request loop
> >  	 */
> >  	while (!svc_thread_should_stop(rqstp)) {
> > -		svc_recv(rqstp);
> > +		switch (svc_recv(rqstp)) {
> > +		case -ETIMEDOUT: /* Nothing to do */
> > +			if (mutex_trylock(&nfsd_mutex)) {
> > +				if (pool->sp_nrthreads > pool->sp_nrthrmin) {
> > +					trace_nfsd_dynthread_kill(net, pool);
> > +					set_bit(RQ_VICTIM, &rqstp->rq_flags);
> > +					have_mutex = true;
> > +				} else
> > +					mutex_unlock(&nfsd_mutex);
> > +			} else {
> > +				trace_nfsd_dynthread_trylock_fail(net, pool);
> > +			}
> > +			break;
> > +		case -EBUSY: /* Too much to do */
> > +			if (pool->sp_nrthreads < pool->sp_nrthrmax &&
> > +			    mutex_trylock(&nfsd_mutex)) {
> > +				// check no idle threads?
> 
> Can this comment be clarified? It looks like a note-to-self, that maybe
> something is unfinished.
> 

That's leftover from Neil's original patch. I'm not sure what his
thinking was there. I'll plan to remove it.

> 
> > +				if (pool->sp_nrthreads < pool->sp_nrthrmax) {
> > +					trace_nfsd_dynthread_start(net, pool);
> > +					svc_new_thread(rqstp->rq_server, pool);
> > +				}
> > +				mutex_unlock(&nfsd_mutex);
> > +			} else {
> > +				trace_nfsd_dynthread_trylock_fail(net, pool);
> > +			}
> > +			clear_bit(SP_TASK_STARTING, &pool->sp_flags);
> > +			break;
> > +		default:
> > +			break;
> > +		}
> >  		nfsd_file_net_dispose(nn);
> >  	}
> > 
> > @@ -924,6 +955,8 @@ nfsd(void *vrqstp)
> > 
> >  	/* Release the thread */
> >  	svc_exit_thread(rqstp);
> > +	if (have_mutex)
> > +		mutex_unlock(&nfsd_mutex);
> >  	return 0;
> >  }
> > 
> > diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
> > index 
> > 5ae2a611e57f4b4e51a4d9eb6e0fccb66ad8d288..8885fd9bead98ebf55379d68ab9c3701981a5150 
> > 100644
> > --- a/fs/nfsd/trace.h
> > +++ b/fs/nfsd/trace.h
> > @@ -91,6 +91,41 @@ DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \
> >  DEFINE_NFSD_XDR_ERR_EVENT(garbage_args);
> >  DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
> > 
> > +DECLARE_EVENT_CLASS(nfsd_dynthread_class,
> > +	TP_PROTO(
> > +		const struct net *net,
> > +		const struct svc_pool *pool
> > +	),
> > +	TP_ARGS(net, pool),
> > +	TP_STRUCT__entry(
> > +		__field(unsigned int, netns_ino)
> > +		__field(unsigned int, pool_id)
> > +		__field(unsigned int, nrthreads)
> > +		__field(unsigned int, nrthrmin)
> > +		__field(unsigned int, nrthrmax)
> > +	),
> > +	TP_fast_assign(
> > +		__entry->netns_ino = net->ns.inum;
> > +		__entry->pool_id = pool->sp_id;
> > +		__entry->nrthreads = pool->sp_nrthreads;
> > +		__entry->nrthrmin = pool->sp_nrthrmin;
> > +		__entry->nrthrmax = pool->sp_nrthrmax;
> > +	),
> > +	TP_printk("pool=%u nrthreads=%u nrthrmin=%u nrthrmax=%u",
> > +		__entry->pool_id, __entry->nrthreads,
> > +		__entry->nrthrmin, __entry->nrthrmax
> > +	)
> > +);
> > +
> > +#define DEFINE_NFSD_DYNTHREAD_EVENT(name) \
> > +DEFINE_EVENT(nfsd_dynthread_class, nfsd_dynthread_##name, \
> > +	TP_PROTO(const struct net *net, const struct svc_pool *pool), \
> > +	TP_ARGS(net, pool))
> > +
> > +DEFINE_NFSD_DYNTHREAD_EVENT(start);
> > +DEFINE_NFSD_DYNTHREAD_EVENT(kill);
> > +DEFINE_NFSD_DYNTHREAD_EVENT(trylock_fail);
> > +
> >  #define show_nfsd_may_flags(x)						\
> >  	__print_flags(x, "|",						\
> >  		{ NFSD_MAY_EXEC,		"EXEC" },		\
> > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> > index 
> > 35bd3247764ae8dc5dcdfffeea36f7cfefd13372..f47e19c9bd9466986438766e9ab7b4c71cda1ba6 
> > 100644
> > --- a/include/linux/sunrpc/svc.h
> > +++ b/include/linux/sunrpc/svc.h
> > @@ -55,6 +55,7 @@ enum {
> >  	SP_TASK_PENDING,	/* still work to do even if no xprt is queued */
> >  	SP_NEED_VICTIM,		/* One thread needs to agree to exit */
> >  	SP_VICTIM_REMAINS,	/* One thread needs to actually exit */
> > +	SP_TASK_STARTING,	/* Task has started but not added to idle yet */
> >  };
> > 
> > 
> > @@ -442,6 +443,7 @@ struct svc_serv *svc_create(struct svc_program *, 
> > unsigned int,
> >  bool		   svc_rqst_replace_page(struct svc_rqst *rqstp,
> >  					 struct page *page);
> >  void		   svc_rqst_release_pages(struct svc_rqst *rqstp);
> > +int		   svc_new_thread(struct svc_serv *serv, struct svc_pool *pool);
> >  void		   svc_exit_thread(struct svc_rqst *);
> >  struct svc_serv *  svc_create_pooled(struct svc_program *prog,
> >  				     unsigned int nprog,
> > diff --git a/include/linux/sunrpc/svcsock.h 
> > b/include/linux/sunrpc/svcsock.h
> > index 
> > de37069aba90899be19b1090e6e90e509a3cf530..5c87d3fedd33e7edf5ade32e60523cae7e9ebaba 
> > 100644
> > --- a/include/linux/sunrpc/svcsock.h
> > +++ b/include/linux/sunrpc/svcsock.h
> > @@ -61,7 +61,7 @@ static inline u32 svc_sock_final_rec(struct svc_sock 
> > *svsk)
> >  /*
> >   * Function prototypes.
> >   */
> > -void		svc_recv(struct svc_rqst *rqstp);
> > +int		svc_recv(struct svc_rqst *rqstp);
> >  void		svc_send(struct svc_rqst *rqstp);
> >  int		svc_addsock(struct svc_serv *serv, struct net *net,
> >  			    const int fd, char *name_return, const size_t len,
> > diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> > index 
> > dc818158f8529b62dcf96c91bd9a9d4ab21df91f..9fca2dd340037f82baa4936766ebe0e38c3f0d85 
> > 100644
> > --- a/net/sunrpc/svc.c
> > +++ b/net/sunrpc/svc.c
> > @@ -714,9 +714,6 @@ svc_prepare_thread(struct svc_serv *serv, struct 
> > svc_pool *pool, int node)
> > 
> >  	rqstp->rq_err = -EAGAIN; /* No error yet */
> > 
> > -	serv->sv_nrthreads += 1;
> > -	pool->sp_nrthreads += 1;
> > -
> >  	/* Protected by whatever lock the service uses when calling
> >  	 * svc_set_num_threads()
> >  	 */
> > @@ -763,45 +760,57 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
> >  }
> >  EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
> > 
> > -static int
> > -svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
> > +int svc_new_thread(struct svc_serv *serv, struct svc_pool *pool)
> 
> Is now an exported function, should get a kdoc comment.
> 

ACK.

> 
> >  {
> >  	struct svc_rqst	*rqstp;
> >  	struct task_struct *task;
> >  	int node;
> >  	int err;
> > 
> > -	do {
> > -		nrservs--;
> > -		node = svc_pool_map_get_node(pool->sp_id);
> > -
> > -		rqstp = svc_prepare_thread(serv, pool, node);
> > -		if (!rqstp)
> > -			return -ENOMEM;
> > -		task = kthread_create_on_node(serv->sv_threadfn, rqstp,
> > -					      node, "%s", serv->sv_name);
> > -		if (IS_ERR(task)) {
> > -			svc_exit_thread(rqstp);
> > -			return PTR_ERR(task);
> > -		}
> > +	node = svc_pool_map_get_node(pool->sp_id);
> > 
> > -		rqstp->rq_task = task;
> > -		if (serv->sv_nrpools > 1)
> > -			svc_pool_map_set_cpumask(task, pool->sp_id);
> > +	rqstp = svc_prepare_thread(serv, pool, node);
> > +	if (!rqstp)
> > +		return -ENOMEM;
> > +	set_bit(SP_TASK_STARTING, &pool->sp_flags);
> > +	task = kthread_create_on_node(serv->sv_threadfn, rqstp,
> > +				      node, "%s", serv->sv_name);
> > +	if (IS_ERR(task)) {
> > +		clear_bit(SP_TASK_STARTING, &pool->sp_flags);
> > +		svc_exit_thread(rqstp);
> 
> svc_exit_thread() decrements serv->sv_nrthreads and pool->sp_nrthreads
> but this call site hasn't incremented them yet. Perhaps this error
> flow needs a simpler clean-up than calling svc_exit_thread().
> 

ACK. I'll give that a harder look.

> 
> > +		return PTR_ERR(task);
> > +	}
> > 
> > -		svc_sock_update_bufs(serv);
> > -		wake_up_process(task);
> > +	serv->sv_nrthreads += 1;
> > +	pool->sp_nrthreads += 1;
> > 
> > -		wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
> > -		err = rqstp->rq_err;
> > -		if (err) {
> > -			svc_exit_thread(rqstp);
> > -			return err;
> > -		}
> > -	} while (nrservs > 0);
> > +	rqstp->rq_task = task;
> > +	if (serv->sv_nrpools > 1)
> > +		svc_pool_map_set_cpumask(task, pool->sp_id);
> > 
> > +	svc_sock_update_bufs(serv);
> > +	wake_up_process(task);
> > +
> > +	wait_var_event(&rqstp->rq_err, rqstp->rq_err != -EAGAIN);
> > +	err = rqstp->rq_err;
> > +	if (err) {
> > +		svc_exit_thread(rqstp);
> > +		return err;
> > +	}
> >  	return 0;
> >  }
> > +EXPORT_SYMBOL_GPL(svc_new_thread);
> > +
> > +static int
> > +svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
> > +{
> > +	int err = 0;
> > +
> > +	while (!err && nrservs--)
> > +		err = svc_new_thread(serv, pool);
> > +
> > +	return err;
> > +}
> > 
> >  static int
> >  svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int 
> > nrservs)
> > diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> > index 
> > 6973184ff6675211b4338fac80105894e9c8d4df..9612334300c8dae38720a0f5c61c0f505432ec2f 
> > 100644
> > --- a/net/sunrpc/svc_xprt.c
> > +++ b/net/sunrpc/svc_xprt.c
> > @@ -714,15 +714,22 @@ svc_thread_should_sleep(struct svc_rqst *rqstp)
> >  	return true;
> >  }
> > 
> > -static void svc_thread_wait_for_work(struct svc_rqst *rqstp)
> > +static bool nfsd_schedule_timeout(long timeout)
> 
> Perhaps svc_schedule_timeout() is a more appropriate name for
> a function that resides in net/sunrpc/svc_xprt.c.
> 

Sounds good.

> 
> > +{
> > +	return schedule_timeout(timeout) == 0;
> > +}
> > +
> > +static bool svc_thread_wait_for_work(struct svc_rqst *rqstp)
> >  {
> >  	struct svc_pool *pool = rqstp->rq_pool;
> > +	bool did_timeout = false;
> > 
> >  	if (svc_thread_should_sleep(rqstp)) {
> >  		set_current_state(TASK_IDLE | TASK_FREEZABLE);
> >  		llist_add(&rqstp->rq_idle, &pool->sp_idle_threads);
> > +		clear_bit(SP_TASK_STARTING, &pool->sp_flags);
> >  		if (likely(svc_thread_should_sleep(rqstp)))
> > -			schedule();
> > +			did_timeout = nfsd_schedule_timeout(5 * HZ);
> > 
> >  		while (!llist_del_first_this(&pool->sp_idle_threads,
> >  					     &rqstp->rq_idle)) {
> > @@ -734,7 +741,10 @@ static void svc_thread_wait_for_work(struct 
> > svc_rqst *rqstp)
> >  			 * for this new work.  This thread can safely sleep
> >  			 * until woken again.
> >  			 */
> > -			schedule();
> > +			if (did_timeout)
> > +				did_timeout = nfsd_schedule_timeout(HZ);
> > +			else
> > +				did_timeout = nfsd_schedule_timeout(5 * HZ);
> >  			set_current_state(TASK_IDLE | TASK_FREEZABLE);
> >  		}
> >  		__set_current_state(TASK_RUNNING);
> > @@ -742,6 +752,7 @@ static void svc_thread_wait_for_work(struct 
> > svc_rqst *rqstp)
> >  		cond_resched();
> >  	}
> >  	try_to_freeze();
> > +	return did_timeout;
> >  }
> > 
> >  static void svc_add_new_temp_xprt(struct svc_serv *serv, struct 
> > svc_xprt *newxpt)
> > @@ -825,6 +836,8 @@ static void svc_handle_xprt(struct svc_rqst *rqstp, 
> > struct svc_xprt *xprt)
> > 
> >  static void svc_thread_wake_next(struct svc_rqst *rqstp)
> >  {
> > +	clear_bit(SP_TASK_STARTING, &rqstp->rq_pool->sp_flags);
> > +
> >  	if (!svc_thread_should_sleep(rqstp))
> >  		/* More work pending after I dequeued some,
> >  		 * wake another worker
> > @@ -839,21 +852,31 @@ static void svc_thread_wake_next(struct svc_rqst *rqstp)
> >   * This code is carefully organised not to touch any cachelines in
> >   * the shared svc_serv structure, only cachelines in the local
> >   * svc_pool.
> > + *
> > + * Returns -ETIMEDOUT if idle for an extended period
> > + *         -EBUSY is there is more work to do than available threads
> > + *         0 otherwise.
> >   */
> > -void svc_recv(struct svc_rqst *rqstp)
> > +int svc_recv(struct svc_rqst *rqstp)
> >  {
> >  	struct svc_pool *pool = rqstp->rq_pool;
> > +	bool did_wait;
> > +	int ret = 0;
> > 
> >  	if (!svc_alloc_arg(rqstp))
> > -		return;
> > +		return ret;
> > +
> > +	did_wait = svc_thread_wait_for_work(rqstp);
> > 
> > -	svc_thread_wait_for_work(rqstp);
> > +	if (did_wait && svc_thread_should_sleep(rqstp) &&
> > +	    pool->sp_nrthrmin && (pool->sp_nrthreads > pool->sp_nrthrmin))
> > +		ret = -ETIMEDOUT;
> > 
> >  	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
> > 
> >  	if (svc_thread_should_stop(rqstp)) {
> >  		svc_thread_wake_next(rqstp);
> > -		return;
> > +		return ret;
> >  	}
> > 
> >  	rqstp->rq_xprt = svc_xprt_dequeue(pool);
> > @@ -867,8 +890,13 @@ void svc_recv(struct svc_rqst *rqstp)
> >  		 */
> >  		if (pool->sp_idle_threads.first)
> >  			rqstp->rq_chandle.thread_wait = 5 * HZ;
> > -		else
> > +		else {
> >  			rqstp->rq_chandle.thread_wait = 1 * HZ;
> > +			if (!did_wait &&
> > +			    !test_and_set_bit(SP_TASK_STARTING,
> > +					      &pool->sp_flags))
> > +				ret = -EBUSY;
> > +		}
> > 
> >  		trace_svc_xprt_dequeue(rqstp);
> >  		svc_handle_xprt(rqstp, xprt);
> > @@ -887,6 +915,7 @@ void svc_recv(struct svc_rqst *rqstp)
> >  		}
> >  	}
> >  #endif
> > +	return ret;
> >  }
> >  EXPORT_SYMBOL_GPL(svc_recv);
> > 
> > 
> > -- 
> > 2.52.0
> 
> The extensive use of atomic bit ops here is a little worrying.
> Those can be costly -- and the sp_flags field is going to get
> poked at by more and more threads as the pool's thread count
> increases.
> 

The current way that threading works is dependent on this today. We
could consider a spinlock and a non-atomic bitops, but that might be
even worse. I'll have to think about that.

Thanks for the review!
-- 
Jeff Layton <jlayton@kernel.org>