[v3] add threaded printing + the rest

[PATCH printk v3 03/19] printk: nbcon: Add function for printers to reacquire ownership

Posted by John Ogness 1 year, 6 months ago

Since ownership can be lost at any time due to handover or
takeover, a printing context _must_ be prepared to back out
immediately and carefully. However, there are scenarios where
the printing context must reacquire ownership in order to
finalize or revert hardware changes.

One such example is when interrupts are disabled during
printing. No other context will automagically re-enable the
interrupts. For this case, the disabling context _must_
reacquire nbcon ownership so that it can re-enable the
interrupts.

Provide nbcon_reacquire_nobuf() for exactly this purpose. It
allows a printing context to reacquire ownership using the same
priority as its previous ownership.

Note that after a successful reacquire the printing context
will have no output buffer because that has been lost. This
function cannot be used to resume printing.

Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
 include/linux/console.h |  6 +++++
 kernel/printk/nbcon.c   | 56 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/include/linux/console.h b/include/linux/console.h
index 4aaf053840ee..38ef6e64da19 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -366,6 +366,10 @@ struct console {
 	 *
 	 * The callback should allow the takeover whenever it is safe. It
 	 * increases the chance to see messages when the system is in trouble.
+	 * If the driver must reacquire ownership in order to finalize or
+	 * revert hardware changes, nbcon_reacquire_nobuf() can be used.
+	 * However, on reacquire the buffer content is no longer available. A
+	 * reacquire cannot be used to resume printing.
 	 *
 	 * The callback can be called from any context (including NMI).
 	 * Therefore it must avoid usage of any locking and instead rely
@@ -559,6 +563,7 @@ extern void nbcon_cpu_emergency_flush(void);
 extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
 extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
 extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
+extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
 #else
 static inline void nbcon_cpu_emergency_enter(void) { }
 static inline void nbcon_cpu_emergency_exit(void) { }
@@ -566,6 +571,7 @@ static inline void nbcon_cpu_emergency_flush(void) { }
 static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
 static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
+static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
 #endif
 
 extern int console_set_on_cmdline;
diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index 1388e23a439f..18a83d181622 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -834,6 +834,47 @@ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
 }
 EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);
 
+static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
+					char *buf, unsigned int len)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+	struct console *con = ctxt->console;
+	struct nbcon_state cur;
+
+	wctxt->outbuf = buf;
+	wctxt->len = len;
+	nbcon_state_read(con, &cur);
+	wctxt->unsafe_takeover = cur.unsafe_takeover;
+}
+
+/**
+ * nbcon_reacquire_nobuf - Reacquire a console after losing ownership
+ *				while printing
+ * @wctxt:	The write context that was handed to the write callback
+ *
+ * Since ownership can be lost at any time due to handover or takeover, a
+ * printing context _must_ be prepared to back out immediately and
+ * carefully. However, there are scenarios where the printing context must
+ * reacquire ownership in order to finalize or revert hardware changes.
+ *
+ * This function allows a printing context to reacquire ownership using the
+ * same priority as its previous ownership.
+ *
+ * Note that after a successful reacquire the printing context will have no
+ * output buffer because that has been lost. This function cannot be used to
+ * resume printing.
+ */
+void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
+{
+	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
+
+	while (!nbcon_context_try_acquire(ctxt))
+		cpu_relax();
+
+	nbcon_write_context_set_buf(wctxt, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);
+
 /**
  * nbcon_emit_next_record - Emit a record in the acquired context
  * @wctxt:	The write context that will be handed to the write function
@@ -859,7 +900,6 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
 		.pbufs = ctxt->pbufs,
 	};
 	unsigned long con_dropped;
-	struct nbcon_state cur;
 	unsigned long dropped;
 
 	/*
@@ -894,10 +934,7 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
 		goto update_con;
 
 	/* Initialize the write context for driver callbacks. */
-	wctxt->outbuf = &pmsg.pbufs->outbuf[0];
-	wctxt->len = pmsg.outbuf_len;
-	nbcon_state_read(con, &cur);
-	wctxt->unsafe_takeover = cur.unsafe_takeover;
+	nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);
 
 	if (con->write_atomic) {
 		con->write_atomic(con, wctxt);
@@ -911,6 +948,15 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
 		return false;
 	}
 
+	if (!wctxt->outbuf) {
+		/*
+		 * Ownership was lost and reacquired by the driver.
+		 * Handle it as if ownership was lost.
+		 */
+		nbcon_context_release(ctxt);
+		return false;
+	}
+
 	/*
 	 * Since any dropped message was successfully output, reset the
 	 * dropped count for the console.
-- 
2.39.2

Re: [PATCH printk v3 03/19] printk: nbcon: Add function for printers to reacquire ownership

Posted by Petr Mladek 1 year, 6 months ago

On Mon 2024-07-22 19:25:23, John Ogness wrote:
> Since ownership can be lost at any time due to handover or
> takeover, a printing context _must_ be prepared to back out
> immediately and carefully. However, there are scenarios where
> the printing context must reacquire ownership in order to
> finalize or revert hardware changes.
> 
> One such example is when interrupts are disabled during
> printing. No other context will automagically re-enable the
> interrupts. For this case, the disabling context _must_
> reacquire nbcon ownership so that it can re-enable the
> interrupts.

I am still not sure how this is going to be used. It is suspicious.
If the context lost the ownership than another started flushing
higher priority messages.

Is it really safe to manipulate the HW at this point?
Won't it break the higher priority context?

> --- a/kernel/printk/nbcon.c
> +++ b/kernel/printk/nbcon.c
> @@ -911,6 +948,15 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
>  		return false;
>  	}
>  
> +	if (!wctxt->outbuf) {

This check works only when con->write_atomic() called nbcon_reacquire_nobuf().

At least, we should clear the buffer also in nbcon_enter_unsafe() and
nbcon_exit_unsafe() when they realize that they do own the context.

Even better would be to add a check whether we still own the context.
Something like:

bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
{
	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
	struct nbcon_state cur;

	nbcon_state_read(con, &cur);

	return nbcon_context_can_proceed(ctxt, &cur);
}

> +		/*
> +		 * Ownership was lost and reacquired by the driver.
> +		 * Handle it as if ownership was lost.
> +		 */
> +		nbcon_context_release(ctxt);
> +		return false;
> +	}
> +
>  	/*
>  	 * Since any dropped message was successfully output, reset the
>  	 * dropped count for the console.

Best Regards,
Petr

Re: [PATCH printk v3 03/19] printk: nbcon: Add function for printers to reacquire ownership

Posted by John Ogness 1 year, 6 months ago

On 2024-07-26, Petr Mladek <pmladek@suse.com> wrote:
> On Mon 2024-07-22 19:25:23, John Ogness wrote:
>> Since ownership can be lost at any time due to handover or
>> takeover, a printing context _must_ be prepared to back out
>> immediately and carefully. However, there are scenarios where
>> the printing context must reacquire ownership in order to
>> finalize or revert hardware changes.
>> 
>> One such example is when interrupts are disabled during
>> printing. No other context will automagically re-enable the
>> interrupts. For this case, the disabling context _must_
>> reacquire nbcon ownership so that it can re-enable the
>> interrupts.
>
> I am still not sure how this is going to be used. It is suspicious.
> If the context lost the ownership than another started flushing
> higher priority messages.
>
> Is it really safe to manipulate the HW at this point?
> Won't it break the higher priority context?

Why would it break anything? It spins until it normally and safely
acquires ownership again. The commit message provides a simple example
of why it is necessary. With a threaded printer, this situation happens
almost every time a warning occurs.

>> --- a/kernel/printk/nbcon.c
>> +++ b/kernel/printk/nbcon.c
>> @@ -911,6 +948,15 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
>>  		return false;
>>  	}
>>  
>> +	if (!wctxt->outbuf) {
>
> This check works only when con->write_atomic() called
> nbcon_reacquire_nobuf().

Exactly. That is what it is for.

> At least, we should clear the buffer also in nbcon_enter_unsafe() and
> nbcon_exit_unsafe() when they realize that they do own the context.

OK.

> Even better would be to add a check whether we still own the context.
> Something like:
>
> bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
> {
> 	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
> 	struct nbcon_state cur;
>
> 	nbcon_state_read(con, &cur);
>
> 	return nbcon_context_can_proceed(ctxt, &cur);
> }

nbcon_can_proceed() is meant to check ownership. And it only makes sense
to use it within an unsafe section. Otherwise it is racy.

Once a reacquire has occurred, the driver is allowed to proceed. It just
is not allowed to print (because its buffer is gone).

>> +		/*
>> +		 * Ownership was lost and reacquired by the driver.
>> +		 * Handle it as if ownership was lost.
>> +		 */
>> +		nbcon_context_release(ctxt);
>> +		return false;

John

Re: [PATCH printk v3 03/19] printk: nbcon: Add function for printers to reacquire ownership

Posted by Petr Mladek 1 year, 6 months ago

On Mon 2024-07-29 10:42:04, John Ogness wrote:
> On 2024-07-26, Petr Mladek <pmladek@suse.com> wrote:
> > On Mon 2024-07-22 19:25:23, John Ogness wrote:
> >> Since ownership can be lost at any time due to handover or
> >> takeover, a printing context _must_ be prepared to back out
> >> immediately and carefully. However, there are scenarios where
> >> the printing context must reacquire ownership in order to
> >> finalize or revert hardware changes.
> >> 
> >> One such example is when interrupts are disabled during
> >> printing. No other context will automagically re-enable the
> >> interrupts. For this case, the disabling context _must_
> >> reacquire nbcon ownership so that it can re-enable the
> >> interrupts.
> >
> > I am still not sure how this is going to be used. It is suspicious.
> > If the context lost the ownership than another started flushing
> > higher priority messages.
> >
> > Is it really safe to manipulate the HW at this point?
> > Won't it break the higher priority context?
> 
> Why would it break anything? It spins until it normally and safely
> acquires ownership again. The commit message provides a simple example
> of why it is necessary. With a threaded printer, this situation happens
> almost every time a warning occurs.

I see. It makes sense now.

> >> --- a/kernel/printk/nbcon.c
> >> +++ b/kernel/printk/nbcon.c
> >> @@ -911,6 +948,15 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt)
> >>  		return false;
> >>  	}
> >>  
> >> +	if (!wctxt->outbuf) {
> >
> > This check works only when con->write_atomic() called
> > nbcon_reacquire_nobuf().
> 
> Exactly. That is what it is for.
> 
> > At least, we should clear the buffer also in nbcon_enter_unsafe() and
> > nbcon_exit_unsafe() when they realize that they do own the context.
> 
> OK.
> 
> > Even better would be to add a check whether we still own the context.
> > Something like:
> >
> > bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
> > {
> > 	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
> > 	struct nbcon_state cur;
> >
> > 	nbcon_state_read(con, &cur);
> >
> > 	return nbcon_context_can_proceed(ctxt, &cur);
> > }
> 
> nbcon_can_proceed() is meant to check ownership. And it only makes sense
> to use it within an unsafe section. Otherwise it is racy.

My idea was: "If we still own the context that we have owned it all
	      the time and con-write_atomic() succeeded."

The race is is not important. If we lose the ownership before updating
nbcon_seq then the line will get written again anyway.

> Once a reacquire has occurred, the driver is allowed to proceed. It just
> is not allowed to print (because its buffer is gone).

I see. My idea does not work because the driver is going to reacquire
the ownership. It means that nbcon_can_proceed() would return true
even when con->atomic_write() failed.

But it is not documented anywhere. And what if the driver has a bug
and does not call reacquire. Or what if the driver does not need
to restore anything.

IMHO, nbcon_emit_next_record() should check both:

	if (use_atomic)
		con->write_atomic(con, wctxt);
	else
		con->write_thread(con, wctxt);

	/* Still owns the console? */
	if (!nbcon_can_proceed(wctxt)
		return false;

	if (!wctxt->outbuf) {
		/*
		 * Ownership was lost and reacquired by the driver.
		 * Handle it as if ownership was lost.
		 */
		nbcon_context_release(ctxt);
		return false;
	}

Best Regards,
Petr

Re: [PATCH printk v3 03/19] printk: nbcon: Add function for printers to reacquire ownership

Posted by John Ogness 1 year, 5 months ago

On 2024-07-30, Petr Mladek <pmladek@suse.com> wrote:
> My idea was: "If we still own the context that we have owned it all
> 	      the time and con-write_atomic() succeeded."
>
> The race is is not important. If we lose the ownership before updating
> nbcon_seq then the line will get written again anyway.
>
>> Once a reacquire has occurred, the driver is allowed to proceed. It just
>> is not allowed to print (because its buffer is gone).
>
> I see. My idea does not work because the driver is going to reacquire
> the ownership. It means that nbcon_can_proceed() would return true
> even when con->atomic_write() failed.
>
> But it is not documented anywhere. And what if the driver has a bug
> and does not call reacquire. Or what if the driver does not need
> to restore anything.
>
> IMHO, nbcon_emit_next_record() should check both:
>
> 	if (use_atomic)
> 		con->write_atomic(con, wctxt);
> 	else
> 		con->write_thread(con, wctxt);
>
> 	/* Still owns the console? */
> 	if (!nbcon_can_proceed(wctxt)
> 		return false;
>
> 	if (!wctxt->outbuf) {
> 		/*
> 		 * Ownership was lost and reacquired by the driver.
> 		 * Handle it as if ownership was lost.
> 		 */
> 		nbcon_context_release(ctxt);
> 		return false;
> 	}

Except that the next thing nbcon_emit_next_record() does is
nbcon_context_enter_unsafe(), which checks ownership. So your suggested
nbcon_can_proceed() is redundant.

For v4 I can add comments explaining this. It would look like this (at
this point in the series):

	/* Initialize the write context for driver callbacks. */
	nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);

	if (con->write_atomic) {
		con->write_atomic(con, wctxt);
	} else {
		/*
		 * This function should never be called for legacy consoles.
		 * Handle it as if ownership was lost and try to continue.
		 */
		WARN_ON_ONCE(1);
		nbcon_context_release(ctxt);
		return false;
	}

	if (!wctxt->outbuf) {
		/*
		 * Ownership was lost and reacquired by the driver. Handle it
		 * as if ownership was lost.
		 */
		nbcon_context_release(ctxt);
		return false;
	}

	/*
	 * Ownership may have been lost but _not_ reacquired by the driver.
	 * This case is detected and handled when entering unsafe to update
	 * dropped/seq values.
	 */

	/*
	 * Since any dropped message was successfully output, reset the
	 * dropped count for the console.
	 */
	dropped = 0;
update_con:
	/*
	 * The dropped count and the sequence number are updated within an
	 * unsafe section. This limits update races to the panic context and
	 * allows the panic context to win.
	 */

	if (!nbcon_context_enter_unsafe(ctxt))
		return false;

John Ogness