[PATCH v5 5/7] x86/microcode/intel: Implement staging handler

Chang S. Bae posted 7 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH v5 5/7] x86/microcode/intel: Implement staging handler
Posted by Chang S. Bae 1 month, 1 week ago
Previously, per-package staging invocations and their associated state
data were established. The next step is to implement the actual staging
handler according to the specified protocol. Below are key aspects to
note:

  (a)  Each staging process must begin by resetting the staging hardware.

  (b)  The staging hardware processes up to a page-sized chunk of the
       microcode image per iteration, requiring software to submit data
       incrementally.

  (c)  Once a data chunk is processed, the hardware responds with an
       offset in the image for the next chunk.

  (d)  The offset may indicate completion or request retransmission of an
       already transferred chunk. As long as the total transferred data
       remains within the predefined limit (twice the image size),
       retransmissions should be acceptable.

With that, incorporate these code sequences into the staging handler:

  1.  Initialization: Map the MMIO space via ioremap(). Reset the staging
      hardware and initialize software state, ensuring a fresh staging
      process aligned with (a).

  2.  Processing Loop: Introduce a loop iterating over data chunks,
      following (b), with proper termination conditions established from
      (d) -- stop staging when the hardware signals completion, or if the
      total transmitted data exceeds the predefined limit.

  3.  Loop Body: Finally, compose the loop body with two steps --
      transmitting a data chunk and retrieving the next offset from the
      hardware response, aligning with (b) and (c).

Since data transmission and mailbox format handling require additional
details, they are implemented separately in subsequent changes.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
---
V4 -> V5:
* Convert helper functions to return error codes (Dave)
* Consolidate loop-control logic
* Refactor next-chunk calculation/check for clarity
* Remove offset sanity check (moved to next patch)

V2 -> V3:
* Rework code to eliminate global variables (Dave)
* Remove redundant variable resets (Chao)

V1 -> V2:
* Re-write the changelog for clarity (Dave).
* Move staging handling code into intel.c (Boris).
* Add extensive comments to clarify staging logic and hardware
  interactions, along with function renaming (Dave).
---
 arch/x86/kernel/cpu/microcode/intel.c | 137 +++++++++++++++++++++++++-
 1 file changed, 133 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 3ca22457d839..a1b13202330d 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -20,6 +20,8 @@
 #include <linux/cpu.h>
 #include <linux/uio.h>
 #include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/io.h>
 
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
@@ -33,6 +35,16 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 
 #define UCODE_BSP_LOADED	((struct microcode_intel *)0x1UL)
 
+/* Defines for the microcode staging mailbox interface */
+
+#define MBOX_REG_NUM		4
+#define MBOX_REG_SIZE		sizeof(u32)
+
+#define MBOX_CONTROL_OFFSET	0x0
+#define MBOX_STATUS_OFFSET	0x4
+
+#define MASK_MBOX_CTRL_ABORT	BIT(0)
+
 /* Current microcode patch used in early patching on the APs. */
 static struct microcode_intel *ucode_patch_va __read_mostly;
 static struct microcode_intel *ucode_patch_late __read_mostly;
@@ -319,13 +331,130 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size,
 }
 
 /*
- * Handle the staging process using the mailbox MMIO interface.
- * Return the result state.
+ * Prepare for a new microcode transfer: reset hardware and record the
+ * image size.
+ */
+static void init_stage(struct staging_state *ss)
+{
+	ss->ucode_len = get_totalsize(&ucode_patch_late->hdr);
+
+	/*
+	 * Abort any ongoing process, effectively resetting the device.
+	 * Unlike regular mailbox data processing requests, this
+	 * operation does not require a status check.
+	 */
+	writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
+}
+
+/*
+ * Return PAGE_SIZE, or remaining bytes if this is the final chunk
+ */
+static inline unsigned int calc_next_chunk_size(unsigned int ucode_len, unsigned int offset)
+{
+	return min(PAGE_SIZE, ucode_len - offset);
+}
+
+/*
+ * Update the chunk size and decide whether another chunk can be sent.
+ * This accounts for remaining data and retry limits.
+ */
+static bool can_send_next_chunk(struct staging_state *ss)
+{
+	ss->chunk_size = calc_next_chunk_size(ss->ucode_len, ss->offset);
+	/*
+	 * Each microcode image is divided into chunks, each at most
+	 * one page size. A 10-chunk  image would typically require 10
+	 * transactions.
+	 *
+	 * However, the hardware managing the mailbox has limited
+	 * resources and may not cache the entire image, potentially
+	 * requesting the same chunk multiple times.
+	 *
+	 * To tolerate this behavior, allow up to twice the expected
+	 * number of transactions (i.e., a 10-chunk image can take up to
+	 * 20 attempts).
+	 *
+	 * If the number of attempts exceeds this limit, the hardware is
+	 * likely stuck and mark the state as timeout.
+	 */
+	if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
+		ss->state = UCODE_TIMEOUT;
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Determine whether staging is complete: either the hardware signaled
+ * the end offset, or no more transactions are permitted (retry limit
+ * reached).
+ */
+static inline bool staging_is_complete(struct staging_state *ss)
+{
+	return (ss->offset == UINT_MAX) || !can_send_next_chunk(ss);
+}
+
+/*
+ * Transmit a chunk of the microcode image to the hardware.
+ * Return 0 on success, or an error code on failure.
+ */
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr __maybe_unused)
+{
+	pr_debug_once("Staging mailbox loading code needs to be implemented.\n");
+	ss->state = UCODE_ERROR;
+	return -EPROTONOSUPPORT;
+}
+
+/*
+ * Retrieve the next offset from the hardware response.
+ * Return 0 on success, or an error code on failure.
+ */
+static int fetch_next_offset(struct staging_state *ss)
+{
+	pr_debug_once("Staging mailbox response handling code needs to be implemented.\n\n");
+	ss->state = UCODE_ERROR;
+	return -EPROTONOSUPPORT;
+}
+
+/*
+ * Handle the staging process using the mailbox MMIO interface. The
+ * microcode image is transferred in chunks until completion. Return the
+ * result state.
  */
 static enum ucode_state do_stage(u64 mmio_pa)
 {
-	pr_debug_once("Staging implementation is pending.\n");
-	return UCODE_ERROR;
+	struct staging_state ss = {};
+	int err;
+
+	ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
+	if (WARN_ON_ONCE(!ss.mmio_base))
+		return UCODE_ERROR;
+
+	init_stage(&ss);
+
+	/* Perform the staging process while within the retry limit */
+	while (!staging_is_complete(&ss)) {
+		/* Send a chunk of microcode each time: */
+		err = send_data_chunk(&ss, ucode_patch_late);
+		if (err)
+			break;
+		/*
+		 * Then, ask the hardware which piece of the image it
+		 * needs next. The same piece may be sent more than once.
+		 */
+		err = fetch_next_offset(&ss);
+		if (err)
+			break;
+	}
+
+	iounmap(ss.mmio_base);
+
+	/*
+	 * The helpers update ss.state on error. The final state is
+	 * returned to the caller.
+	 */
+	return ss.state;
 }
 
 static void stage_microcode(void)
-- 
2.48.1
Re: [PATCH v5 5/7] x86/microcode/intel: Implement staging handler
Posted by Borislav Petkov 3 weeks, 2 days ago
On Sat, Aug 23, 2025 at 08:52:08AM -0700, Chang S. Bae wrote:
> diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
> index 3ca22457d839..a1b13202330d 100644
> --- a/arch/x86/kernel/cpu/microcode/intel.c
> +++ b/arch/x86/kernel/cpu/microcode/intel.c
> @@ -20,6 +20,8 @@
>  #include <linux/cpu.h>
>  #include <linux/uio.h>
>  #include <linux/mm.h>
> +#include <linux/delay.h>
> +#include <linux/io.h>

You do see those are sorted by header name length in a reverse order, right?

>  
>  #include <asm/cpu_device_id.h>
>  #include <asm/processor.h>
> @@ -33,6 +35,16 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
>  
>  #define UCODE_BSP_LOADED	((struct microcode_intel *)0x1UL)
>  
> +/* Defines for the microcode staging mailbox interface */
> +

^ Superfluous newline.

> +#define MBOX_REG_NUM		4
> +#define MBOX_REG_SIZE		sizeof(u32)
> +
> +#define MBOX_CONTROL_OFFSET	0x0
> +#define MBOX_STATUS_OFFSET	0x4
> +
> +#define MASK_MBOX_CTRL_ABORT	BIT(0)
> +
>  /* Current microcode patch used in early patching on the APs. */
>  static struct microcode_intel *ucode_patch_va __read_mostly;
>  static struct microcode_intel *ucode_patch_late __read_mostly;

...

> +/*
> + * Return PAGE_SIZE, or remaining bytes if this is the final chunk
> + */
> +static inline unsigned int calc_next_chunk_size(unsigned int ucode_len, unsigned int offset)
> +{
> +	return min(PAGE_SIZE, ucode_len - offset);
> +}

That oneliner looks useless - sticking a comment over the min() and putting it
at the single callsite below is good enough.

> +
> +/*
> + * Update the chunk size and decide whether another chunk can be sent.
> + * This accounts for remaining data and retry limits.
> + */
> +static bool can_send_next_chunk(struct staging_state *ss)
> +{
> +	ss->chunk_size = calc_next_chunk_size(ss->ucode_len, ss->offset);
> +	/*
> +	 * Each microcode image is divided into chunks, each at most
> +	 * one page size. A 10-chunk  image would typically require 10
				   ^^^^

> +	 * transactions.
> +	 *
> +	 * However, the hardware managing the mailbox has limited
> +	 * resources and may not cache the entire image, potentially
> +	 * requesting the same chunk multiple times.
> +	 *
> +	 * To tolerate this behavior, allow up to twice the expected
> +	 * number of transactions (i.e., a 10-chunk image can take up to
> +	 * 20 attempts).

Looks quirky but ok, let's try it in practice first...

> +	 *
> +	 * If the number of attempts exceeds this limit, the hardware is
> +	 * likely stuck and mark the state as timeout.
> +	 */
> +	if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
> +		ss->state = UCODE_TIMEOUT;
> +		return false;
> +	}
> +
> +	return true;
> +}

...

>  static enum ucode_state do_stage(u64 mmio_pa)
>  {
> -	pr_debug_once("Staging implementation is pending.\n");
> -	return UCODE_ERROR;
> +	struct staging_state ss = {};
> +	int err;
> +
> +	ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
> +	if (WARN_ON_ONCE(!ss.mmio_base))
> +		return UCODE_ERROR;
> +
> +	init_stage(&ss);
> +
> +	/* Perform the staging process while within the retry limit */
> +	while (!staging_is_complete(&ss)) {
> +		/* Send a chunk of microcode each time: */
> +		err = send_data_chunk(&ss, ucode_patch_late);
> +		if (err)
> +			break;
> +		/*
> +		 * Then, ask the hardware which piece of the image it
> +		 * needs next. The same piece may be sent more than once.

If this is part of normal operation, your send-max-2x-the-size heuristic might
fail quickly here. I'd track the number of chunks it wants you to send and
then set a per-chunk limit and when it reaches that limit, then cancel the
transaction. Dunno, let's try the simple scheme first...

> +		 */
> +		err = fetch_next_offset(&ss);
> +		if (err)
> +			break;
> +	}
> +
> +	iounmap(ss.mmio_base);
> +
> +	/*
> +	 * The helpers update ss.state on error. The final state is
> +	 * returned to the caller.
> +	 */
> +	return ss.state;
>  }
>  
>  static void stage_microcode(void)
> -- 
> 2.48.1
> 

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette
Re: [PATCH v5 5/7] x86/microcode/intel: Implement staging handler
Posted by Chang S. Bae 3 weeks, 2 days ago
On 9/10/2025 11:33 AM, Borislav Petkov wrote:
> On Sat, Aug 23, 2025 at 08:52:08AM -0700, Chang S. Bae wrote:
>> diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
>> index 3ca22457d839..a1b13202330d 100644
>> --- a/arch/x86/kernel/cpu/microcode/intel.c
>> +++ b/arch/x86/kernel/cpu/microcode/intel.c
>> @@ -20,6 +20,8 @@
>>   #include <linux/cpu.h>
>>   #include <linux/uio.h>
>>   #include <linux/mm.h>
>> +#include <linux/delay.h>
>> +#include <linux/io.h>
> 
> You do see those are sorted by header name length in a reverse order, right?

Okay, fixed -- and I adjusted patch6 as well:

  #include <linux/initrd.h>
  #include <linux/io.h>
  #include <linux/kernel.h>
+#include <linux/pci_ids.h>
  #include <linux/slab.h>
  #include <linux/cpu.h>
  #include <linux/uio.h>

>> +/* Defines for the microcode staging mailbox interface */
>> +
> 
> ^ Superfluous newline.

Dropped.

>> +/*
>> + * Return PAGE_SIZE, or remaining bytes if this is the final chunk
>> + */
>> +static inline unsigned int calc_next_chunk_size(unsigned int ucode_len, unsigned int offset)
>> +{
>> +	return min(PAGE_SIZE, ucode_len - offset);
>> +}
> 
> That oneliner looks useless - sticking a comment over tne min() and putting it
> at the single callsite below is good enough.

Agreed -- removed the helper and moved them.

>> +/*
>> + * Update the chunk size and decide whether another chunk can be sent.
>> + * This accounts for remaining data and retry limits.
>> + */
>> +static bool can_send_next_chunk(struct staging_state *ss)
>> +{
>> +	ss->chunk_size = calc_next_chunk_size(ss->ucode_len, ss->offset);
>> +	/*
>> +	 * Each microcode image is divided into chunks, each at most
>> +	 * one page size. A 10-chunk  image would typically require 10
> 				   ^^^^

Fixed.

Just to make sure, include the diff here.

Thanks for the careful review and for sticking with this set.

diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index c7a75afd2b9a..4d663cab4f48 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -12,16 +12,16 @@
  */
 #define pr_fmt(fmt) "microcode: " fmt
 #include <linux/earlycpio.h>
+#include <linux/delay.h>
 #include <linux/firmware.h>
 #include <linux/uaccess.h>
 #include <linux/initrd.h>
+#include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/uio.h>
 #include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/io.h>
 
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
@@ -36,7 +36,6 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 #define UCODE_BSP_LOADED	((struct microcode_intel *)0x1UL)
 
 /* Defines for the microcode staging mailbox interface */
-
 #define MBOX_REG_NUM		4
 #define MBOX_REG_SIZE		sizeof(u32)
 
@@ -344,24 +343,18 @@ static void init_stage(struct staging_state *ss)
 	writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
 }
 
-/*
- * Return PAGE_SIZE, or remaining bytes if this is the final chunk
- */
-static inline unsigned int calc_next_chunk_size(unsigned int ucode_len, unsigned int offset)
-{
-	return min(PAGE_SIZE, ucode_len - offset);
-}
-
 /*
  * Update the chunk size and decide whether another chunk can be sent.
  * This accounts for remaining data and retry limits.
  */
 static bool can_send_next_chunk(struct staging_state *ss, int *err)
 {
-	ss->chunk_size = calc_next_chunk_size(ss->ucode_len, ss->offset);
+	/* a page size or remaining bytes if this is the final chunk */
+	ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset);
+
 	/*
 	 * Each microcode image is divided into chunks, each at most
-	 * one page size. A 10-chunk  image would typically require 10
+	 * one page size. A 10-chunk image would typically require 10
 	 * transactions.
 	 *
 	 * However, the hardware managing the mailbox has limited