[PATCH v4 5/6] x86/microcode/intel: Support mailbox transfer

Chang S. Bae posted 6 patches 1 month, 3 weeks ago
There is a newer version of this series
[PATCH v4 5/6] x86/microcode/intel: Support mailbox transfer
Posted by Chang S. Bae 1 month, 3 weeks ago
Previously, the functions for sending microcode data and retrieving the
next offset were placeholders, as they required handling the specific
mailbox format. Implement them as following:

== Mailbox Format ==

The staging mailbox consists of two primary sections: 'header' and
'data'. While the microcode must be transferred following this format,
the actual data transfer mechanism involves reading and writing to
specific MMIO registers.

== Mailbox Data Registers ==

Unlike conventional interfaces that allocate MMIO space for each data
chunk, the staging interface features a "narrow" interface, using only
two dword-sized registers for read and write operations.

For example, if writing 2 dwords of data to a device. Typically, the
device would expose 2 dwords of "wide" MMIO space. To send the data to
the device:

	writel(buf[0], io_addr + 0);
	writel(buf[1], io_addr + 1);

But, this interface is a bit different. Instead of having a "wide"
interface where there is separate MMIO space for each word in a
transaction, it has a "narrow" interface where several words are written
to the same spot in MMIO space:

	writel(buf[0], io_addr);
	writel(buf[1], io_addr);

The same goes for the read side.

== Implementation Summary ==

Given that, introduce two layers of helper functions at first:

  * Low-level helpers for reading and writing to data registers directly.
  * Wrapper functions for handling mailbox header and data sections.

Using them, implement send_data_chunk() and fetch_next_offset()
functions. Add explicit error and timeout handling routine in
wait_for_transaction(), finishing up the transfer.

Note: The kernel has support for similar mailboxes. But none of them are
compatible with this one. Trying to share code resulted in a bloated
mess, so this code is standalone.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
---
V2 -> V3:
* Update code to reflect the removal of a global variable (Dave).

V1 -> V2:
* Add lots of code comments and edit the changelog (Dave).
* Encapsulate register read/write operations for processing header and
  data sections.
---
 arch/x86/kernel/cpu/microcode/intel.c | 150 ++++++++++++++++++++++++--
 1 file changed, 144 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 3eb3331c5012..5402bcb96f47 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -22,6 +22,7 @@
 #include <linux/mm.h>
 #include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/pci_ids.h>
 
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
@@ -42,8 +43,30 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 
 #define MBOX_CONTROL_OFFSET	0x0
 #define MBOX_STATUS_OFFSET	0x4
+#define MBOX_WRDATA_OFFSET	0x8
+#define MBOX_RDDATA_OFFSET	0xc
 
 #define MASK_MBOX_CTRL_ABORT	BIT(0)
+#define MASK_MBOX_CTRL_GO	BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR	BIT(2)
+#define MASK_MBOX_STATUS_READY	BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS	BIT(0)
+#define MASK_MBOX_RESP_PROGRESS	BIT(1)
+#define MASK_MBOX_RESP_ERROR	BIT(2)
+
+#define MBOX_CMD_LOAD		0x3
+#define MBOX_OBJ_STAGING	0xb
+#define DWORD_ALIGN(size)	((size) / sizeof(u32))
+#define MBOX_HEADER(mbox_size)	(PCI_VENDOR_ID_INTEL | \
+				 (MBOX_OBJ_STAGING << 16) | \
+				 ((u64)DWORD_ALIGN(mbox_size) << 32))
+
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE	sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE	sizeof(u64)
 
 /*
  * Each microcode image is divided into chunks, each at most
@@ -57,6 +80,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
  */
 #define MBOX_XACTION_SIZE	PAGE_SIZE
 #define MBOX_XACTION_MAX(imgsz)	((imgsz) * 2)
+#define MBOX_XACTION_TIMEOUT_MS	(10 * MSEC_PER_SEC)
 
 /* Current microcode patch used in early patching on the APs. */
 static struct microcode_intel *ucode_patch_va __read_mostly;
@@ -345,6 +369,49 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size,
 	return size ? NULL : patch;
 }
 
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+	u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+	/* Acknowledge read completion to the staging hardware */
+	writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+	return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+	writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+	u32 high, low;
+
+	low  = read_mbox_dword(mmio_base);
+	high = read_mbox_dword(mmio_base);
+
+	return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+	write_mbox_dword(mmio_base, value);
+	write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_size)
+{
+	int i;
+
+	/*
+	 * The MMIO space is mapped as Uncached (UC). Each write arrives
+	 * at the device as an individual transaction in program order.
+	 * The device can then resemble the sequence accordingly.
+	 */
+	for (i = 0; i < DWORD_ALIGN(chunk_size); i++)
+		write_mbox_dword(mmio_base, chunk[i]);
+}
+
 /*
  * Prepare for a new microcode transfer by resetting hardware and
  * configuring microcode image info.
@@ -388,15 +455,71 @@ static bool can_send_next_chunk(struct staging_state *ss)
 	return true;
 }
 
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return true on success, false on failure.
+ */
+static bool wait_for_transaction(struct staging_state *ss)
+{
+	u32 timeout, status;
+
+	/* Allow time for hardware to complete the operation: */
+	for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+		msleep(1);
+
+		status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+		/* Break out early if the hardware is ready: */
+		if (status & MASK_MBOX_STATUS_READY)
+			break;
+	}
+
+	status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+
+	/* Check for explicit error response */
+	if (status & MASK_MBOX_STATUS_ERROR) {
+		ss->state = UCODE_ERROR;
+		return false;
+	}
+
+	/*
+	 * Hardware is neither responded to the action nor
+	 * signaled any error. Treat the case as timeout.
+	 */
+	if (!(status & MASK_MBOX_STATUS_READY)) {
+		ss->state = UCODE_TIMEOUT;
+		return false;
+	}
+
+	ss->state = UCODE_OK;
+	return true;
+}
+
 /*
  * Transmit a chunk of the microcode image to the hardware.
  * Return true if the chunk is processed successfully.
  */
 static bool send_data_chunk(struct staging_state *ss)
 {
-	pr_debug_once("Staging mailbox loading code needs to be implemented.\n");
-	ss->state = UCODE_ERROR;
-	return false;
+	u16 mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+	u32 *chunk = ss->ucode_ptr + ss->offset;
+
+	/*
+	 * Write 'request' mailbox object in the following order:
+	 * - Mailbox header includes total size
+	 * - Command header specifies the load operation
+	 * - Data section contains a microcode chunk
+	 */
+	write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+	write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+	write_mbox_data(ss->mmio_base, chunk, ss->chunk_size);
+	ss->bytes_sent += ss->chunk_size;
+
+	/*
+	 * Notify the hardware that the mailbox is ready for processing.
+	 * The staging hardware will process the request asynchronously.
+	 */
+	writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+	return wait_for_transaction(ss);
 }
 
 /*
@@ -405,9 +528,24 @@ static bool send_data_chunk(struct staging_state *ss)
  */
 static bool fetch_next_offset(struct staging_state *ss)
 {
-	pr_debug_once("Staging mailbox response handling code needs to be implemented.\n\n");
-	ss->state = UCODE_ERROR;
-	return false;
+	const u16 mbox_size = MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE;
+
+	/* All responses begin with the same header value: */
+	WARN_ON_ONCE(read_mbox_header(ss->mmio_base) != MBOX_HEADER(mbox_size));
+
+	/*
+	 * The 'response' mailbox contains two dword data:
+	 * - First has next offset in microcode image
+	 * - Second delivers status flag
+	 */
+	ss->offset = read_mbox_dword(ss->mmio_base);
+	if (read_mbox_dword(ss->mmio_base) & MASK_MBOX_RESP_ERROR) {
+		ss->state = UCODE_ERROR;
+		return false;
+	}
+
+	ss->state = UCODE_OK;
+	return true;
 }
 
 /*
-- 
2.48.1
Re: [PATCH v4 5/6] x86/microcode/intel: Support mailbox transfer
Posted by Dave Hansen 1 month, 3 weeks ago
On 8/13/25 10:26, Chang S. Bae wrote:
> Previously, the functions for sending microcode data and retrieving the
> next offset were placeholders, as they required handling the specific
> mailbox format. Implement them as following:
> 
> == Mailbox Format ==
> 
> The staging mailbox consists of two primary sections: 'header' and
> 'data'. While the microcode must be transferred following this format,
> the actual data transfer mechanism involves reading and writing to
> specific MMIO registers.
> 
> == Mailbox Data Registers ==
> 
> Unlike conventional interfaces that allocate MMIO space for each data
> chunk, the staging interface features a "narrow" interface, using only
> two dword-sized registers for read and write operations.
> 
> For example, if writing 2 dwords of data to a device. Typically, the
> device would expose 2 dwords of "wide" MMIO space. To send the data to
> the device:
> 
> 	writel(buf[0], io_addr + 0);
> 	writel(buf[1], io_addr + 1);
> 
> But, this interface is a bit different. Instead of having a "wide"
> interface where there is separate MMIO space for each word in a
> transaction, it has a "narrow" interface where several words are written
> to the same spot in MMIO space:
> 
> 	writel(buf[0], io_addr);
> 	writel(buf[1], io_addr);
> 
> The same goes for the read side.
> 
> == Implementation Summary ==
> 
> Given that, introduce two layers of helper functions at first:
> 
>   * Low-level helpers for reading and writing to data registers directly.
>   * Wrapper functions for handling mailbox header and data sections.
> 
> Using them, implement send_data_chunk() and fetch_next_offset()
> functions. Add explicit error and timeout handling routine in
> wait_for_transaction(), finishing up the transfer.
> 
> Note: The kernel has support for similar mailboxes. But none of them are
> compatible with this one. Trying to share code resulted in a bloated
> mess, so this code is standalone.
> 
> Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
> Tested-by: Anselm Busse <abusse@amazon.de>
> ---
> V2 -> V3:
> * Update code to reflect the removal of a global variable (Dave).
> 
> V1 -> V2:
> * Add lots of code comments and edit the changelog (Dave).
> * Encapsulate register read/write operations for processing header and
>   data sections.
> ---
>  arch/x86/kernel/cpu/microcode/intel.c | 150 ++++++++++++++++++++++++--
>  1 file changed, 144 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
> index 3eb3331c5012..5402bcb96f47 100644
> --- a/arch/x86/kernel/cpu/microcode/intel.c
> +++ b/arch/x86/kernel/cpu/microcode/intel.c
> @@ -22,6 +22,7 @@
>  #include <linux/mm.h>
>  #include <linux/delay.h>
>  #include <linux/io.h>
> +#include <linux/pci_ids.h>
>  
>  #include <asm/cpu_device_id.h>
>  #include <asm/processor.h>
> @@ -42,8 +43,30 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
>  
>  #define MBOX_CONTROL_OFFSET	0x0
>  #define MBOX_STATUS_OFFSET	0x4
> +#define MBOX_WRDATA_OFFSET	0x8
> +#define MBOX_RDDATA_OFFSET	0xc
>  
>  #define MASK_MBOX_CTRL_ABORT	BIT(0)
> +#define MASK_MBOX_CTRL_GO	BIT(31)
> +
> +#define MASK_MBOX_STATUS_ERROR	BIT(2)
> +#define MASK_MBOX_STATUS_READY	BIT(31)
> +
> +#define MASK_MBOX_RESP_SUCCESS	BIT(0)
> +#define MASK_MBOX_RESP_PROGRESS	BIT(1)
> +#define MASK_MBOX_RESP_ERROR	BIT(2)
> +
> +#define MBOX_CMD_LOAD		0x3
> +#define MBOX_OBJ_STAGING	0xb
> +#define DWORD_ALIGN(size)	((size) / sizeof(u32))
> +#define MBOX_HEADER(mbox_size)	(PCI_VENDOR_ID_INTEL | \
> +				 (MBOX_OBJ_STAGING << 16) | \
> +				 ((u64)DWORD_ALIGN(mbox_size) << 32))
> +
> +/* The size of each mailbox header */
> +#define MBOX_HEADER_SIZE	sizeof(u64)
> +/* The size of staging hardware response */
> +#define MBOX_RESPONSE_SIZE	sizeof(u64)
>  
>  /*
>   * Each microcode image is divided into chunks, each at most
> @@ -57,6 +80,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
>   */
>  #define MBOX_XACTION_SIZE	PAGE_SIZE
>  #define MBOX_XACTION_MAX(imgsz)	((imgsz) * 2)
> +#define MBOX_XACTION_TIMEOUT_MS	(10 * MSEC_PER_SEC)
>  
>  /* Current microcode patch used in early patching on the APs. */
>  static struct microcode_intel *ucode_patch_va __read_mostly;
> @@ -345,6 +369,49 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size,
>  	return size ? NULL : patch;
>  }
>  
> +static inline u32 read_mbox_dword(void __iomem *mmio_base)
> +{
> +	u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
> +
> +	/* Acknowledge read completion to the staging hardware */
> +	writel(0, mmio_base + MBOX_RDDATA_OFFSET);
> +	return dword;
> +}
> +
> +static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
> +{
> +	writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
> +}
> +
> +static inline u64 read_mbox_header(void __iomem *mmio_base)
> +{
> +	u32 high, low;
> +
> +	low  = read_mbox_dword(mmio_base);
> +	high = read_mbox_dword(mmio_base);
> +
> +	return ((u64)high << 32) | low;
> +}
> +
> +static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
> +{
> +	write_mbox_dword(mmio_base, value);
> +	write_mbox_dword(mmio_base, value >> 32);
> +}
> +
> +static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_size)
> +{
> +	int i;
> +
> +	/*
> +	 * The MMIO space is mapped as Uncached (UC). Each write arrives
> +	 * at the device as an individual transaction in program order.
> +	 * The device can then resemble the sequence accordingly.
> +	 */
> +	for (i = 0; i < DWORD_ALIGN(chunk_size); i++)

I don't really like the DWORD_ALIGN(). To me, this is much more clear:

	for (i = 0; i < chunk_size / sizeof(u32); i++)

> +		write_mbox_dword(mmio_base, chunk[i]);
> +}

I'd _maybe_ make it "chunk_bytes" instead of "chunk_size", but that's
just a naming nit.

>  /*
>   * Prepare for a new microcode transfer by resetting hardware and
>   * configuring microcode image info.
> @@ -388,15 +455,71 @@ static bool can_send_next_chunk(struct staging_state *ss)
>  	return true;
>  }
>  
> +/*
> + * Wait for the hardware to complete a transaction.
> + * Return true on success, false on failure.
> + */
> +static bool wait_for_transaction(struct staging_state *ss)
> +{
> +	u32 timeout, status;
> +
> +	/* Allow time for hardware to complete the operation: */
> +	for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
> +		msleep(1);
> +
> +		status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
> +		/* Break out early if the hardware is ready: */
> +		if (status & MASK_MBOX_STATUS_READY)
> +			break;
> +	}
> +
> +	status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);

Why is another read needed here? It had to go through the loop above at
_least_ once, right?


>  /*
>   * Transmit a chunk of the microcode image to the hardware.
>   * Return true if the chunk is processed successfully.
>   */
>  static bool send_data_chunk(struct staging_state *ss)
>  {
> -	pr_debug_once("Staging mailbox loading code needs to be implemented.\n");
> -	ss->state = UCODE_ERROR;
> -	return false;
> +	u16 mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;

The "* 2" needs to be explained.

> +	u32 *chunk = ss->ucode_ptr + ss->offset;

I think calling this "src_chunk" would make a lot of sense.

> +	/*
> +	 * Write 'request' mailbox object in the following order:
> +	 * - Mailbox header includes total size
> +	 * - Command header specifies the load operation
> +	 * - Data section contains a microcode chunk
> +	 */
> +	write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
> +	write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
> +	write_mbox_data(ss->mmio_base, chunk, ss->chunk_size);
> +	ss->bytes_sent += ss->chunk_size;
> +
> +	/*
> +	 * Notify the hardware that the mailbox is ready for processing.
> +	 * The staging hardware will process the request asynchronously.
> +	 */
> +	writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
> +	return wait_for_transaction(ss);
>  }

Why is it important that "The staging hardware will process the request
asynchronously."? We *could* go off and do other things, but we don't.
We sit here and poll. So what does it matter?

>  
>  /*
> @@ -405,9 +528,24 @@ static bool send_data_chunk(struct staging_state *ss)
>   */
>  static bool fetch_next_offset(struct staging_state *ss)
>  {
> -	pr_debug_once("Staging mailbox response handling code needs to be implemented.\n\n");
> -	ss->state = UCODE_ERROR;
> -	return false;
> +	const u16 mbox_size = MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE;
> +
> +	/* All responses begin with the same header value: */
> +	WARN_ON_ONCE(read_mbox_header(ss->mmio_base) != MBOX_HEADER(mbox_size));

Again, this seems like a pr_warn() and then 'return false', not a "panic
the system" kind of error.

We should not be panic'ing the system in code that's purely an
optimization at this point.

It's also, IMNHO, bad form to do important actions inside of a WARN_ON()
condition. I'd much rather it be:

/*
 * All responses start with the same mailbox header. Consume
 * it and ensure that it has the expected contents.
 */
static inline int consume_mbox_header(ss)
{
	u32 header = read_mbox_header(ss->mmio_base);
	u32 expected_header = MBOX_HEADER(mbox_size));

	if (header != expected_header) {
		pr_warn_once(...);
		return -EINVAL;
	}

	return 0;
}	

The other way to do it is to do the three reads next to each other and
make it obvious that there are 3 dwords coming out of the device:

	u32 header    = read_mbox_dword(ss->mmio_base);
	u32 offset    = read_mbox_dword(ss->mmio_base);
	u32 err_word  = read_mbox_dword(ss->mmio_base);

... then go do all the checking after that.
[PATCH v4a 5/6] x86/microcode/intel: Support mailbox transfer
Posted by Chang S. Bae 1 month, 1 week ago
Previously, the functions for sending microcode data and retrieving the
next offset were placeholders, as they required handling the specific
mailbox format. Implement them as following:

== Mailbox Format ==

The staging mailbox consists of two primary sections: 'header' and
'data'. While the microcode must be transferred following this format,
the actual data transfer mechanism involves reading and writing to
specific MMIO registers.

== Mailbox Data Registers ==

Unlike conventional interfaces that allocate MMIO space for each data
chunk, the staging interface features a "narrow" interface, using only
two dword-sized registers for read and write operations.

For example, if writing 2 dwords of data to a device. Typically, the
device would expose 2 dwords of "wide" MMIO space. To send the data to
the device:

	writel(buf[0], io_addr + 0);
	writel(buf[1], io_addr + 1);

But, this interface is a bit different. Instead of having a "wide"
interface where there is separate MMIO space for each word in a
transaction, it has a "narrow" interface where several words are written
to the same spot in MMIO space:

	writel(buf[0], io_addr);
	writel(buf[1], io_addr);

The same goes for the read side.

== Implementation Summary ==

Given that, introduce two layers of helper functions at first:

  * Low-level helpers for reading and writing to data registers directly.
  * Wrapper functions for handling mailbox header and data sections.

Using them, implement send_data_chunk() and fetch_next_offset()
functions. Add explicit error and timeout handling routine in
wait_for_transaction(), finishing up the transfer.

Both hardware error states and implicit errors -- invalid header or
offset -- result in UCODE_ERROR. Emit a clear message for the latter.

Note: The kernel has support for similar mailboxes. But none of them are
compatible with this one. Trying to share code resulted in a bloated
mess, so this code is standalone.

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Tested-by: Anselm Busse <abusse@amazon.de>
---
V4 -> V4a: Addressed Dave's feedback
* fetch_next_offset():
  - Make dword reads explicit
  - Consolidate offset validation -- adding another user for the
    end-offset checker
  - Convert WARN_* with pr_err_once()
* Simplify transaction waiting logic a bit
---
 arch/x86/kernel/cpu/microcode/intel.c | 186 +++++++++++++++++++++++++-
 1 file changed, 179 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index a1b13202330d..f6b365eba6a2 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -22,6 +22,7 @@
 #include <linux/mm.h>
 #include <linux/delay.h>
 #include <linux/io.h>
+#include <linux/pci_ids.h>
 
 #include <asm/cpu_device_id.h>
 #include <asm/processor.h>
@@ -42,8 +43,31 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 
 #define MBOX_CONTROL_OFFSET	0x0
 #define MBOX_STATUS_OFFSET	0x4
+#define MBOX_WRDATA_OFFSET	0x8
+#define MBOX_RDDATA_OFFSET	0xc
 
 #define MASK_MBOX_CTRL_ABORT	BIT(0)
+#define MASK_MBOX_CTRL_GO	BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR	BIT(2)
+#define MASK_MBOX_STATUS_READY	BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS	BIT(0)
+#define MASK_MBOX_RESP_PROGRESS	BIT(1)
+#define MASK_MBOX_RESP_ERROR	BIT(2)
+
+#define MBOX_CMD_LOAD		0x3
+#define MBOX_OBJ_STAGING	0xb
+#define MBOX_HEADER(size)	((PCI_VENDOR_ID_INTEL)    | \
+				 (MBOX_OBJ_STAGING << 16) | \
+				 ((u64)((size) / sizeof(u32)) << 32))
+
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE	sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE	sizeof(u64)
+
+#define MBOX_XACTION_TIMEOUT_MS	(10 * MSEC_PER_SEC)
 
 /* Current microcode patch used in early patching on the APs. */
 static struct microcode_intel *ucode_patch_va __read_mostly;
@@ -330,6 +354,49 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size,
 	return size ? NULL : patch;
 }
 
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+	u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+	/* Acknowledge read completion to the staging hardware */
+	writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+	return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+	writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+	u32 high, low;
+
+	low  = read_mbox_dword(mmio_base);
+	high = read_mbox_dword(mmio_base);
+
+	return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+	write_mbox_dword(mmio_base, value);
+	write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes)
+{
+	int i;
+
+	/*
+	 * The MMIO space is mapped as Uncached (UC). Each write arrives
+	 * at the device as an individual transaction in program order.
+	 * The device can then resemble the sequence accordingly.
+	 */
+	for (i = 0; i < chunk_bytes / sizeof(u32); i++)
+		write_mbox_dword(mmio_base, chunk[i]);
+}
+
 /*
  * Prepare for a new microcode transfer: reset hardware and record the
  * image size.
@@ -385,6 +452,14 @@ static bool can_send_next_chunk(struct staging_state *ss)
 	return true;
 }
 
+/*
+ * The hardware indicates completion by returning a sentinel end offset
+ */
+static inline bool is_end_offset(u32 offset)
+{
+	return offset == UINT_MAX;
+}
+
 /*
  * Determine whether staging is complete: either the hardware signaled
  * the end offset, or no more transactions are permitted (retry limit
@@ -392,18 +467,73 @@ static bool can_send_next_chunk(struct staging_state *ss)
  */
 static inline bool staging_is_complete(struct staging_state *ss)
 {
-	return (ss->offset == UINT_MAX) || !can_send_next_chunk(ss);
+	return is_end_offset(ss->offset) || !can_send_next_chunk(ss);
+}
+
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return 0 on success, or an error code on failure.
+ */
+static int wait_for_transaction(struct staging_state *ss)
+{
+	u32 timeout, status;
+
+	/* Allow time for hardware to complete the operation: */
+	for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+		msleep(1);
+
+		status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+		/* Break out early if the hardware is ready: */
+		if (status & MASK_MBOX_STATUS_READY)
+			break;
+	}
+
+	/* Check for explicit error response */
+	if (status & MASK_MBOX_STATUS_ERROR) {
+		ss->state = UCODE_ERROR;
+		return -EPROTO;
+	}
+
+	/*
+	 * Hardware is neither responded to the action nor signaled any
+	 * error. Treat this as timeout.
+	 */
+	if (!(status & MASK_MBOX_STATUS_READY)) {
+		ss->state = UCODE_TIMEOUT;
+		return -ETIMEDOUT;
+	}
+
+	ss->state = UCODE_OK;
+	return 0;
 }
 
 /*
  * Transmit a chunk of the microcode image to the hardware.
  * Return 0 on success, or an error code on failure.
  */
-static int send_data_chunk(struct staging_state *ss, void *ucode_ptr __maybe_unused)
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr)
 {
-	pr_debug_once("Staging mailbox loading code needs to be implemented.\n");
-	ss->state = UCODE_ERROR;
-	return -EPROTONOSUPPORT;
+	u32 *src_chunk = ucode_ptr + ss->offset;
+	u16 mbox_size;
+
+	/*
+	 * Write a 'request' mailbox object in this order:
+	 *  1. Mailbox header includes total size
+	 *  2. Command header specifies the load operation
+	 *  3. Data section contains a microcode chunk
+	 *
+	 * Thus, the mailbox size is two headers plus the chunk size.
+	 */
+	mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+	write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+	write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+	write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size);
+	ss->bytes_sent += ss->chunk_size;
+
+	/* Notify the hardware that the mailbox is ready for processing. */
+	writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+
+	return wait_for_transaction(ss);
 }
 
 /*
@@ -412,9 +542,51 @@ static int send_data_chunk(struct staging_state *ss, void *ucode_ptr __maybe_unu
  */
 static int fetch_next_offset(struct staging_state *ss)
 {
-	pr_debug_once("Staging mailbox response handling code needs to be implemented.\n\n");
+	const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE);
+	u32 offset, status;
+	u64 header;
+	int err;
+
+	/*
+	 * The 'response' mailbox returns three fields, in order:
+	 *  1. Header
+	 *  2. Next offset in the microcode image
+	 *  3. Status flags
+	 */
+	header = read_mbox_header(ss->mmio_base);
+	offset = read_mbox_dword(ss->mmio_base);
+	status = read_mbox_dword(ss->mmio_base);
+
+	/* All valid responses must start with the expected header. */
+	if (header != expected_header) {
+		pr_err_once("staging: invalid response header\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	/*
+	 * Verify the offset: If not at the end marker, it must not
+	 * exceed the microcode image length
+	 */
+	if (!is_end_offset(offset) && offset > ss->ucode_len) {
+		pr_err_once("staging: invalid response offset\n");
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	/* Hardware may report errors explicitly in the status field */
+	if (status & MASK_MBOX_RESP_ERROR) {
+		err = -EPROTO;
+		goto err_out;
+	}
+
+	ss->offset = offset;
+	ss->state  = UCODE_OK;
+	return 0;
+
+err_out:
 	ss->state = UCODE_ERROR;
-	return -EPROTONOSUPPORT;
+	return err;
 }
 
 /*
-- 
2.48.1