The main function of the redirected interrupt controller is to manage
the redirected-interrupt table, which consists of redirect entries.
When an MSI interrupt is requested, the driver creates a corresponding
redirect entry that describes the target CPU/vector number and the
operating mode of the interrupt. The redirected-interrupt module has an
independent cache; during interrupt routing, redirect entries that hit
in the cache are used preferentially. The driver invalidates stale
entry caches via a command queue.
Co-developed-by: Liupu Wang <wangliupu@loongson.cn>
Signed-off-by: Liupu Wang <wangliupu@loongson.cn>
Signed-off-by: Tianyang Zhang <zhangtianyang@loongson.cn>
---
 arch/loongarch/include/asm/cpu-features.h |   1 +
 arch/loongarch/include/asm/cpu.h          |   2 +
 arch/loongarch/include/asm/loongarch.h    |   6 +
 arch/loongarch/kernel/cpu-probe.c         |   2 +
 drivers/irqchip/Makefile                  |   2 +-
 drivers/irqchip/irq-loongarch-avec.c      |  25 +-
 drivers/irqchip/irq-loongarch-ir.c        | 562 ++++++++++++++++++++++
 drivers/irqchip/irq-loongson.h            |  12 +
 include/linux/cpuhotplug.h                |   1 +
9 files changed, 599 insertions(+), 14 deletions(-)
create mode 100644 drivers/irqchip/irq-loongarch-ir.c
diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h
index fc83bb32f9f0..03f7e93e81e0 100644
--- a/arch/loongarch/include/asm/cpu-features.h
+++ b/arch/loongarch/include/asm/cpu-features.h
@@ -68,5 +68,6 @@
#define cpu_has_ptw cpu_opt(LOONGARCH_CPU_PTW)
#define cpu_has_lspw cpu_opt(LOONGARCH_CPU_LSPW)
#define cpu_has_avecint cpu_opt(LOONGARCH_CPU_AVECINT)
+#define cpu_has_redirectint cpu_opt(LOONGARCH_CPU_REDIRECTINT)
#endif /* __ASM_CPU_FEATURES_H */
diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h
index 98cf4d7b4b0a..33cd96e569d8 100644
--- a/arch/loongarch/include/asm/cpu.h
+++ b/arch/loongarch/include/asm/cpu.h
@@ -102,6 +102,7 @@ enum cpu_type_enum {
#define CPU_FEATURE_PTW 27 /* CPU has hardware page table walker */
#define CPU_FEATURE_LSPW 28 /* CPU has LSPW (lddir/ldpte instructions) */
#define CPU_FEATURE_AVECINT 29 /* CPU has AVEC interrupt */
+#define CPU_FEATURE_REDIRECTINT 30 /* CPU has interrupt remapping */
#define LOONGARCH_CPU_CPUCFG BIT_ULL(CPU_FEATURE_CPUCFG)
#define LOONGARCH_CPU_LAM BIT_ULL(CPU_FEATURE_LAM)
@@ -133,5 +134,6 @@ enum cpu_type_enum {
#define LOONGARCH_CPU_PTW BIT_ULL(CPU_FEATURE_PTW)
#define LOONGARCH_CPU_LSPW BIT_ULL(CPU_FEATURE_LSPW)
#define LOONGARCH_CPU_AVECINT BIT_ULL(CPU_FEATURE_AVECINT)
+#define LOONGARCH_CPU_REDIRECTINT BIT_ULL(CPU_FEATURE_REDIRECTINT)
#endif /* _ASM_CPU_H */
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 52651aa0e583..95e06cb6831e 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -1130,6 +1130,7 @@
#define IOCSRF_FLATMODE BIT_ULL(10)
#define IOCSRF_VM BIT_ULL(11)
#define IOCSRF_AVEC BIT_ULL(15)
+#define IOCSRF_REDIRECTINT BIT_ULL(16)
#define LOONGARCH_IOCSR_VENDOR 0x10
@@ -1189,6 +1190,11 @@
#define LOONGARCH_IOCSR_EXTIOI_NODEMAP_BASE 0x14a0
#define LOONGARCH_IOCSR_EXTIOI_IPMAP_BASE 0x14c0
+#define LOONGARCH_IOCSR_REDIRECT_CFG 0x15e0
+#define LOONGARCH_IOCSR_REDIRECT_TBR 0x15e8 /* IRT BASE REG */
+#define LOONGARCH_IOCSR_REDIRECT_CQB 0x15f0 /* IRT CACHE QUEUE BASE */
+#define LOONGARCH_IOCSR_REDIRECT_CQH 0x15f8 /* IRT CACHE QUEUE HEAD, 32bit */
+#define LOONGARCH_IOCSR_REDIRECT_CQT 0x15fc /* IRT CACHE QUEUE TAIL, 32bit */
#define LOONGARCH_IOCSR_EXTIOI_EN_BASE 0x1600
#define LOONGARCH_IOCSR_EXTIOI_BOUNCE_BASE 0x1680
#define LOONGARCH_IOCSR_EXTIOI_ISR_BASE 0x1800
diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c
index fedaa67cde41..543474fd1399 100644
--- a/arch/loongarch/kernel/cpu-probe.c
+++ b/arch/loongarch/kernel/cpu-probe.c
@@ -289,6 +289,8 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int
c->options |= LOONGARCH_CPU_EIODECODE;
if (config & IOCSRF_AVEC)
c->options |= LOONGARCH_CPU_AVECINT;
+ if (config & IOCSRF_REDIRECTINT)
+ c->options |= LOONGARCH_CPU_REDIRECTINT;
if (config & IOCSRF_VM)
c->options |= LOONGARCH_CPU_HYPERVISOR;
}
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 365bcea9a61f..2bb8618f96d1 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -114,7 +114,7 @@ obj-$(CONFIG_LS1X_IRQ) += irq-ls1x.o
obj-$(CONFIG_TI_SCI_INTR_IRQCHIP) += irq-ti-sci-intr.o
obj-$(CONFIG_TI_SCI_INTA_IRQCHIP) += irq-ti-sci-inta.o
obj-$(CONFIG_TI_PRUSS_INTC) += irq-pruss-intc.o
-obj-$(CONFIG_IRQ_LOONGARCH_CPU) += irq-loongarch-cpu.o irq-loongarch-avec.o
+obj-$(CONFIG_IRQ_LOONGARCH_CPU) += irq-loongarch-cpu.o irq-loongarch-avec.o irq-loongarch-ir.o
obj-$(CONFIG_LOONGSON_LIOINTC) += irq-loongson-liointc.o
obj-$(CONFIG_LOONGSON_EIOINTC) += irq-loongson-eiointc.o
obj-$(CONFIG_LOONGSON_HTPIC) += irq-loongson-htpic.o
diff --git a/drivers/irqchip/irq-loongarch-avec.c b/drivers/irqchip/irq-loongarch-avec.c
index 80e55955a29f..4e114c8ca3d0 100644
--- a/drivers/irqchip/irq-loongarch-avec.c
+++ b/drivers/irqchip/irq-loongarch-avec.c
@@ -24,7 +24,6 @@
#define VECTORS_PER_REG 64
#define IRR_VECTOR_MASK 0xffUL
#define IRR_INVALID_MASK 0x80000000UL
-#define AVEC_MSG_OFFSET 0x100000
#ifdef CONFIG_SMP
struct pending_list {
@@ -47,15 +46,6 @@ struct avecintc_chip {
static struct avecintc_chip loongarch_avec;
-struct avecintc_data {
- struct list_head entry;
- unsigned int cpu;
- unsigned int vec;
- unsigned int prev_cpu;
- unsigned int prev_vec;
- unsigned int moving;
-};
-
static inline void avecintc_enable(void)
{
u64 value;
@@ -85,7 +75,7 @@ static inline void pending_list_init(int cpu)
INIT_LIST_HEAD(&plist->head);
}
-static void avecintc_sync(struct avecintc_data *adata)
+void avecintc_sync(struct avecintc_data *adata)
{
struct pending_list *plist;
@@ -109,7 +99,12 @@ static int avecintc_set_affinity(struct irq_data *data, const struct cpumask *de
return -EBUSY;
if (cpu_online(adata->cpu) && cpumask_test_cpu(adata->cpu, dest))
- return 0;
+ /*
+ * The new affinity configuration has taken effect,
+ * and returning IRQ_SET_MASK_OK_DONE here indicates that no further
+ * changes need to be made in the subsequent process
+ */
+ return IRQ_SET_MASK_OK_DONE;
cpumask_and(&intersect_mask, dest, cpu_online_mask);
@@ -121,7 +116,8 @@ static int avecintc_set_affinity(struct irq_data *data, const struct cpumask *de
adata->cpu = cpu;
adata->vec = vector;
per_cpu_ptr(irq_map, adata->cpu)[adata->vec] = irq_data_to_desc(data);
- avecintc_sync(adata);
+ if (!cpu_has_redirectint)
+ avecintc_sync(adata);
}
irq_data_update_effective_affinity(data, cpumask_of(cpu));
@@ -412,6 +408,9 @@ static int __init pch_msi_parse_madt(union acpi_subtable_headers *header,
static inline int __init acpi_cascade_irqdomain_init(void)
{
+ if (cpu_has_redirectint)
+ return redirect_acpi_init(loongarch_avec.domain);
+
return acpi_table_parse_madt(ACPI_MADT_TYPE_MSI_PIC, pch_msi_parse_madt, 1);
}
diff --git a/drivers/irqchip/irq-loongarch-ir.c b/drivers/irqchip/irq-loongarch-ir.c
new file mode 100644
index 000000000000..ae42ec5028d2
--- /dev/null
+++ b/drivers/irqchip/irq-loongarch-ir.c
@@ -0,0 +1,562 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Loongson Technologies, Inc.
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/spinlock.h>
+#include <linux/msi.h>
+
+#include <asm/irq.h>
+#include <asm/loongarch.h>
+#include <asm/setup.h>
+#include <larchintrin.h>
+
+#include "irq-loongson.h"
+#include "irq-msi-lib.h"
+
+#define IRD_ENTRIES 65536
+
+/* redirect entry size 128bits */
+#define IRD_PAGE_ORDER (20 - PAGE_SHIFT)
+
+/* irt cache invalid queue */
+#define INVALID_QUEUE_SIZE 4096
+
+#define INVALID_QUEUE_PAGE_ORDER (16 - PAGE_SHIFT)
+
+#define GPID_ADDR_MASK 0x3ffffffffffULL
+#define GPID_ADDR_SHIFT 6
+
+#define CQB_SIZE_SHIFT 0
+#define CQB_SIZE_MASK 0xf
+#define CQB_ADDR_SHIFT 12
+#define CQB_ADDR_MASK (0xfffffffffULL)
+
+#define CFG_DISABLE_IDLE 2
+#define INVALID_INDEX 0
+
+#define MAX_IR_ENGINES 16
+
+struct irq_domain *redirect_domain;
+
+struct redirect_entry {
+ struct {
+ __u64 valid : 1,
+ res1 : 5,
+ gpid : 42,
+ res2 : 8,
+ vector : 8;
+ } lo;
+ __u64 hi;
+};
+
+struct redirect_gpid {
+ u64 pir[4]; /* Pending interrupt requested */
+ u8 en : 1, /* doorbell */
+ res0 : 7;
+ u8 irqnum;
+ u16 res1;
+ u32 dst;
+ u32 rsvd[6];
+} __aligned(64);
+
+struct redirect_table {
+ int node;
+ struct redirect_entry *table;
+ unsigned long *bitmap;
+ unsigned int nr_ird;
+ struct page *page;
+ raw_spinlock_t lock;
+};
+
+struct redirect_item {
+ int index;
+ struct redirect_entry *entry;
+ struct redirect_gpid *gpid;
+ struct redirect_table *table;
+};
+
+struct redirect_queue {
+ int node;
+ u64 base;
+ u32 max_size;
+ int head;
+ int tail;
+ struct page *page;
+ raw_spinlock_t lock;
+};
+
+struct irde_desc {
+ struct redirect_table ird_table;
+ struct redirect_queue inv_queue;
+ int finish;
+};
+
+struct irde_inv_cmd {
+ union {
+ __u64 cmd_info;
+ struct {
+ __u64 res1 : 4,
+ type : 1,
+ need_notice : 1,
+ pad : 2,
+ index : 16,
+ pad2 : 40;
+ } index;
+ };
+ __u64 notice_addr;
+};
+
+static struct irde_desc irde_descs[MAX_IR_ENGINES];
+static phys_addr_t msi_base_addr;
+static phys_addr_t redirect_reg_base = 0x1fe00000;
+
+#define REDIRECT_REG_BASE(reg, node) \
+ (UNCACHE_BASE | redirect_reg_base | (u64)(node) << NODE_ADDRSPACE_SHIFT | (reg))
+#define redirect_reg_queue_head(node) REDIRECT_REG_BASE(LOONGARCH_IOCSR_REDIRECT_CQH, (node))
+#define redirect_reg_queue_tail(node) REDIRECT_REG_BASE(LOONGARCH_IOCSR_REDIRECT_CQT, (node))
+#define read_queue_head(node) (*((u32 *)(redirect_reg_queue_head(node))))
+#define read_queue_tail(node) (*((u32 *)(redirect_reg_queue_tail(node))))
+#define write_queue_tail(node, val) (*((u32 *)(redirect_reg_queue_tail(node))) = (val))
+
+static inline bool invalid_queue_is_full(int node, u32 *tail)
+{
+ u32 head = read_queue_head(node);
+
+ *tail = read_queue_tail(node);
+
+ return head == ((*tail + 1) % INVALID_QUEUE_SIZE);
+}
+
+static void invalid_enqueue(struct redirect_queue *rqueue, struct irde_inv_cmd *cmd)
+{
+ struct irde_inv_cmd *inv_addr;
+ u32 tail;
+
+ guard(raw_spinlock_irqsave)(&rqueue->lock);
+
+ while (invalid_queue_is_full(rqueue->node, &tail))
+ cpu_relax();
+
+ inv_addr = (struct irde_inv_cmd *)(rqueue->base + tail * sizeof(struct irde_inv_cmd));
+ memcpy(inv_addr, cmd, sizeof(struct irde_inv_cmd));
+ tail = (tail + 1) % INVALID_QUEUE_SIZE;
+
+ /*
+ * the barrier ensure that the previously written data is visible
+ * before updating the tail register
+ */
+ wmb();
+
+ write_queue_tail(rqueue->node, tail);
+}
+
+static void irde_invlid_entry_node(struct redirect_item *item)
+{
+ struct redirect_queue *rqueue;
+ struct irde_inv_cmd cmd;
+ volatile u64 raddr = 0;
+ int node = item->table->node;
+
+ rqueue = &(irde_descs[node].inv_queue);
+ cmd.cmd_info = 0;
+ cmd.index.type = INVALID_INDEX;
+ cmd.index.need_notice = 1;
+ cmd.index.index = item->index;
+ cmd.notice_addr = (u64)(__pa(&raddr));
+
+ invalid_enqueue(rqueue, &cmd);
+
+ while (!raddr)
+ cpu_relax();
+
+}
+
+static inline struct avecintc_data *irq_data_get_avec_data(struct irq_data *data)
+{
+ return data->parent_data->chip_data;
+}
+
+static int redirect_table_alloc(struct redirect_item *item, struct redirect_table *ird_table)
+{
+ int index;
+
+ guard(raw_spinlock_irqsave)(&ird_table->lock);
+
+ index = find_first_zero_bit(ird_table->bitmap, IRD_ENTRIES);
+ if (index >= IRD_ENTRIES) {
+ pr_err("No redirect entry to use\n");
+ return -ENOMEM;
+ }
+
+ __set_bit(index, ird_table->bitmap);
+
+ item->index = index;
+ item->entry = &ird_table->table[index];
+ item->table = ird_table;
+
+ return 0;
+}
+
+static void redirect_table_free(struct redirect_item *item)
+{
+ struct redirect_table *ird_table;
+ struct redirect_entry *entry;
+
+ ird_table = item->table;
+
+ entry = item->entry;
+ memset(entry, 0, sizeof(struct redirect_entry));
+
+ scoped_guard(raw_spinlock_irqsave, &ird_table->lock)
+ bitmap_release_region(ird_table->bitmap, item->index, 0);
+
+ kfree(item->gpid);
+
+ irde_invlid_entry_node(item);
+}
+
+static inline void redirect_domain_prepare_entry(struct redirect_item *item,
+ struct avecintc_data *adata)
+{
+ struct redirect_entry *entry = item->entry;
+
+ item->gpid->en = 1;
+ item->gpid->irqnum = adata->vec;
+ item->gpid->dst = adata->cpu;
+
+ entry->lo.valid = 1;
+ entry->lo.gpid = ((long)item->gpid >> GPID_ADDR_SHIFT) & (GPID_ADDR_MASK);
+ entry->lo.vector = 0xff;
+}
+
+static int redirect_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force)
+{
+ struct redirect_item *item = data->chip_data;
+ struct avecintc_data *adata;
+ int ret;
+
+ ret = irq_chip_set_affinity_parent(data, dest, force);
+ if (ret == IRQ_SET_MASK_OK_DONE) {
+ return ret;
+ } else if (ret) {
+ pr_err("IRDE:set_affinity error %d\n", ret);
+ return ret;
+ }
+
+ adata = irq_data_get_avec_data(data);
+ redirect_domain_prepare_entry(item, adata);
+ irde_invlid_entry_node(item);
+ avecintc_sync(adata);
+
+ return IRQ_SET_MASK_OK;
+}
+
+static void redirect_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
+{
+ struct redirect_item *item;
+
+ item = irq_data_get_irq_chip_data(d);
+ msg->address_lo = (msi_base_addr | 1 << 2 | ((item->index & 0xffff) << 4));
+ msg->address_hi = 0x0;
+ msg->data = 0x0;
+}
+
+static struct irq_chip loongarch_redirect_chip = {
+ .name = "REDIRECT",
+ .irq_ack = irq_chip_ack_parent,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_set_affinity = redirect_set_affinity,
+ .irq_compose_msi_msg = redirect_compose_msi_msg,
+};
+
+static void redirect_free_resources(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ for (int i = 0; i < nr_irqs; i++) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data && irq_data->chip_data) {
+ struct redirect_item *item;
+
+ item = irq_data->chip_data;
+ redirect_table_free(item);
+ kfree(item);
+ }
+ }
+}
+
+static int redirect_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct redirect_table *ird_table;
+ struct avecintc_data *avec_data;
+ struct irq_data *irq_data;
+ msi_alloc_info_t *info;
+ int ret, i, node;
+
+ info = (msi_alloc_info_t *)arg;
+ node = dev_to_node(info->desc->dev);
+ ird_table = &irde_descs[node].ird_table;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_irqs; i++) {
+ struct redirect_item *item;
+
+ item = kzalloc(sizeof(struct redirect_item), GFP_KERNEL);
+ if (!item) {
+ pr_err("Alloc redirect descriptor failed\n");
+ goto out_free_resources;
+ }
+
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+
+ avec_data = irq_data_get_avec_data(irq_data);
+ ret = redirect_table_alloc(item, ird_table);
+ if (ret) {
+ pr_err("Alloc redirect table entry failed\n");
+ goto out_free_resources;
+ }
+
+ item->gpid = kzalloc_node(sizeof(struct redirect_gpid), GFP_KERNEL, node);
+ if (!item->gpid) {
+ pr_err("Alloc redirect GPID failed\n");
+ goto out_free_resources;
+ }
+
+ irq_data->chip_data = item;
+ irq_data->chip = &loongarch_redirect_chip;
+ redirect_domain_prepare_entry(item, avec_data);
+ }
+ return 0;
+
+out_free_resources:
+ redirect_free_resources(domain, virq, nr_irqs);
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+
+ return -ENOMEM;
+}
+
+static void redirect_domain_free(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs)
+{
+ redirect_free_resources(domain, virq, nr_irqs);
+ return irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops redirect_domain_ops = {
+ .alloc = redirect_domain_alloc,
+ .free = redirect_domain_free,
+ .select = msi_lib_irq_domain_select,
+};
+
+static int redirect_queue_init(int node)
+{
+ struct redirect_queue *rqueue = &(irde_descs[node].inv_queue);
+ struct page *pages;
+
+ pages = alloc_pages_node(0, GFP_KERNEL | __GFP_ZERO, INVALID_QUEUE_PAGE_ORDER);
+ if (!pages) {
+ pr_err("Node [%d] Invalid Queue alloc pages failed!\n", node);
+ return -ENOMEM;
+ }
+
+ rqueue->page = pages;
+ rqueue->base = (u64)page_address(pages);
+ rqueue->max_size = INVALID_QUEUE_SIZE;
+ rqueue->head = 0;
+ rqueue->tail = 0;
+ rqueue->node = node;
+ raw_spin_lock_init(&rqueue->lock);
+
+ iocsr_write32(0, LOONGARCH_IOCSR_REDIRECT_CQH);
+ iocsr_write32(0, LOONGARCH_IOCSR_REDIRECT_CQT);
+ iocsr_write64(((rqueue->base & (CQB_ADDR_MASK << CQB_ADDR_SHIFT)) |
+ (CQB_SIZE_MASK << CQB_SIZE_SHIFT)), LOONGARCH_IOCSR_REDIRECT_CQB);
+ return 0;
+}
+
+static int redirect_table_init(int node)
+{
+ struct redirect_table *ird_table = &(irde_descs[node].ird_table);
+ unsigned long *bitmap;
+ struct page *pages;
+ int ret;
+
+ pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, IRD_PAGE_ORDER);
+ if (!pages) {
+ pr_err("Node [%d] redirect table alloc pages failed!\n", node);
+ return -ENOMEM;
+ }
+ ird_table->page = pages;
+ ird_table->table = page_address(pages);
+
+ bitmap = bitmap_zalloc(IRD_ENTRIES, GFP_KERNEL);
+ if (!bitmap) {
+ pr_err("Node [%d] redirect table bitmap alloc pages failed!\n", node);
+ ret = -ENOMEM;
+ goto free_pages;
+ }
+
+ ird_table->bitmap = bitmap;
+ ird_table->nr_ird = IRD_ENTRIES;
+ ird_table->node = node;
+
+ raw_spin_lock_init(&ird_table->lock);
+
+ if (redirect_queue_init(node)) {
+ ret = -EINVAL;
+ goto free_bitmap;
+ }
+
+ iocsr_write64(CFG_DISABLE_IDLE, LOONGARCH_IOCSR_REDIRECT_CFG);
+ iocsr_write64(__pa(ird_table->table), LOONGARCH_IOCSR_REDIRECT_TBR);
+
+ return 0;
+
+free_bitmap:
+ bitmap_free(bitmap);
+free_pages:
+ __free_pages(pages, IRD_PAGE_ORDER);
+ return ret;
+}
+
+static void redirect_table_fini(int node)
+{
+ struct redirect_table *ird_table = &(irde_descs[node].ird_table);
+ struct redirect_queue *rqueue = &(irde_descs[node].inv_queue);
+
+ if (ird_table->page) {
+ __free_pages(ird_table->page, IRD_PAGE_ORDER);
+ ird_table->table = NULL;
+ ird_table->page = NULL;
+ }
+
+ if (ird_table->bitmap) {
+ bitmap_free(ird_table->bitmap);
+ ird_table->bitmap = NULL;
+ }
+
+ if (rqueue->page) {
+ __free_pages(rqueue->page, INVALID_QUEUE_PAGE_ORDER);
+ rqueue->page = NULL;
+ rqueue->base = 0;
+ }
+
+ iocsr_write64(0, LOONGARCH_IOCSR_REDIRECT_CQB);
+ iocsr_write64(0, LOONGARCH_IOCSR_REDIRECT_TBR);
+}
+
+static int redirect_cpu_online(unsigned int cpu)
+{
+ int ret, node = cpu_to_node(cpu);
+
+ if (cpu != cpumask_first(cpumask_of_node(node)))
+ return 0;
+
+ if (irde_descs[node].finish)
+ return 0;
+
+ ret = redirect_table_init(node);
+ if (ret) {
+ redirect_table_fini(node);
+ return -EINVAL;
+ }
+
+ irde_descs[node].finish = 1;
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+static int __init redirect_reg_base_init(void)
+{
+ acpi_status status;
+ uint64_t addr;
+
+ if (acpi_disabled)
+ return 0;
+
+ status = acpi_evaluate_integer(NULL, "\\_SB.NO00", NULL, &addr);
+ if (ACPI_FAILURE(status))
+ pr_info("redirect_iocsr_base used default 0x1fe00000\n");
+ else
+ redirect_reg_base = addr;
+
+ return 0;
+}
+subsys_initcall_sync(redirect_reg_base_init);
+
+static int __init pch_msi_parse_madt(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_msi_pic *pchmsi_entry = (struct acpi_madt_msi_pic *)header;
+
+ msi_base_addr = pchmsi_entry->msg_address - AVEC_MSG_OFFSET;
+
+ return pch_msi_acpi_init_avec(redirect_domain);
+}
+
+static int __init acpi_cascade_irqdomain_init(void)
+{
+ return acpi_table_parse_madt(ACPI_MADT_TYPE_MSI_PIC, pch_msi_parse_madt, 1);
+}
+
+int __init redirect_acpi_init(struct irq_domain *parent)
+{
+ struct fwnode_handle *fwnode;
+ struct irq_domain *domain;
+ int ret;
+
+ fwnode = irq_domain_alloc_named_fwnode("redirect");
+ if (!fwnode) {
+ pr_err("Unable to alloc redirect domain handle\n");
+ goto fail;
+ }
+
+ domain = irq_domain_create_hierarchy(parent, 0, IRD_ENTRIES, fwnode,
+ &redirect_domain_ops, irde_descs);
+ if (!domain) {
+ pr_err("Unable to alloc redirect domain\n");
+ goto out_free_fwnode;
+ }
+
+ redirect_domain = domain;
+
+ ret = redirect_table_init(0);
+ if (ret)
+ goto out_free_table;
+
+ ret = acpi_cascade_irqdomain_init();
+ if (ret < 0) {
+ pr_err("Failed to cascade IRQ domain, ret=%d\n", ret);
+ goto out_free_table;
+ }
+
+ cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_REDIRECT_STARTING,
+ "irqchip/loongarch/redirect:starting",
+ redirect_cpu_online, NULL);
+
+ pr_info("loongarch irq redirect modules init succeeded\n");
+ return 0;
+
+out_free_table:
+ redirect_table_fini(0);
+ irq_domain_remove(redirect_domain);
+ redirect_domain = NULL;
+out_free_fwnode:
+ irq_domain_free_fwnode(fwnode);
+fail:
+ return -EINVAL;
+}
+#endif
diff --git a/drivers/irqchip/irq-loongson.h b/drivers/irqchip/irq-loongson.h
index 11fa138d1f44..05ad40ffb62b 100644
--- a/drivers/irqchip/irq-loongson.h
+++ b/drivers/irqchip/irq-loongson.h
@@ -5,6 +5,15 @@
#ifndef _DRIVERS_IRQCHIP_IRQ_LOONGSON_H
#define _DRIVERS_IRQCHIP_IRQ_LOONGSON_H
+#define AVEC_MSG_OFFSET 0x100000
+struct avecintc_data {
+ struct list_head entry;
+ unsigned int cpu;
+ unsigned int vec;
+ unsigned int prev_cpu;
+ unsigned int prev_vec;
+ unsigned int moving;
+};
int find_pch_pic(u32 gsi);
@@ -24,4 +33,7 @@ int pch_msi_acpi_init(struct irq_domain *parent,
struct acpi_madt_msi_pic *acpi_pchmsi);
int pch_msi_acpi_init_avec(struct irq_domain *parent);
+int redirect_acpi_init(struct irq_domain *parent);
+
+void avecintc_sync(struct avecintc_data *adata);
#endif /* _DRIVERS_IRQCHIP_IRQ_LOONGSON_H */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 1987400000b4..6a4ff072db42 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -145,6 +145,7 @@ enum cpuhp_state {
CPUHP_AP_IRQ_MIPS_GIC_STARTING,
CPUHP_AP_IRQ_EIOINTC_STARTING,
CPUHP_AP_IRQ_AVECINTC_STARTING,
+ CPUHP_AP_IRQ_REDIRECT_STARTING,
CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
CPUHP_AP_IRQ_THEAD_ACLINT_SSWI_STARTING,
CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
--
2.41.0
On Tue, Jun 10 2025 at 19:42, Tianyang Zhang wrote:

>     if (cpu_online(adata->cpu) && cpumask_test_cpu(adata->cpu, dest))
> -           return 0;
> +           /*
> +            * The new affinity configuration has taken effect,
> +            * and returning IRQ_SET_MASK_OK_DONE here indicates that no further
> +            * changes need to be made in the subsequent process

This is not what IRQ_SET_MASK_OK_DONE is about. The documentation
clearly says:

 * IRQ_SET_MASK_OK_DONE  - Same as IRQ_SET_MASK_OK for core. Special code to
 *                         support stacked irqchips, which indicates skipping
 *                         all descendant irqchips.

It's not about further changes. It's about preventing invoking
set_affinity() callbacks down the hierarchy.

And you still fail to explain why this change is correct for the
existing code. That explanation wants to be in the changelog of the
separate patch doing this change.

And then you can spare the comment, which is btw. also violating the
bracket rules in the tip maintainers documentation.

> +            */
> +           return IRQ_SET_MASK_OK_DONE;
>
>     cpumask_and(&intersect_mask, dest, cpu_online_mask);
>
> @@ -121,7 +116,8 @@ static int avecintc_set_affinity(struct irq_data *data, const struct cpumask *de
>             adata->cpu = cpu;
>             adata->vec = vector;
>             per_cpu_ptr(irq_map, adata->cpu)[adata->vec] = irq_data_to_desc(data);
> -           avecintc_sync(adata);
> +           if (!cpu_has_redirectint)
> +                   avecintc_sync(adata);
>     }
>
>     irq_data_update_effective_affinity(data, cpumask_of(cpu));
> @@ -412,6 +408,9 @@ static int __init pch_msi_parse_madt(union acpi_subtable_headers *header,
>
>  static inline int __init acpi_cascade_irqdomain_init(void)
>  {
> +   if (cpu_has_redirectint)
> +           return redirect_acpi_init(loongarch_avec.domain);
> +
>     return acpi_table_parse_madt(ACPI_MADT_TYPE_MSI_PIC, pch_msi_parse_madt, 1);
>  }
>
> diff --git a/drivers/irqchip/irq-loongarch-ir.c b/drivers/irqchip/irq-loongarch-ir.c
> new file mode 100644
> index 000000000000..ae42ec5028d2
> --- /dev/null
> +++ b/drivers/irqchip/irq-loongarch-ir.c
> @@ -0,0 +1,562 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2020 Loongson Technologies, Inc.
> + */
> +
> +#include <linux/cpuhotplug.h>
> +#include <linux/init.h>
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/irq.h>
> +#include <linux/irqchip.h>
> +#include <linux/irqdomain.h>
> +#include <linux/spinlock.h>
> +#include <linux/msi.h>
> +
> +#include <asm/irq.h>
> +#include <asm/loongarch.h>
> +#include <asm/setup.h>
> +#include <larchintrin.h>
> +
> +#include "irq-loongson.h"
> +#include "irq-msi-lib.h"
> +
> +#define IRD_ENTRIES 65536
> +
> +/* redirect entry size 128bits */
> +#define IRD_PAGE_ORDER (20 - PAGE_SHIFT)
> +
> +/* irt cache invalid queue */
> +#define INVALID_QUEUE_SIZE 4096

Use SPACE after #define, not TAB. All over the place...

> +#define INVALID_QUEUE_PAGE_ORDER (16 - PAGE_SHIFT)

I'm pretty sure that the above magic numbers have dependencies in some
way, right? So why is it not expressed in a way which makes this obvious?

These magic numbers are just incomprehensible as they make the reader
guess what this is about.

Here is my (probably not so) wild guess:

#define IRD_ENTRY_SIZE                  16

#define IRD_ENTRIES                     65536
#define IRD_PAGE_ORDER                  get_order(IRD_ENTRIES * IRD_ENTRY_SIZE)

#define INVALID_QUEUE_SIZE              4096
#define IRD_INVALID__QUEUE_PAGE_ORDER   get_order(INVALID_QUEUE_SIZE * IRD_ENTRY_SIZE)

No?
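[Aside: the arithmetic behind that guess checks out against the hand-coded
orders, assuming the 16-byte entry size implied by the patch's "128bits"
comment. A minimal sketch; check_ird_orders() is a hypothetical helper that
exists only to demonstrate the equivalence and is not part of either
version of the patch:]

#include <asm/page.h>           /* PAGE_SHIFT, get_order() */
#include <linux/build_bug.h>

#define IRD_ENTRY_SIZE          16      /* one 128-bit redirect entry */
#define IRD_ENTRIES             65536
#define IRD_PAGE_ORDER          get_order(IRD_ENTRIES * IRD_ENTRY_SIZE)

#define INVALID_QUEUE_SIZE      4096
#define INVALID_QUEUE_PAGE_ORDER get_order(INVALID_QUEUE_SIZE * IRD_ENTRY_SIZE)

static void __init check_ird_orders(void)
{
        /* 65536 entries * 16 bytes = 1 MiB, i.e. the hand-coded (20 - PAGE_SHIFT) */
        BUILD_BUG_ON(IRD_PAGE_ORDER != 20 - PAGE_SHIFT);
        /* 4096 commands * 16 bytes = 64 KiB, i.e. the hand-coded (16 - PAGE_SHIFT) */
        BUILD_BUG_ON(INVALID_QUEUE_PAGE_ORDER != 16 - PAGE_SHIFT);
}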
> +#define GPID_ADDR_MASK 0x3ffffffffffULL

GENMASK()

> +#define GPID_ADDR_SHIFT 6
> +
> +#define CQB_SIZE_SHIFT 0
> +#define CQB_SIZE_MASK 0xf
> +#define CQB_ADDR_SHIFT 12
> +#define CQB_ADDR_MASK (0xfffffffffULL)

GENMASK()

> +#define CFG_DISABLE_IDLE 2
> +#define INVALID_INDEX 0
> +
> +#define MAX_IR_ENGINES 16

> +struct redirect_gpid {
> +    u64 pir[4]; /* Pending interrupt requested */
> +    u8  en : 1, /* doorbell */

Use C++ style tail comments for this as documented. Do I really have
to point out every single item in the documentation explicitly or can
you just read all of it and follow the guidelines?

> +struct irde_desc {
> +    struct redirect_table ird_table;
> +    struct redirect_queue inv_queue;
> +    int finish;

Groan.

"Struct declarations should align the struct member names in a tabular fashion:

struct bar_order {
        unsigned int            guest_id;
        int                     ordered_item;
        struct menu             *menu;
};"

It clearly says to align the struct member names, no?

Otherwise the example would be:

struct bar_order {
        unsigned int guest_id;
        int ordered_item;
        struct menu *menu;
};

which is unreadable garbage obviously.

> +};

> +static void invalid_enqueue(struct redirect_queue *rqueue, struct irde_inv_cmd *cmd)
> +{
> +    struct irde_inv_cmd *inv_addr;
> +    u32 tail;
> +
> +    guard(raw_spinlock_irqsave)(&rqueue->lock);
> +
> +    while (invalid_queue_is_full(rqueue->node, &tail))
> +            cpu_relax();
> +
> +    inv_addr = (struct irde_inv_cmd *)(rqueue->base + tail * sizeof(struct irde_inv_cmd));
> +    memcpy(inv_addr, cmd, sizeof(struct irde_inv_cmd));

Seriously?

struct redirect_queue {
        int                     node;
        union {
                u64                     base;
                struct irde_inv_cmd     *cmds;
        };
        u32                     max_size;
        ...
};

and then this becomes

        memcpy(&rqueue->cmds[tail], cmd, sizeof(*cmd));

That's too comprehensible, right?

> +    tail = (tail + 1) % INVALID_QUEUE_SIZE;

Why is this before the barrier? The barrier does not do anything about
this and you can simplify this. See below.

> +    /*
> +     * the barrier ensure that the previously written data is visible

This barrier ensures .....

> +     * before updating the tail register

And as there is no rmb() counterpart you want to explain that this is
serializing against the hardware.

> +     */
> +    wmb();
> +
> +    write_queue_tail(rqueue->node, tail);

        write_queue_tail(rqueue->node, (tail + 1) & INVALID_QUEUE_MASK);

No?

> +}
> +
> +static void irde_invlid_entry_node(struct redirect_item *item)
> +{
> +    struct redirect_queue *rqueue;
> +    struct irde_inv_cmd cmd;
> +    volatile u64 raddr = 0;

No. See Documentation/process/volatile-considered-harmful.rst

> +static void redirect_table_free(struct redirect_item *item)
> +{
> +    struct redirect_table *ird_table;
> +    struct redirect_entry *entry;
> +
> +    ird_table = item->table;
> +
> +    entry = item->entry;
> +    memset(entry, 0, sizeof(struct redirect_entry));

        memset(entry, 0, sizeof(*entry));

It's obvious why using sizeof(type) is a bad idea.

> +    scoped_guard(raw_spinlock_irqsave, &ird_table->lock)

raw_spinlock_irq as this code can never be invoked from an interrupt
disabled region.
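[Aside: on the volatile point above, the usual replacement is READ_ONCE()
in the poll loop. A minimal sketch of the completion wait without the
volatile, keeping the patch's own types and helpers; the function name
irde_invalidate_entry() is hypothetical, and this assumes the IR engine's
write to the notice address is cache-coherent, as the original code
already does:]

static void irde_invalidate_entry(struct redirect_item *item)
{
        struct redirect_queue *rqueue = &irde_descs[item->table->node].inv_queue;
        struct irde_inv_cmd cmd = { };
        u64 raddr = 0;

        cmd.index.type = INVALID_INDEX;
        cmd.index.need_notice = 1;
        cmd.index.index = item->index;
        cmd.notice_addr = __pa(&raddr);

        invalid_enqueue(rqueue, &cmd);

        /* READ_ONCE() forces a fresh load each iteration; no volatile needed */
        while (!READ_ONCE(raddr))
                cpu_relax();
}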
> +    bitmap_release_region(ird_table->bitmap, item->index, 0);
> +
> +    kfree(item->gpid);
> +
> +    irde_invlid_entry_node(item);
> +}

> +static inline void redirect_domain_prepare_entry(struct redirect_item *item,
> +                                            struct avecintc_data *adata)
> +{
> +    struct redirect_entry *entry = item->entry;
> +
> +    item->gpid->en = 1;
> +    item->gpid->irqnum = adata->vec;
> +    item->gpid->dst = adata->cpu;
> +
> +    entry->lo.valid = 1;
> +    entry->lo.gpid = ((long)item->gpid >> GPID_ADDR_SHIFT) & (GPID_ADDR_MASK);

Hardware addresses are type unsigned long and not long.

> +    entry->lo.vector = 0xff;
> +}

> +static void redirect_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
> +{
> +    struct redirect_item *item;
> +
> +    item = irq_data_get_irq_chip_data(d);

Just move the initialization into the declaration line.

> +    msg->address_lo = (msi_base_addr | 1 << 2 | ((item->index & 0xffff) << 4));
> +    msg->address_hi = 0x0;
> +    msg->data = 0x0;
> +}

> +static void redirect_free_resources(struct irq_domain *domain, unsigned int virq,
> +                               unsigned int nr_irqs)
> +{
> +    for (int i = 0; i < nr_irqs; i++) {
> +            struct irq_data *irq_data;
> +
> +            irq_data = irq_domain_get_irq_data(domain, virq + i);

Ditto.

> +            if (irq_data && irq_data->chip_data) {
> +                    struct redirect_item *item;
> +
> +                    item = irq_data->chip_data;

Same and all over the place.

> +                    redirect_table_free(item);
> +                    kfree(item);
> +            }
> +    }
> +}
> +
> +static int redirect_domain_alloc(struct irq_domain *domain, unsigned int virq,
> +            unsigned int nr_irqs, void *arg)

I asked you before to align the arguments of the second line properly
and according to documentation.

> +{
> +    struct redirect_table *ird_table;
> +    struct avecintc_data *avec_data;
> +    struct irq_data *irq_data;
> +    msi_alloc_info_t *info;
> +    int ret, i, node;
> +
> +    info = (msi_alloc_info_t *)arg;

What's that type cast for?

> +    node = dev_to_node(info->desc->dev);
> +    ird_table = &irde_descs[node].ird_table;
> +
> +    ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
> +    if (ret < 0)
> +            return ret;
> +
> +    for (i = 0; i < nr_irqs; i++) {
> +            struct redirect_item *item;
> +
> +            item = kzalloc(sizeof(struct redirect_item), GFP_KERNEL);
> +            if (!item) {
> +                    pr_err("Alloc redirect descriptor failed\n");
> +                    goto out_free_resources;
> +            }
> +
> +            irq_data = irq_domain_get_irq_data(domain, virq + i);
> +
> +            avec_data = irq_data_get_avec_data(irq_data);

Neither irq_data nor avec_data are used here and only required way
down. Can you structure your code so it makes sense?

> +            ret = redirect_table_alloc(item, ird_table);
> +            if (ret) {
> +                    pr_err("Alloc redirect table entry failed\n");
> +                    goto out_free_resources;
> +            }
> +
> +            item->gpid = kzalloc_node(sizeof(struct redirect_gpid), GFP_KERNEL, node);
> +            if (!item->gpid) {
> +                    pr_err("Alloc redirect GPID failed\n");
> +                    goto out_free_resources;
> +            }

Why do you need this extra allocation here instead of simply embedding
gpid into item?

> +            irq_data->chip_data = item;
> +            irq_data->chip = &loongarch_redirect_chip;
> +            redirect_domain_prepare_entry(item, avec_data);
> +    }
> +    return 0;

> +    iocsr_write64(((rqueue->base & (CQB_ADDR_MASK << CQB_ADDR_SHIFT)) |
> +                            (CQB_SIZE_MASK << CQB_SIZE_SHIFT)), LOONGARCH_IOCSR_REDIRECT_CQB);

Align second line properly.

> +    return 0;
> +}
> +
> +static int redirect_table_init(int node)
> +{
> +    struct redirect_table *ird_table = &(irde_descs[node].ird_table);

Remove the pointless brackets.
> +    unsigned long *bitmap;
> +    struct page *pages;
> +    int ret;
> +
> +    pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, IRD_PAGE_ORDER);
> +    if (!pages) {
> +            pr_err("Node [%d] redirect table alloc pages failed!\n", node);
> +            return -ENOMEM;
> +    }
> +    ird_table->page = pages;
> +    ird_table->table = page_address(pages);
> +
> +    bitmap = bitmap_zalloc(IRD_ENTRIES, GFP_KERNEL);
> +    if (!bitmap) {
> +            pr_err("Node [%d] redirect table bitmap alloc pages failed!\n", node);
> +            ret = -ENOMEM;
> +            goto free_pages;
> +    }
> +
> +    ird_table->bitmap = bitmap;
> +    ird_table->nr_ird = IRD_ENTRIES;
> +    ird_table->node = node;
> +
> +    raw_spin_lock_init(&ird_table->lock);
> +
> +    if (redirect_queue_init(node)) {
> +            ret = -EINVAL;
> +            goto free_bitmap;

So redirect_queue_init() returns -ENOMEM which is then converted to
-EINVAL here. That makes absolutely no sense at all.

Neither makes the 'ret' variable sense because all failures should
return -ENOMEM and therefore you can make redirect_queue_init() return
bool (true on success) and return -ENOMEM in the failure path.

No?

> +static void redirect_table_fini(int node)
> +{
> +    struct redirect_table *ird_table = &(irde_descs[node].ird_table);
> +    struct redirect_queue *rqueue = &(irde_descs[node].inv_queue);

No brackets. They have no value and just disturb reading.

> +static int redirect_cpu_online(unsigned int cpu)
> +{
> +    int ret, node = cpu_to_node(cpu);
> +
> +    if (cpu != cpumask_first(cpumask_of_node(node)))
> +            return 0;
> +
> +    if (irde_descs[node].finish)
> +            return 0;

What's this finish thing about?

Neither the CPU mask check nor this finish hack is required:

       if (irde_descs[node].pages)
               return 0;

covers all of it, no?

> +    ret = redirect_table_init(node);
> +    if (ret) {
> +            redirect_table_fini(node);

What is this for? You already mopped up the mess in the failure path of
redirect_table_init(), so doing it again makes no sense.

Just get rid of the failure path in redirect_table_init() and let that
return a bool (true on success) and invoke redirect_table_fini() here

> +            return -EINVAL;

Seriously? The failure condition is -ENOMEM so why do you want to change
that?

> +    }
> +
> +    irde_descs[node].finish = 1;
> +    return 0;
> +}
> +
> +#ifdef CONFIG_ACPI

What's the TAB for?

> +static int __init redirect_reg_base_init(void)
> +{
> +    acpi_status status;
> +    uint64_t addr;
> +
> +    if (acpi_disabled)
> +            return 0;
> +
> +    status = acpi_evaluate_integer(NULL, "\\_SB.NO00", NULL, &addr);
> +    if (ACPI_FAILURE(status))
> +            pr_info("redirect_iocsr_base used default 0x1fe00000\n");
> +    else
> +            redirect_reg_base = addr;
> +
> +    return 0;
> +}
> +subsys_initcall_sync(redirect_reg_base_init);
> +
> +static int __init pch_msi_parse_madt(union acpi_subtable_headers *header,
> +                            const unsigned long end)

Sigh.

> +int __init redirect_acpi_init(struct irq_domain *parent)
> +{
> +    struct fwnode_handle *fwnode;
> +    struct irq_domain *domain;
> +    int ret;
> +
> +    fwnode = irq_domain_alloc_named_fwnode("redirect");
> +    if (!fwnode) {
> +            pr_err("Unable to alloc redirect domain handle\n");
> +            goto fail;
> +    }
> +
> +    domain = irq_domain_create_hierarchy(parent, 0, IRD_ENTRIES, fwnode,
> +                                         &redirect_domain_ops, irde_descs);
> +    if (!domain) {
> +            pr_err("Unable to alloc redirect domain\n");
> +            goto out_free_fwnode;
> +    }
> +
> +    redirect_domain = domain;

What's the point of this local domain variable?

> +    ret = redirect_table_init(0);
> +    if (ret)
> +            goto out_free_table;
> +

Oh well...

Thanks,

        tglx
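[Aside: pulling the structural comments from this review together, the
enqueue path could end up roughly like the sketch below. INVALID_QUEUE_MASK
and the union layout are the reviewer's suggestions rather than code from
the patch, and sizeof(*cmd) is used because the command is copied by
value:]

#define INVALID_QUEUE_SIZE      4096
#define INVALID_QUEUE_MASK      (INVALID_QUEUE_SIZE - 1)

struct redirect_queue {
        int                     node;
        union {
                u64                     base;
                struct irde_inv_cmd     *cmds;
        };
        u32                     max_size;
        raw_spinlock_t          lock;
};

static void invalid_enqueue(struct redirect_queue *rqueue, struct irde_inv_cmd *cmd)
{
        u32 tail;

        guard(raw_spinlock_irqsave)(&rqueue->lock);

        while (invalid_queue_is_full(rqueue->node, &tail))
                cpu_relax();

        memcpy(&rqueue->cmds[tail], cmd, sizeof(*cmd));

        /*
         * Serialize against the hardware: the command write must be
         * visible before the tail register update can be observed.
         */
        wmb();

        write_queue_tail(rqueue->node, (tail + 1) & INVALID_QUEUE_MASK);
}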
Hi, Thomas

On 2025/6/13 at 11:20 PM, Thomas Gleixner wrote:
> On Tue, Jun 10 2025 at 19:42, Tianyang Zhang wrote:
>>     if (cpu_online(adata->cpu) && cpumask_test_cpu(adata->cpu, dest))
>> -           return 0;
>> +           /*
>> +            * The new affinity configuration has taken effect,
>> +            * and returning IRQ_SET_MASK_OK_DONE here indicates that no further
>> +            * changes need to be made in the subsequent process
> This is not what IRQ_SET_MASK_OK_DONE is about. The documentation
> clearly says:
>
>  * IRQ_SET_MASK_OK_DONE  - Same as IRQ_SET_MASK_OK for core. Special code to
>  *                         support stacked irqchips, which indicates skipping
>  *                         all descendant irqchips.
>
> It's not about further changes. It's about preventing invoking
> set_affinity() callbacks down the hierarchy.
>
> And you still fail to explain why this change is correct for the
> existing code. That explanation wants to be in the changelog of the
> separate patch doing this change.
>
> And then you can spare the comment, which is btw. also violating the
> bracket rules in the tip maintainers documentation.

OK. This means that this change involves adjustments to the code
organization and cannot be included solely in one large patch; it
requires a clearer submission record and explanation. If that's the
case, I will make the necessary modifications as requested.

Thank you

>
>
>> +            */
>> +           return IRQ_SET_MASK_OK_DONE;
>>
>>     cpumask_and(&intersect_mask, dest, cpu_online_mask);
>>
>> @@ -121,7 +116,8 @@ static int avecintc_set_affinity(struct irq_data *data, const struct cpumask *de
>>             adata->cpu = cpu;
>>             adata->vec = vector;
>>             per_cpu_ptr(irq_map, adata->cpu)[adata->vec] = irq_data_to_desc(data);
>> -           avecintc_sync(adata);
>> +           if (!cpu_has_redirectint)
>> +                   avecintc_sync(adata);
>>     }
>>
>>     irq_data_update_effective_affinity(data, cpumask_of(cpu));
>> @@ -412,6 +408,9 @@ static int __init pch_msi_parse_madt(union acpi_subtable_headers *header,
>>
>>  static inline int __init acpi_cascade_irqdomain_init(void)
>>  {
>> +   if (cpu_has_redirectint)
>> +           return redirect_acpi_init(loongarch_avec.domain);
>> +
>>     return acpi_table_parse_madt(ACPI_MADT_TYPE_MSI_PIC, pch_msi_parse_madt, 1);
>>  }
>>
>> diff --git a/drivers/irqchip/irq-loongarch-ir.c b/drivers/irqchip/irq-loongarch-ir.c
>> new file mode 100644
>> index 000000000000..ae42ec5028d2
>> --- /dev/null
>> +++ b/drivers/irqchip/irq-loongarch-ir.c
>> @@ -0,0 +1,562 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2020 Loongson Technologies, Inc.
>> + */
>> +
>> +#include <linux/cpuhotplug.h>
>> +#include <linux/init.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/kernel.h>
>> +#include <linux/irq.h>
>> +#include <linux/irqchip.h>
>> +#include <linux/irqdomain.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/msi.h>
>> +
>> +#include <asm/irq.h>
>> +#include <asm/loongarch.h>
>> +#include <asm/setup.h>
>> +#include <larchintrin.h>
>> +
>> +#include "irq-loongson.h"
>> +#include "irq-msi-lib.h"
>> +
>> +#define IRD_ENTRIES 65536
>> +
>> +/* redirect entry size 128bits */
>> +#define IRD_PAGE_ORDER (20 - PAGE_SHIFT)
>> +
>> +/* irt cache invalid queue */
>> +#define INVALID_QUEUE_SIZE 4096
> Use SPACE after #define, not TAB. All over the place...

OK, I got it

>
>> +#define INVALID_QUEUE_PAGE_ORDER (16 - PAGE_SHIFT)
> I'm pretty sure that the above magic numbers have dependencies in some
> way, right? So why is it not expressed in a way which makes this obvious?
>
> These magic numbers are just incomprehensible as they make the reader
> guess what this is about.
> Here is my (probably not so) wild guess:
>
> #define IRD_ENTRY_SIZE                  16
>
> #define IRD_ENTRIES                     65536
> #define IRD_PAGE_ORDER                  get_order(IRD_ENTRIES * IRD_ENTRY_SIZE)
>
> #define INVALID_QUEUE_SIZE              4096
> #define IRD_INVALID__QUEUE_PAGE_ORDER   get_order(INVALID_QUEUE_SIZE * IRD_ENTRY_SIZE)
>
> No?

Yes, this is a better way, and I will pay attention to improving the
readability of the code in the future

>
> GENMASK()

Ok, I got it, thanks

>
>> +#define CFG_DISABLE_IDLE 2
>> +#define INVALID_INDEX 0
>> +
>> +#define MAX_IR_ENGINES 16
>
>> +struct redirect_gpid {
>> +    u64 pir[4]; /* Pending interrupt requested */
>> +    u8  en : 1, /* doorbell */
> Use C++ style tail comments for this as documented. Do I really have
> to point out every single item in the documentation explicitly or can
> you just read all of it and follow the guidelines?

Okay, that came from referencing some existing code... I will reorganize
all code style issues according to the documentation

>
>> +struct irde_desc {
>> +    struct redirect_table ird_table;
>> +    struct redirect_queue inv_queue;
>> +    int finish;
> Groan.
>
> "Struct declarations should align the struct member names in a tabular fashion:
>
> struct bar_order {
>         unsigned int            guest_id;
>         int                     ordered_item;
>         struct menu             *menu;
> };"
>
> It clearly says to align the struct member names, no?
>
> Otherwise the example would be:
>
> struct bar_order {
>         unsigned int guest_id;
>         int ordered_item;
>         struct menu *menu;
> };
>
> which is unreadable garbage obviously.

Ok, I will adjust here and check for other similar issues

>
>> +static void invalid_enqueue(struct redirect_queue *rqueue, struct irde_inv_cmd *cmd)
>> +{
>> +    struct irde_inv_cmd *inv_addr;
>> +    u32 tail;
>> +
>> +    guard(raw_spinlock_irqsave)(&rqueue->lock);
>> +
>> +    while (invalid_queue_is_full(rqueue->node, &tail))
>> +            cpu_relax();
>> +
>> +    inv_addr = (struct irde_inv_cmd *)(rqueue->base + tail * sizeof(struct irde_inv_cmd));
>> +    memcpy(inv_addr, cmd, sizeof(struct irde_inv_cmd));
> Seriously?
>
> struct redirect_queue {
>         int                     node;
>         union {
>                 u64                     base;
>                 struct irde_inv_cmd     *cmds;
>         };
>         u32                     max_size;
>         ...
> };
>
> and then this becomes
>
>         memcpy(&rqueue->cmds[tail], cmd, sizeof(*cmd));
>
> That's too comprehensible, right?

Yes, this is a more readable writing style, thank you

>
>> +    tail = (tail + 1) % INVALID_QUEUE_SIZE;
> Why is this before the barrier? The barrier does not do anything about
> this and you can simplify this. See below.
>
> And as there is no rmb() counterpart you want to explain that this is
> serializing against the hardware.
>> +     */
>> +    wmb();
>> +
>> +    write_queue_tail(rqueue->node, tail);
> write_queue_tail(rqueue->node, (tail + 1) & INVALID_QUEUE_MASK);
>
> No?

The reason for coding it this way is that during testing, it was found
that a barrier is needed between the update of the temporary variable
'tail' and the register operation in 'write_queue_tail', otherwise
write_queue_tail will probabilistically fail to obtain the correct value.

We will conduct testing on the above methods, and if possible, make
changes as required, or promote improvements in code readability

thanks

>
>> +}
>> +
>> +static void irde_invlid_entry_node(struct redirect_item *item)
>> +{
>> +    struct redirect_queue *rqueue;
>> +    struct irde_inv_cmd cmd;
>> +    volatile u64 raddr = 0;
> No.
> See Documentation/process/volatile-considered-harmful.rst

Ok, I will follow the convention here

>
>> +static void redirect_table_free(struct redirect_item *item)
>> +{
>> +    struct redirect_table *ird_table;
>> +    struct redirect_entry *entry;
>> +
>> +    ird_table = item->table;
>> +
>> +    entry = item->entry;
>> +    memset(entry, 0, sizeof(struct redirect_entry));
> memset(entry, 0, sizeof(*entry));
>
> It's obvious why using sizeof(type) is a bad idea.

Ok, I got it

>
>> +    scoped_guard(raw_spinlock_irqsave, &ird_table->lock)
> raw_spinlock_irq as this code can never be invoked from an interrupt
> disabled region.

Ok, I will make the necessary corrections here

>
>> +            bitmap_release_region(ird_table->bitmap, item->index, 0);
>> +
>> +    kfree(item->gpid);
>> +
>> +    irde_invlid_entry_node(item);
>> +}

>> +static inline void redirect_domain_prepare_entry(struct redirect_item *item,
>> +                                            struct avecintc_data *adata)
>> +{
>> +    struct redirect_entry *entry = item->entry;
>> +
>> +    item->gpid->en = 1;
>> +    item->gpid->irqnum = adata->vec;
>> +    item->gpid->dst = adata->cpu;
>> +
>> +    entry->lo.valid = 1;
>> +    entry->lo.gpid = ((long)item->gpid >> GPID_ADDR_SHIFT) & (GPID_ADDR_MASK);
> Hardware addresses are type unsigned long and not long.

Ok, I got it

>
>> +    entry->lo.vector = 0xff;
>> +}

>> +static void redirect_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
>> +{
>> +    struct redirect_item *item;
>> +
>> +    item = irq_data_get_irq_chip_data(d);
> Just move the initialization into the declaration line.

Ok, I got it

>
>> +    if (irq_data && irq_data->chip_data) {
>> +            struct redirect_item *item;
>> +
>> +            item = irq_data->chip_data;
> Same and all over the place.
>
>> +            redirect_table_free(item);
>> +            kfree(item);
>> +    }
>> +    }
>> +}
>> +
>> +static int redirect_domain_alloc(struct irq_domain *domain, unsigned int virq,
>> +            unsigned int nr_irqs, void *arg)
> I asked you before to align the arguments of the second line properly
> and according to documentation.

Sorry, I have reconfirmed the modification records and found that it was
fixed in the first temporary version. However, the modification was lost
during several testing environment migrations in the middle. I will
adjust my code organization process

>
>> +{
>> +    struct redirect_table *ird_table;
>> +    struct avecintc_data *avec_data;
>> +    struct irq_data *irq_data;
>> +    msi_alloc_info_t *info;
>> +    int ret, i, node;
>> +
>> +    info = (msi_alloc_info_t *)arg;
> What's that type cast for?

Emm... an oversight during the code modification process

>
>> +    node = dev_to_node(info->desc->dev);
>> +    ird_table = &irde_descs[node].ird_table;
>> +
>> +    ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
>> +    if (ret < 0)
>> +            return ret;
>> +
>> +    for (i = 0; i < nr_irqs; i++) {
>> +            struct redirect_item *item;
>> +
>> +            item = kzalloc(sizeof(struct redirect_item), GFP_KERNEL);
>> +            if (!item) {
>> +                    pr_err("Alloc redirect descriptor failed\n");
>> +                    goto out_free_resources;
>> +            }
>> +
>> +            irq_data = irq_domain_get_irq_data(domain, virq + i);
>> +
>> +            avec_data = irq_data_get_avec_data(irq_data);
> Neither irq_data nor avec_data are used here and only required way
> down. Can you structure your code so it makes sense?
Ok, I will follow your advice

>
>> +            ret = redirect_table_alloc(item, ird_table);
>> +            if (ret) {
>> +                    pr_err("Alloc redirect table entry failed\n");
>> +                    goto out_free_resources;
>> +            }
>> +
>> +            item->gpid = kzalloc_node(sizeof(struct redirect_gpid), GFP_KERNEL, node);
>> +            if (!item->gpid) {
>> +                    pr_err("Alloc redirect GPID failed\n");
>> +                    goto out_free_resources;
>> +            }
> Why do you need this extra allocation here instead of simply embedding
> gpid into item?

Ok, I got it, thanks

>> +            irq_data->chip_data = item;
>> +            irq_data->chip = &loongarch_redirect_chip;
>> +            redirect_domain_prepare_entry(item, avec_data);
>> +    }
>> +    return 0;

>> +    iocsr_write64(((rqueue->base & (CQB_ADDR_MASK << CQB_ADDR_SHIFT)) |
>> +                            (CQB_SIZE_MASK << CQB_SIZE_SHIFT)), LOONGARCH_IOCSR_REDIRECT_CQB);
> Align second line properly.

Ok, I got it, thanks

>
>> +    return 0;
>> +}
>> +
>> +static int redirect_table_init(int node)
>> +{
>> +    struct redirect_table *ird_table = &(irde_descs[node].ird_table);
> Remove the pointless brackets.

Ok, I got it, thanks

>
>> +    unsigned long *bitmap;
>> +    struct page *pages;
>> +    int ret;
>> +
>> +    pages = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, IRD_PAGE_ORDER);
>> +    if (!pages) {
>> +            pr_err("Node [%d] redirect table alloc pages failed!\n", node);
>> +            return -ENOMEM;
>> +    }
>> +    ird_table->page = pages;
>> +    ird_table->table = page_address(pages);
>> +
>> +    bitmap = bitmap_zalloc(IRD_ENTRIES, GFP_KERNEL);
>> +    if (!bitmap) {
>> +            pr_err("Node [%d] redirect table bitmap alloc pages failed!\n", node);
>> +            ret = -ENOMEM;
>> +            goto free_pages;
>> +    }
>> +
>> +    ird_table->bitmap = bitmap;
>> +    ird_table->nr_ird = IRD_ENTRIES;
>> +    ird_table->node = node;
>> +
>> +    raw_spin_lock_init(&ird_table->lock);
>> +
>> +    if (redirect_queue_init(node)) {
>> +            ret = -EINVAL;
>> +            goto free_bitmap;
> So redirect_queue_init() returns -ENOMEM which is then converted to
> -EINVAL here. That makes absolutely no sense at all.
>
> Neither makes the 'ret' variable sense because all failures should
> return -ENOMEM and therefore you can make redirect_queue_init() return
> bool (true on success) and return -ENOMEM in the failure path.
>
> No?

Yes, it is a more reasonable way to do this

>> +static void redirect_table_fini(int node)
>> +{
>> +    struct redirect_table *ird_table = &(irde_descs[node].ird_table);
>> +    struct redirect_queue *rqueue = &(irde_descs[node].inv_queue);
> No brackets. They have no value and just disturb reading.

Ok, I got it, thanks

>
>> +static int redirect_cpu_online(unsigned int cpu)
>> +{
>> +    int ret, node = cpu_to_node(cpu);
>> +
>> +    if (cpu != cpumask_first(cpumask_of_node(node)))
>> +            return 0;
>> +
>> +    if (irde_descs[node].finish)
>> +            return 0;
> What's this finish thing about?
>
> Neither the CPU mask check nor this finish hack is required:
>
>        if (irde_descs[node].pages)
>                return 0;
>
> covers all of it, no?

Ok, I got it, thanks

>
>> +    ret = redirect_table_init(node);
>> +    if (ret) {
>> +            redirect_table_fini(node);
> What is this for? You already mopped up the mess in the failure path of
> redirect_table_init(), so doing it again makes no sense.
>
> Just get rid of the failure path in redirect_table_init() and let that
> return a bool (true on success) and invoke redirect_table_fini() here

There may have been a logical confusion in the version iteration here,
and I will reorganize this part of the code, thanks

>
>> +            return -EINVAL;
> Seriously? The failure condition is -ENOMEM so why do you want to change
> that?
Ok, returning ret is enough there, thanks

>> +    }
>> +
>> +    irde_descs[node].finish = 1;
>> +    return 0;
>> +}
>> +
>> +#ifdef CONFIG_ACPI
> What's the TAB for?

Ok, I got it, thanks

>> +int __init redirect_acpi_init(struct irq_domain *parent)
>> +{
>> +    struct fwnode_handle *fwnode;
>> +    struct irq_domain *domain;
>> +    int ret;
>> +
>> +    fwnode = irq_domain_alloc_named_fwnode("redirect");
>> +    if (!fwnode) {
>> +            pr_err("Unable to alloc redirect domain handle\n");
>> +            goto fail;
>> +    }
>> +
>> +    domain = irq_domain_create_hierarchy(parent, 0, IRD_ENTRIES, fwnode,
>> +                                         &redirect_domain_ops, irde_descs);
>> +    if (!domain) {
>> +            pr_err("Unable to alloc redirect domain\n");
>> +            goto out_free_fwnode;
>> +    }
>> +
>> +    redirect_domain = domain;
> What's the point of this local domain variable?

This is indeed meaningless, thanks

>
>> +    ret = redirect_table_init(0);
>> +    if (ret)
>> +            goto out_free_table;
>> +
> Oh well...

There may have been a logical confusion in the version iteration here,
and I will reorganize this part of the code, thanks

>
> Thanks,
>
> tglx

Thanks
Tianyang
On Mon, Jun 23 2025 at 10:45, Tianyang Zhang wrote:
> On 2025/6/13 at 11:20 PM, Thomas Gleixner wrote:
>> On Tue, Jun 10 2025 at 19:42, Tianyang Zhang wrote:
>>
>>> +   tail = (tail + 1) % INVALID_QUEUE_SIZE;
>> Why is this before the barrier? The barrier does not do anything about
>> this and you can simplify this. See below.
>>
>> And as there is no rmb() counterpart you want to explain that this is
>> serializing against the hardware.
>>> +    */
>>> +   wmb();
>>> +
>>> +   write_queue_tail(rqueue->node, tail);
>> write_queue_tail(rqueue->node, (tail + 1) & INVALID_QUEUE_MASK);
>>
>> No?
>
> The reason for coding it this way is that during testing, it was found
> that a barrier is needed between the update of the temporary variable
> 'tail' and the register operation in 'write_queue_tail', otherwise
> write_queue_tail will probabilistically fail to obtain the correct
> value.

How so?

tail is the software managed part of the ringbuffer which is shared with
the hardware, right?

So even if the compiler would be allowed to reevaluate tail after the
barrier (it is NOT), then tail would still contain the same value as
before, no?

The wmb() is required to ensure that the hardware can observe the full
write of the command _before_ it can observe the update to the tail
index.

Anything else is voodoo.

Thanks,

        tglx
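[Aside: spelled out as a comment, the ordering requirement described here
looks like this; illustrative only, not code from either patch:]

/*
 * CPU (producer)                       IR engine (consumer)
 *
 * (1) write cmds[tail]
 *     wmb()
 * (2) write CQT = tail + 1    ---->    observes new CQT
 *                                      reads cmds[head..tail]  needs (1)
 *
 * wmb() only guarantees that (1) is visible before (2); without it the
 * engine could observe the new tail and dequeue a half-written command.
 */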
Hi, Thomas

On 2025/6/23 at 4:27 PM, Thomas Gleixner wrote:
> On Mon, Jun 23 2025 at 10:45, Tianyang Zhang wrote:
>> On 2025/6/13 at 11:20 PM, Thomas Gleixner wrote:
>>> On Tue, Jun 10 2025 at 19:42, Tianyang Zhang wrote:
>>>
>>>> +  tail = (tail + 1) % INVALID_QUEUE_SIZE;
>>> Why is this before the barrier? The barrier does not do anything about
>>> this and you can simplify this. See below.
>>>
>>> And as there is no rmb() counterpart you want to explain that this is
>>> serializing against the hardware.
>>>> +   */
>>>> +  wmb();
>>>> +
>>>> +  write_queue_tail(rqueue->node, tail);
>>> write_queue_tail(rqueue->node, (tail + 1) & INVALID_QUEUE_MASK);
>>>
>>> No?
>> The reason for coding it this way is that during testing, it was found
>> that a barrier is needed between the update of the temporary variable
>> 'tail' and the register operation in 'write_queue_tail', otherwise
>> write_queue_tail will probabilistically fail to obtain the correct
>> value.
> How so?
>
> tail is the software managed part of the ringbuffer which is shared with
> the hardware, right?
>
> So even if the compiler would be allowed to reevaluate tail after the
> barrier (it is NOT), then tail would still contain the same value as
> before, no?
>
> The wmb() is required to ensure that the hardware can observe the full
> write of the command _before_ it can observe the update to the tail
> index.
>
> Anything else is voodoo.
>
> Thanks,
>
> tglx

In my previous understanding, tail's value is actually a part of the
'full write of the command'. We must ensure that tail is updated to the
correct value first, and then write this value into the register
(perhaps by adding wmb in write_queue_tail). In other words, this is
originally to prevent the write register instruction from being
executed out of order before updating tail.

The above is just my personal understanding

Thanks
Tianyang
On Mon, Jun 23 2025 at 17:33, Tianyang Zhang wrote:
> On 2025/6/23 at 4:27 PM, Thomas Gleixner wrote:
>> tail is the software managed part of the ringbuffer which is shared with
>> the hardware, right?
>>
>> So even if the compiler would be allowed to reevaluate tail after the
>> barrier (it is NOT), then tail would still contain the same value as
>> before, no?
>>
>> The wmb() is required to ensure that the hardware can observe the full
>> write of the command _before_ it can observe the update to the tail
>> index.
>>
>> Anything else is voodoo.
>>
>> Thanks,
>>
>> tglx
>
> In my previous understanding, tail's value is actually a part of the
> 'full write of the command'.

Of course. The hardware observes the tail value. If it is not updated
then the command is not executed. But these are two distinct things:

The invalidate command is written to a location in the command buffer
which is determined by tail:

    inv_addr = (struct irde_inv_cmd *)(rqueue->base + tail * sizeof(struct irde_inv_cmd));
    memcpy(inv_addr, cmd, sizeof(struct irde_inv_cmd));

rqueue::base points to an array of invalidate commands. The array
size is INVALID_QUEUE_SIZE. tail is the position in the array to which
the software writes the next command. tail is software managed and
written to a completely different location via write_queue_tail(...):

    static phys_addr_t redirect_reg_base = 0x1fe00000;

    #define REDIRECT_REG_BASE(reg, node) \
            (UNCACHE_BASE | redirect_reg_base | (u64)(node) << NODE_ADDRSPACE_SHIFT | (reg))
    #define redirect_reg_queue_tail(node) REDIRECT_REG_BASE(LOONGARCH_IOCSR_REDIRECT_CQT, (node))
    #define read_queue_tail(node) (*((u32 *)(redirect_reg_queue_tail(node))))
    #define write_queue_tail(node, val) (*((u32 *)(redirect_reg_queue_tail(node))) = (val))

The hardware maintains the head index. It's the last command index the
hardware processed. When the hardware observes that head != tail then it
processes the next entry and after completion it updates head with the
next index. This repeats until head == tail.

> We must ensure that tail is updated to the correct value first, and
> then write this value into the register (perhaps by adding wmb in
> write_queue_tail).

No. The local variable 'tail' is purely local to the CPU and the
invalidation hardware does not even know that it exists.

There are two things which are relevant to the hardware:

  1) command is written to the hardware visible array at index of tail

  2) hardware visible tail memory (register) is updated to tail + 1

The memory barrier is required to prevent that #2 is written to the
hardware _before_ #1 completed and is fully visible to the hardware.

> In other words, this is originally to prevent the write register
> instruction from being executed out of order before updating tail.

No. The barrier is solely for the above #1 vs. #2 ordering.

There is a difference between program flow ordering and memory ordering.

The hardware _CANNOT_ execute the write _before_ the value it writes is
computed. That's enforced by program flow order.

So it's always guaranteed that the CPU executes

    tail + 1

_before_ executing the write to the register because that's a program
order dependency.

If that would not be guaranteed, then your CPU would have more serious
problems than this piece of code. It simply would not even survive the
first five instructions in the boot loader.
To ensure that ordering you need a WMB(). If you would have CPU to CPU communication then you would need a RMB() on the reader side to ensure that the command is not read before the tail. In your case the hardware side takes care of that. > The above is just my personal understanding Which might need some slight adjustments :) Thanks, tglx
Hi, Thomas

On 2025/6/24 at 4:05 AM, Thomas Gleixner wrote:
> On Mon, Jun 23 2025 at 17:33, Tianyang Zhang wrote:
>> On 2025/6/23 at 4:27 PM, Thomas Gleixner wrote:
>>> tail is the software managed part of the ringbuffer which is shared with
>>> the hardware, right?
>>>
>>> So even if the compiler would be allowed to reevaluate tail after the
>>> barrier (it is NOT), then tail would still contain the same value as
>>> before, no?
>>>
>>> The wmb() is required to ensure that the hardware can observe the full
>>> write of the command _before_ it can observe the update to the tail
>>> index.
>>>
>>> Anything else is voodoo.
>>>
>>> Thanks,
>>>
>>> tglx
>> In my previous understanding, tail's value is actually a part of the
>> 'full write of the command'.
> Of course. The hardware observes the tail value. If it is not updated
> then the command is not executed. But these are two distinct things:
>
> The invalidate command is written to a location in the command buffer
> which is determined by tail:
>
>     inv_addr = (struct irde_inv_cmd *)(rqueue->base + tail * sizeof(struct irde_inv_cmd));
>     memcpy(inv_addr, cmd, sizeof(struct irde_inv_cmd));
>
> rqueue::base points to an array of invalidate commands. The array
> size is INVALID_QUEUE_SIZE. tail is the position in the array to which
> the software writes the next command. tail is software managed and
> written to a completely different location via write_queue_tail(...):
>
>     static phys_addr_t redirect_reg_base = 0x1fe00000;
>
>     #define REDIRECT_REG_BASE(reg, node) \
>             (UNCACHE_BASE | redirect_reg_base | (u64)(node) << NODE_ADDRSPACE_SHIFT | (reg))
>     #define redirect_reg_queue_tail(node) REDIRECT_REG_BASE(LOONGARCH_IOCSR_REDIRECT_CQT, (node))
>     #define read_queue_tail(node) (*((u32 *)(redirect_reg_queue_tail(node))))
>     #define write_queue_tail(node, val) (*((u32 *)(redirect_reg_queue_tail(node))) = (val))
>
> The hardware maintains the head index. It's the last command index the
> hardware processed. When the hardware observes that head != tail then it
> processes the next entry and after completion it updates head with the
> next index. This repeats until head == tail.
>
>> We must ensure that tail is updated to the correct value first, and
>> then write this value into the register (perhaps by adding wmb in
>> write_queue_tail).
> No. The local variable 'tail' is purely local to the CPU and the
> invalidation hardware does not even know that it exists.
>
> There are two things which are relevant to the hardware:
>
>   1) command is written to the hardware visible array at index of tail
>
>   2) hardware visible tail memory (register) is updated to tail + 1
>
> The memory barrier is required to prevent that #2 is written to the
> hardware _before_ #1 completed and is fully visible to the hardware.
>
>> In other words, this is originally to prevent the write register
>> instruction from being executed out of order before updating tail.
> No. The barrier is solely for the above #1 vs. #2 ordering.
>
> There is a difference between program flow ordering and memory ordering.
>
> The hardware _CANNOT_ execute the write _before_ the value it writes is
> computed. That's enforced by program flow order.
>
> So it's always guaranteed that the CPU executes
>
>     tail + 1
>
> _before_ executing the write to the register because that's a program
> order dependency.
>
> If that would not be guaranteed, then your CPU would have more serious
> problems than this piece of code. It simply would not even survive the
> first five instructions in the boot loader.
>
> But what's not guaranteed is that consecutive writes become visible to
> other parts of the system (other CPUs, the invalidation hardware, ....)
> in the same consecutive order.
>
> To ensure that ordering you need a WMB(). If you would have CPU to CPU
> communication then you would need a RMB() on the reader side to ensure
> that the command is not read before the tail. In your case the hardware
> side takes care of that.
>
>> The above is just my personal understanding
> Which might need some slight adjustments :)
>
> Thanks,
>
> tglx

Hmm... it seems I confused program-flow ordering and memory ordering; my
understanding of the previous issue may not have been right. Your reply
has taught me a great deal; I truly appreciate it.

Thank you very much
Tianyang