[Xen-devel] [PATCH v4 00/13] x86: IRQ management adjustments
Posted by Jan Beulich 4 years, 9 months ago
First and foremost this series is trying to deal with CPU offlining
issues, which have become more prominent with the recently
added SMT enable/disable operation in xen-hptool. Later patches
in the series then carry out more or less unrelated changes
(hopefully improvements) noticed while looking at various pieces
of involved code.

01: deal with move-in-progress state in fixup_irqs()
02: deal with move cleanup count state in fixup_irqs()
03: desc->affinity should strictly represent the requested value
04: consolidate use of ->arch.cpu_mask
05: fix locking around vector management
06: x86/IOMMU: don't restrict IRQ affinities to online CPUs
07: target online CPUs when binding guest IRQ
08: correct/tighten vector check in _clear_irq_vector()
09: make fixup_irqs() skip unconnected internally used interrupts
10: drop redundant cpumask_empty() from move_masked_irq()
11: tighten vector checks
12: eliminate some on-stack cpumask_t instances
13: move {,_}clear_irq_vector()

In principle patches 1-7 and maybe 9 are backporting candidates.
Their intrusive nature makes wanting to do so questionable, though.

For v4 specific information please see the individual patches.

Full set of patches attached here due to still unresolved email
issues over here.

Jan
x86/IRQ: deal with move-in-progress state in fixup_irqs()

The flag being set may prevent affinity changes, as these often imply
assignment of a new vector. When there's no possible destination left
for the IRQ, the clearing of the flag needs to happen right from
fixup_irqs().

Additionally _assign_irq_vector() needs to avoid setting the flag when
there's no online CPU left in what gets put into ->arch.old_cpu_mask.
The old vector can be released right away in this case.

Also extend the log message about broken affinity to include the new
affinity as well, making it possible to notice issues with affinity
changes not actually having taken place. Swap the if/else-if order there
at the same time to reduce the number of conditions checked.

At the same time replace two open-coded instances with the new helper
function.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
v4: Use cpumask_bits() in printk() invocation. Re-base.
v3: Move release_old_vec() further up (so a later patch won't need to).
    Re-base.
v2: Add/use valid_irq_vector().
v1b: Also update vector_irq[] in the code added to fixup_irqs().
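
As an illustration only of the new behaviour described above, here is a
minimal standalone C sketch (not the Xen code itself; plain bitmasks stand
in for cpumask_t and the values are made up):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Simplified model of the decision added to the vector assignment path:
     * bit i of a mask stands for CPU i.  If none of the previously targeted
     * CPUs are still online, no cleanup IPI can ever be handled, so the old
     * vector gets released right away instead of marking a move in progress.
     */
    static bool keep_move_in_progress(uint64_t old_cpu_mask, uint64_t online_map)
    {
        return (old_cpu_mask & online_map) != 0;
    }

    int main(void)
    {
        /* Old vector lived on CPUs 1-2; CPUs 0-3 are online. */
        printf("still online -> move_in_progress = %d\n",
               keep_move_in_progress(0x6, 0xf));
        /* Old vector lived on CPUs 1-2; both have been parked/offlined. */
        printf("all offline  -> release old vector immediately = %d\n",
               !keep_move_in_progress(0x6, 0x9));
        return 0;
    }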

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -99,6 +99,27 @@ void unlock_vector_lock(void)
     spin_unlock(&vector_lock);
 }
 
+static inline bool valid_irq_vector(unsigned int vector)
+{
+    return vector >= FIRST_DYNAMIC_VECTOR && vector <= LAST_HIPRIORITY_VECTOR;
+}
+
+static void release_old_vec(struct irq_desc *desc)
+{
+    unsigned int vector = desc->arch.old_vector;
+
+    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
+    cpumask_clear(desc->arch.old_cpu_mask);
+
+    if ( !valid_irq_vector(vector) )
+        ASSERT_UNREACHABLE();
+    else if ( desc->arch.used_vectors )
+    {
+        ASSERT(test_bit(vector, desc->arch.used_vectors));
+        clear_bit(vector, desc->arch.used_vectors);
+    }
+}
+
 static void _trace_irq_mask(uint32_t event, int irq, int vector,
                             const cpumask_t *mask)
 {
@@ -295,14 +316,7 @@ static void __clear_irq_vector(int irq)
         per_cpu(vector_irq, cpu)[old_vector] = ~irq;
     }
 
-    desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
-    cpumask_clear(desc->arch.old_cpu_mask);
-
-    if ( desc->arch.used_vectors )
-    {
-        ASSERT(test_bit(old_vector, desc->arch.used_vectors));
-        clear_bit(old_vector, desc->arch.used_vectors);
-    }
+    release_old_vec(desc);
 
     desc->arch.move_in_progress = 0;
 }
@@ -527,12 +541,21 @@ next:
         /* Found one! */
         current_vector = vector;
         current_offset = offset;
-        if (old_vector > 0) {
-            desc->arch.move_in_progress = 1;
-            cpumask_copy(desc->arch.old_cpu_mask, desc->arch.cpu_mask);
+
+        if ( old_vector > 0 )
+        {
+            cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask,
+                        &cpu_online_map);
             desc->arch.old_vector = desc->arch.vector;
+            if ( !cpumask_empty(desc->arch.old_cpu_mask) )
+                desc->arch.move_in_progress = 1;
+            else
+                /* This can happen while offlining a CPU. */
+                release_old_vec(desc);
         }
+
         trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
+
         for_each_cpu(new_cpu, &tmp_mask)
             per_cpu(vector_irq, new_cpu)[vector] = irq;
         desc->arch.vector = vector;
@@ -702,14 +725,8 @@ void irq_move_cleanup_interrupt(struct c
 
         if ( desc->arch.move_cleanup_count == 0 )
         {
-            desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED;
-            cpumask_clear(desc->arch.old_cpu_mask);
-
-            if ( desc->arch.used_vectors )
-            {
-                ASSERT(test_bit(vector, desc->arch.used_vectors));
-                clear_bit(vector, desc->arch.used_vectors);
-            }
+            ASSERT(vector == desc->arch.old_vector);
+            release_old_vec(desc);
         }
 unlock:
         spin_unlock(&desc->lock);
@@ -2409,6 +2426,33 @@ void fixup_irqs(const cpumask_t *mask, b
             continue;
         }
 
+        /*
+         * In order for the affinity adjustment below to be successful, we
+         * need __assign_irq_vector() to succeed. This in particular means
+         * clearing desc->arch.move_in_progress if this would otherwise
+         * prevent the function from succeeding. Since there's no way for the
+         * flag to get cleared anymore when there's no possible destination
+         * left (the only possibility then would be the IRQs enabled window
+         * after this loop), there's then also no race with us doing it here.
+         *
+         * Therefore the logic here and there need to remain in sync.
+         */
+        if ( desc->arch.move_in_progress &&
+             !cpumask_intersects(mask, desc->arch.cpu_mask) )
+        {
+            unsigned int cpu;
+
+            cpumask_and(&affinity, desc->arch.old_cpu_mask, &cpu_online_map);
+
+            spin_lock(&vector_lock);
+            for_each_cpu(cpu, &affinity)
+                per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
+            spin_unlock(&vector_lock);
+
+            release_old_vec(desc);
+            desc->arch.move_in_progress = 0;
+        }
+
         cpumask_and(&affinity, &affinity, mask);
         if ( cpumask_empty(&affinity) )
         {
@@ -2427,15 +2471,18 @@ void fixup_irqs(const cpumask_t *mask, b
         if ( desc->handler->enable )
             desc->handler->enable(desc);
 
+        cpumask_copy(&affinity, desc->affinity);
+
         spin_unlock(&desc->lock);
 
         if ( !verbose )
             continue;
 
-        if ( break_affinity && set_affinity )
-            printk("Broke affinity for irq %i\n", irq);
-        else if ( !set_affinity )
-            printk("Cannot set affinity for irq %i\n", irq);
+        if ( !set_affinity )
+            printk("Cannot set affinity for IRQ%u\n", irq);
+        else if ( break_affinity )
+            printk("Broke affinity for IRQ%u, new: %*pb\n",
+                   irq, nr_cpu_ids, cpumask_bits(&affinity));
     }
 
     /* That doesn't seem sufficient.  Give it 1ms. */
x86/IRQ: deal with move cleanup count state in fixup_irqs()

The cleanup IPI may get sent immediately before a CPU gets removed from
the online map. In such a case the IPI would get handled on the CPU
being offlined no earlier than in the interrupts disabled window after
fixup_irqs()' main loop. This is too late, however, because a possible
affinity change may incur the need for vector assignment, which will
fail when the IRQ's move cleanup count is still non-zero.

To fix this
- record the set of CPUs the cleanup IPI actually gets sent to alongside
  setting their count,
- adjust the count in fixup_irqs(), accounting for all CPUs that the
  cleanup IPI was sent to, but that are no longer online,
- bail early from the cleanup IPI handler when the CPU is no longer
  online, to prevent double accounting.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
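
The count adjustment from the list above can be pictured with a tiny
standalone model (not the actual Xen code; plain bitmasks stand in for
cpumask_t and the values are invented):

    #include <stdint.h>
    #include <stdio.h>

    /* Count the set bits of a mask (bit i stands for CPU i). */
    static unsigned int weight(uint64_t mask)
    {
        unsigned int n = 0;

        for ( ; mask; mask &= mask - 1 )
            ++n;

        return n;
    }

    int main(void)
    {
        uint64_t ipi_targets = 0x0c;   /* cleanup IPI sent to CPUs 2 and 3 */
        uint64_t online_map  = 0x0b;   /* CPU 2 got offlined before handling it */
        unsigned int move_cleanup_count = weight(ipi_targets);

        /* fixup_irqs(): discount targeted CPUs which are no longer online. */
        move_cleanup_count -= weight(ipi_targets & ~online_map);

        printf("cleanup acknowledgements still expected: %u\n",
               move_cleanup_count);   /* 1 (from CPU 3 only) */
        return 0;
    }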

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -675,6 +675,9 @@ void irq_move_cleanup_interrupt(struct c
     ack_APIC_irq();
 
     me = smp_processor_id();
+    if ( !cpu_online(me) )
+        return;
+
     for ( vector = FIRST_DYNAMIC_VECTOR;
           vector <= LAST_HIPRIORITY_VECTOR; vector++)
     {
@@ -735,11 +738,14 @@ unlock:
 
 static void send_cleanup_vector(struct irq_desc *desc)
 {
-    cpumask_t cleanup_mask;
+    cpumask_and(desc->arch.old_cpu_mask, desc->arch.old_cpu_mask,
+                &cpu_online_map);
+    desc->arch.move_cleanup_count = cpumask_weight(desc->arch.old_cpu_mask);
 
-    cpumask_and(&cleanup_mask, desc->arch.old_cpu_mask, &cpu_online_map);
-    desc->arch.move_cleanup_count = cpumask_weight(&cleanup_mask);
-    send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+    if ( desc->arch.move_cleanup_count )
+        send_IPI_mask(desc->arch.old_cpu_mask, IRQ_MOVE_CLEANUP_VECTOR);
+    else
+        release_old_vec(desc);
 
     desc->arch.move_in_progress = 0;
 }
@@ -2419,6 +2425,16 @@ void fixup_irqs(const cpumask_t *mask, b
              vector <= LAST_HIPRIORITY_VECTOR )
             cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
 
+        if ( desc->arch.move_cleanup_count )
+        {
+            /* The cleanup IPI may have got sent while we were still online. */
+            cpumask_andnot(&affinity, desc->arch.old_cpu_mask,
+                           &cpu_online_map);
+            desc->arch.move_cleanup_count -= cpumask_weight(&affinity);
+            if ( !desc->arch.move_cleanup_count )
+                release_old_vec(desc);
+        }
+
         cpumask_copy(&affinity, desc->affinity);
         if ( !desc->action || cpumask_subset(&affinity, mask) )
         {
x86/IRQ: desc->affinity should strictly represent the requested value

desc->arch.cpu_mask reflects the actual set of target CPUs. Don't ever
fiddle with desc->affinity itself, except to store caller requested
values. Note that assign_irq_vector() now takes a NULL incoming CPU mask
to mean "all CPUs" now, rather than just "all currently online CPUs".
This way no further affinity adjustment is needed after onlining further
CPUs.

This renders both set_native_irq_info() uses (which weren't using proper
locking anyway) redundant - drop the function altogether.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
v4: Use %*pbl.
---
TBD: To reduce the bad effect on the so far tabular output of the 'i'
     debug key, shifting the two affinity values further to the right
     may be worth considering.
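
The intended caller-visible semantics can be illustrated with a small
standalone sketch (recorded_affinity() is a made-up helper, not a Xen
function; a plain integer stands in for cpumask_t):

    #include <stdint.h>
    #include <stdio.h>

    #define ALL_CPUS UINT64_MAX   /* stand-in for cpumask_setall() */

    /*
     * Hypothetical helper mirroring what desc->affinity now records: the
     * caller-requested value, with NULL meaning "all CPUs" rather than
     * "all currently online CPUs".
     */
    static uint64_t recorded_affinity(const uint64_t *requested)
    {
        return requested ? *requested : ALL_CPUS;
    }

    int main(void)
    {
        uint64_t two_cpus = 0x3;   /* caller asked for CPUs 0 and 1 */

        printf("explicit mask -> affinity %#llx\n",
               (unsigned long long)recorded_affinity(&two_cpus));
        printf("NULL mask     -> affinity %#llx\n",
               (unsigned long long)recorded_affinity(NULL));
        return 0;
    }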

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1039,7 +1039,6 @@ static void __init setup_IO_APIC_irqs(vo
             SET_DEST(entry, logical, cpu_mask_to_apicid(TARGET_CPUS));
             spin_lock_irqsave(&ioapic_lock, flags);
             __ioapic_write_entry(apic, pin, 0, entry);
-            set_native_irq_info(irq, TARGET_CPUS);
             spin_unlock_irqrestore(&ioapic_lock, flags);
         }
     }
@@ -2248,7 +2247,6 @@ int io_apic_set_pci_routing (int ioapic,
 
     spin_lock_irqsave(&ioapic_lock, flags);
     __ioapic_write_entry(ioapic, pin, 0, entry);
-    set_native_irq_info(irq, TARGET_CPUS);
     spin_unlock(&ioapic_lock);
 
     spin_lock(&desc->lock);
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -589,11 +589,16 @@ int assign_irq_vector(int irq, const cpu
 
     spin_lock_irqsave(&vector_lock, flags);
     ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
-    if (!ret) {
+    if ( !ret )
+    {
         ret = desc->arch.vector;
-        cpumask_copy(desc->affinity, desc->arch.cpu_mask);
+        if ( mask )
+            cpumask_copy(desc->affinity, mask);
+        else
+            cpumask_setall(desc->affinity);
     }
     spin_unlock_irqrestore(&vector_lock, flags);
+
     return ret;
 }
 
@@ -2345,9 +2350,10 @@ static void dump_irqs(unsigned char key)
 
         spin_lock_irqsave(&desc->lock, flags);
 
-        printk("   IRQ:%4d aff:%*pb vec:%02x %-15s status=%03x ",
-               irq, nr_cpu_ids, cpumask_bits(desc->affinity), desc->arch.vector,
-               desc->handler->typename, desc->status);
+        printk("   IRQ:%4d aff:{%*pbl}/{%*pbl} vec:%02x %-15s status=%03x ",
+               irq, nr_cpu_ids, cpumask_bits(desc->affinity),
+               nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+               desc->arch.vector, desc->handler->typename, desc->status);
 
         if ( ssid )
             printk("Z=%-25s ", ssid);
@@ -2435,8 +2441,7 @@ void fixup_irqs(const cpumask_t *mask, b
                 release_old_vec(desc);
         }
 
-        cpumask_copy(&affinity, desc->affinity);
-        if ( !desc->action || cpumask_subset(&affinity, mask) )
+        if ( !desc->action || cpumask_subset(desc->affinity, mask) )
         {
             spin_unlock(&desc->lock);
             continue;
@@ -2469,12 +2474,13 @@ void fixup_irqs(const cpumask_t *mask, b
             desc->arch.move_in_progress = 0;
         }
 
-        cpumask_and(&affinity, &affinity, mask);
-        if ( cpumask_empty(&affinity) )
+        if ( !cpumask_intersects(mask, desc->affinity) )
         {
             break_affinity = true;
-            cpumask_copy(&affinity, mask);
+            cpumask_setall(&affinity);
         }
+        else
+            cpumask_copy(&affinity, desc->affinity);
 
         if ( desc->handler->disable )
             desc->handler->disable(desc);
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -162,11 +162,6 @@ extern irq_desc_t *domain_spin_lock_irq_
 extern irq_desc_t *pirq_spin_lock_irq_desc(
     const struct pirq *, unsigned long *pflags);
 
-static inline void set_native_irq_info(unsigned int irq, const cpumask_t *mask)
-{
-    cpumask_copy(irq_to_desc(irq)->affinity, mask);
-}
-
 unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *);
 
 #ifndef arch_hwdom_irqs
x86/IRQ: consolidate use of ->arch.cpu_mask

Mixed meaning was implied so far by different pieces of code -
disagreement was in particular about whether to expect offline CPUs'
bits to possibly be set. Switch to a mostly consistent meaning
(exception being high priority interrupts, which would perhaps better
be switched to the same model as well in due course). Use the field to
record the vector allocation mask, i.e. potentially including bits of
offline (parked) CPUs. This implies that before passing the mask to
certain functions (most notably cpu_mask_to_apicid()) it needs to be
further reduced to the online subset.

The exception of high priority interrupts is also why for the moment
_bind_irq_vector() is left as is, despite looking wrong: It's used
exclusively for IRQ0, which isn't supposed to move off CPU0 at any time.

The prior lack of restricting to online CPUs in set_desc_affinity()
before calling cpu_mask_to_apicid() in particular allowed (in x2APIC
clustered mode) offlined CPUs to end up enabled in an IRQ's destination
field. (I wonder whether vector_allocation_cpumask_flat() shouldn't
follow a similar model, using cpu_present_map in favor of
cpu_online_map.)

For IO-APIC code it was definitely wrong to potentially store, as a
fallback, TARGET_CPUS (i.e. all online ones) into the field, as that
would have caused problems when determining on which CPUs to release
vectors when they've gone out of use. Disable interrupts instead when
no valid target CPU can be established (which code elsewhere should
guarantee to never happen), and log a message in such an unlikely event.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v2: New.
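
The consumer-side rule this introduces can be sketched as follows (a
simplified standalone model with made-up masks, not the Xen code):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * desc->arch.cpu_mask may include offline (parked) CPUs, so it has to
     * be reduced to the online subset before deriving a destination, and
     * an empty intersection has to be treated as an error.
     */
    static bool online_destination(uint64_t cpu_mask, uint64_t online_map,
                                   uint64_t *dest)
    {
        *dest = cpu_mask & online_map;
        return *dest != 0;
    }

    int main(void)
    {
        uint64_t cpu_mask = 0x0f;   /* vector allocation mask: CPUs 0-3 */
        uint64_t online   = 0x05;   /* CPUs 1 and 3 are parked */
        uint64_t dest;

        if ( online_destination(cpu_mask, online, &dest) )
            printf("program destination from %#llx\n", (unsigned long long)dest);
        else
            printf("no valid target CPU - disable the IRQ\n");

        return 0;
    }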

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -680,7 +680,7 @@ void /*__init*/ setup_ioapic_dest(void)
                 continue;
             irq = pin_2_irq(irq_entry, ioapic, pin);
             desc = irq_to_desc(irq);
-            BUG_ON(cpumask_empty(desc->arch.cpu_mask));
+            BUG_ON(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map));
             set_ioapic_affinity_irq(desc, desc->arch.cpu_mask);
         }
 
@@ -2194,7 +2194,6 @@ int io_apic_set_pci_routing (int ioapic,
 {
     struct irq_desc *desc = irq_to_desc(irq);
     struct IO_APIC_route_entry entry;
-    cpumask_t mask;
     unsigned long flags;
     int vector;
 
@@ -2229,11 +2228,17 @@ int io_apic_set_pci_routing (int ioapic,
         return vector;
     entry.vector = vector;
 
-    cpumask_copy(&mask, TARGET_CPUS);
-    /* Don't chance ending up with an empty mask. */
-    if (cpumask_intersects(&mask, desc->arch.cpu_mask))
-        cpumask_and(&mask, &mask, desc->arch.cpu_mask);
-    SET_DEST(entry, logical, cpu_mask_to_apicid(&mask));
+    if (cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS)) {
+        cpumask_t *mask = this_cpu(scratch_cpumask);
+
+        cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
+        SET_DEST(entry, logical, cpu_mask_to_apicid(mask));
+    } else {
+        printk(XENLOG_ERR "IRQ%d: no target CPU (%*pb vs %*pb)\n",
+               irq, nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+               nr_cpu_ids, cpumask_bits(TARGET_CPUS));
+        desc->status |= IRQ_DISABLED;
+    }
 
     apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
 		"(%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
@@ -2419,7 +2424,21 @@ int ioapic_guest_write(unsigned long phy
     /* Set the vector field to the real vector! */
     rte.vector = desc->arch.vector;
 
-    SET_DEST(rte, logical, cpu_mask_to_apicid(desc->arch.cpu_mask));
+    if ( cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS) )
+    {
+        cpumask_t *mask = this_cpu(scratch_cpumask);
+
+        cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
+        SET_DEST(rte, logical, cpu_mask_to_apicid(mask));
+    }
+    else
+    {
+        gprintk(XENLOG_ERR, "IRQ%d: no target CPU (%*pb vs %*pb)\n",
+               irq, nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+               nr_cpu_ids, cpumask_bits(TARGET_CPUS));
+        desc->status |= IRQ_DISABLED;
+        rte.mask = 1;
+    }
 
     __ioapic_write_entry(apic, pin, 0, rte);
     
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -478,11 +478,13 @@ static int __assign_irq_vector(
      */
     static int current_vector = FIRST_DYNAMIC_VECTOR, current_offset = 0;
     int cpu, err, old_vector;
-    cpumask_t tmp_mask;
     vmask_t *irq_used_vectors = NULL;
 
     old_vector = irq_to_vector(irq);
-    if (old_vector > 0) {
+    if ( old_vector > 0 )
+    {
+        cpumask_t tmp_mask;
+
         cpumask_and(&tmp_mask, mask, &cpu_online_map);
         if (cpumask_intersects(&tmp_mask, desc->arch.cpu_mask)) {
             desc->arch.vector = old_vector;
@@ -505,7 +507,9 @@ static int __assign_irq_vector(
     else
         irq_used_vectors = irq_get_used_vector_mask(irq);
 
-    for_each_cpu(cpu, mask) {
+    for_each_cpu(cpu, mask)
+    {
+        const cpumask_t *vec_mask;
         int new_cpu;
         int vector, offset;
 
@@ -513,8 +517,7 @@ static int __assign_irq_vector(
         if (!cpu_online(cpu))
             continue;
 
-        cpumask_and(&tmp_mask, vector_allocation_cpumask(cpu),
-                    &cpu_online_map);
+        vec_mask = vector_allocation_cpumask(cpu);
 
         vector = current_vector;
         offset = current_offset;
@@ -535,7 +538,7 @@ next:
             && test_bit(vector, irq_used_vectors) )
             goto next;
 
-        for_each_cpu(new_cpu, &tmp_mask)
+        for_each_cpu(new_cpu, vec_mask)
             if (per_cpu(vector_irq, new_cpu)[vector] >= 0)
                 goto next;
         /* Found one! */
@@ -554,12 +557,12 @@ next:
                 release_old_vec(desc);
         }
 
-        trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
+        trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, vec_mask);
 
-        for_each_cpu(new_cpu, &tmp_mask)
+        for_each_cpu(new_cpu, vec_mask)
             per_cpu(vector_irq, new_cpu)[vector] = irq;
         desc->arch.vector = vector;
-        cpumask_copy(desc->arch.cpu_mask, &tmp_mask);
+        cpumask_copy(desc->arch.cpu_mask, vec_mask);
 
         desc->arch.used = IRQ_USED;
         ASSERT((desc->arch.used_vectors == NULL)
@@ -791,6 +794,7 @@ unsigned int set_desc_affinity(struct ir
 
     cpumask_copy(desc->affinity, mask);
     cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+    cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
 
     return cpu_mask_to_apicid(&dest_mask);
 }
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -32,6 +32,12 @@ struct irq_desc;
 struct arch_irq_desc {
         s16 vector;                  /* vector itself is only 8 bits, */
         s16 old_vector;              /* but we use -1 for unassigned  */
+        /*
+         * Except for high priority interrupts @cpu_mask may have bits set for
+         * offline CPUs.  Consumers need to be careful to mask this down to
+         * online ones as necessary.  There is supposed to always be a non-
+         * empty intersection with cpu_online_map.
+         */
         cpumask_var_t cpu_mask;
         cpumask_var_t old_cpu_mask;
         cpumask_var_t pending_mask;
x86/IRQ: fix locking around vector management

All of __{assign,bind,clear}_irq_vector() manipulate struct irq_desc
fields, and hence ought to be called with the descriptor lock held in
addition to vector_lock. This is currently the case for only
set_desc_affinity() (in the common case) and destroy_irq(), which also
clarifies what the nesting behavior between the locks has to be.
Reflect the new expectation by having these functions all take a
descriptor as parameter instead of an interrupt number.

Also take care of the two special cases of calls to set_desc_affinity():
set_ioapic_affinity_irq() and VT-d's dma_msi_set_affinity() get called
directly as well, and in these cases the descriptor locks hadn't got
acquired till now. For set_ioapic_affinity_irq() this means acquiring /
releasing of the IO-APIC lock can be plain spin_{,un}lock() then.

Drop one of the two leading underscores from all three functions at
the same time.

There's one case left where descriptors get manipulated with just
vector_lock held: setup_vector_irq() assumes its caller to acquire
vector_lock, and hence can't itself acquire the descriptor locks (wrong
lock order). I don't currently see how to address this.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com> [VT-d]
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v4: Adjust comment ahead of setup_vector_irq().
v3: Also drop one leading underscore from a comment. Re-base.
v2: Also adjust set_ioapic_affinity_irq() and VT-d's
    dma_msi_set_affinity().
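
For readers wanting the resulting lock-order rule at a glance, here is a
standalone analogy using pthread mutexes (an analogy only, not Xen's
spinlock API):

    #include <pthread.h>
    #include <stdio.h>

    /*
     * The nesting the patch establishes: the per-IRQ descriptor lock is
     * always taken first, the global vector_lock inside it.
     */
    static pthread_mutex_t desc_lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t vector_lock = PTHREAD_MUTEX_INITIALIZER;

    static void clear_vector_example(void)
    {
        pthread_mutex_lock(&desc_lock);     /* outer: protects the descriptor */
        pthread_mutex_lock(&vector_lock);   /* inner: protects vector bookkeeping */

        /* ... manipulate vector / CPU mask fields of the descriptor ... */

        pthread_mutex_unlock(&vector_lock);
        pthread_mutex_unlock(&desc_lock);
    }

    int main(void)
    {
        clear_vector_example();
        puts("nesting: descriptor lock outer, vector_lock inner");
        return 0;
    }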

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -550,14 +550,14 @@ static void clear_IO_APIC (void)
 static void
 set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
 {
-    unsigned long flags;
     unsigned int dest;
     int pin, irq;
     struct irq_pin_list *entry;
 
     irq = desc->irq;
 
-    spin_lock_irqsave(&ioapic_lock, flags);
+    spin_lock(&ioapic_lock);
+
     dest = set_desc_affinity(desc, mask);
     if (dest != BAD_APICID) {
         if ( !x2apic_enabled )
@@ -580,8 +580,8 @@ set_ioapic_affinity_irq(struct irq_desc
             entry = irq_2_pin + entry->next;
         }
     }
-    spin_unlock_irqrestore(&ioapic_lock, flags);
 
+    spin_unlock(&ioapic_lock);
 }
 
 /*
@@ -674,16 +674,19 @@ void /*__init*/ setup_ioapic_dest(void)
     for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
         for (pin = 0; pin < nr_ioapic_entries[ioapic]; pin++) {
             struct irq_desc *desc;
+            unsigned long flags;
 
             irq_entry = find_irq_entry(ioapic, pin, mp_INT);
             if (irq_entry == -1)
                 continue;
             irq = pin_2_irq(irq_entry, ioapic, pin);
             desc = irq_to_desc(irq);
+
+            spin_lock_irqsave(&desc->lock, flags);
             BUG_ON(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map));
             set_ioapic_affinity_irq(desc, desc->arch.cpu_mask);
+            spin_unlock_irqrestore(&desc->lock, flags);
         }
-
     }
 }
 
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -27,6 +27,7 @@
 #include <public/physdev.h>
 
 static int parse_irq_vector_map_param(const char *s);
+static void _clear_irq_vector(struct irq_desc *desc);
 
 /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */
 bool __read_mostly opt_noirqbalance;
@@ -143,13 +144,12 @@ static void trace_irq_mask(uint32_t even
         _trace_irq_mask(event, irq, vector, mask);
 }
 
-static int __init __bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask)
+static int __init _bind_irq_vector(struct irq_desc *desc, int vector,
+                                   const cpumask_t *cpu_mask)
 {
     cpumask_t online_mask;
     int cpu;
-    struct irq_desc *desc = irq_to_desc(irq);
 
-    BUG_ON((unsigned)irq >= nr_irqs);
     BUG_ON((unsigned)vector >= NR_VECTORS);
 
     cpumask_and(&online_mask, cpu_mask, &cpu_online_map);
@@ -160,9 +160,9 @@ static int __init __bind_irq_vector(int
         return 0;
     if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
         return -EBUSY;
-    trace_irq_mask(TRC_HW_IRQ_BIND_VECTOR, irq, vector, &online_mask);
+    trace_irq_mask(TRC_HW_IRQ_BIND_VECTOR, desc->irq, vector, &online_mask);
     for_each_cpu(cpu, &online_mask)
-        per_cpu(vector_irq, cpu)[vector] = irq;
+        per_cpu(vector_irq, cpu)[vector] = desc->irq;
     desc->arch.vector = vector;
     cpumask_copy(desc->arch.cpu_mask, &online_mask);
     if ( desc->arch.used_vectors )
@@ -176,12 +176,18 @@ static int __init __bind_irq_vector(int
 
 int __init bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask)
 {
+    struct irq_desc *desc = irq_to_desc(irq);
     unsigned long flags;
     int ret;
 
-    spin_lock_irqsave(&vector_lock, flags);
-    ret = __bind_irq_vector(irq, vector, cpu_mask);
-    spin_unlock_irqrestore(&vector_lock, flags);
+    BUG_ON((unsigned)irq >= nr_irqs);
+
+    spin_lock_irqsave(&desc->lock, flags);
+    spin_lock(&vector_lock);
+    ret = _bind_irq_vector(desc, vector, cpu_mask);
+    spin_unlock(&vector_lock);
+    spin_unlock_irqrestore(&desc->lock, flags);
+
     return ret;
 }
 
@@ -266,18 +272,20 @@ void destroy_irq(unsigned int irq)
 
     spin_lock_irqsave(&desc->lock, flags);
     desc->handler = &no_irq_type;
-    clear_irq_vector(irq);
+    spin_lock(&vector_lock);
+    _clear_irq_vector(desc);
+    spin_unlock(&vector_lock);
     desc->arch.used_vectors = NULL;
     spin_unlock_irqrestore(&desc->lock, flags);
 
     xfree(action);
 }
 
-static void __clear_irq_vector(int irq)
+static void _clear_irq_vector(struct irq_desc *desc)
 {
-    int cpu, vector, old_vector;
+    unsigned int cpu;
+    int vector, old_vector, irq = desc->irq;
     cpumask_t tmp_mask;
-    struct irq_desc *desc = irq_to_desc(irq);
 
     BUG_ON(!desc->arch.vector);
 
@@ -323,11 +331,14 @@ static void __clear_irq_vector(int irq)
 
 void clear_irq_vector(int irq)
 {
+    struct irq_desc *desc = irq_to_desc(irq);
     unsigned long flags;
 
-    spin_lock_irqsave(&vector_lock, flags);
-    __clear_irq_vector(irq);
-    spin_unlock_irqrestore(&vector_lock, flags);
+    spin_lock_irqsave(&desc->lock, flags);
+    spin_lock(&vector_lock);
+    _clear_irq_vector(desc);
+    spin_unlock(&vector_lock);
+    spin_unlock_irqrestore(&desc->lock, flags);
 }
 
 int irq_to_vector(int irq)
@@ -462,8 +473,7 @@ static vmask_t *irq_get_used_vector_mask
     return ret;
 }
 
-static int __assign_irq_vector(
-    int irq, struct irq_desc *desc, const cpumask_t *mask)
+static int _assign_irq_vector(struct irq_desc *desc, const cpumask_t *mask)
 {
     /*
      * NOTE! The local APIC isn't very good at handling
@@ -477,7 +487,8 @@ static int __assign_irq_vector(
      * 0x80, because int 0x80 is hm, kind of importantish. ;)
      */
     static int current_vector = FIRST_DYNAMIC_VECTOR, current_offset = 0;
-    int cpu, err, old_vector;
+    unsigned int cpu;
+    int err, old_vector, irq = desc->irq;
     vmask_t *irq_used_vectors = NULL;
 
     old_vector = irq_to_vector(irq);
@@ -590,8 +601,12 @@ int assign_irq_vector(int irq, const cpu
     
     BUG_ON(irq >= nr_irqs || irq <0);
 
-    spin_lock_irqsave(&vector_lock, flags);
-    ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
+    spin_lock_irqsave(&desc->lock, flags);
+
+    spin_lock(&vector_lock);
+    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
+    spin_unlock(&vector_lock);
+
     if ( !ret )
     {
         ret = desc->arch.vector;
@@ -600,14 +615,16 @@ int assign_irq_vector(int irq, const cpu
         else
             cpumask_setall(desc->affinity);
     }
-    spin_unlock_irqrestore(&vector_lock, flags);
+
+    spin_unlock_irqrestore(&desc->lock, flags);
 
     return ret;
 }
 
 /*
  * Initialize vector_irq on a new cpu. This function must be called
- * with vector_lock held.
+ * with vector_lock held.  For this reason it may not itself acquire
+ * the IRQ descriptor locks, as lock nesting is the other way around.
  */
 void setup_vector_irq(unsigned int cpu)
 {
@@ -775,7 +792,6 @@ void irq_complete_move(struct irq_desc *
 
 unsigned int set_desc_affinity(struct irq_desc *desc, const cpumask_t *mask)
 {
-    unsigned int irq;
     int ret;
     unsigned long flags;
     cpumask_t dest_mask;
@@ -783,10 +799,8 @@ unsigned int set_desc_affinity(struct ir
     if (!cpumask_intersects(mask, &cpu_online_map))
         return BAD_APICID;
 
-    irq = desc->irq;
-
     spin_lock_irqsave(&vector_lock, flags);
-    ret = __assign_irq_vector(irq, desc, mask);
+    ret = _assign_irq_vector(desc, mask);
     spin_unlock_irqrestore(&vector_lock, flags);
 
     if (ret < 0)
@@ -2453,7 +2467,7 @@ void fixup_irqs(const cpumask_t *mask, b
 
         /*
          * In order for the affinity adjustment below to be successful, we
-         * need __assign_irq_vector() to succeed. This in particular means
+         * need _assign_irq_vector() to succeed. This in particular means
          * clearing desc->arch.move_in_progress if this would otherwise
          * prevent the function from succeeding. Since there's no way for the
          * flag to get cleared anymore when there's no possible destination
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2134,11 +2134,16 @@ static void adjust_irq_affinity(struct a
     unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
                              : NUMA_NO_NODE;
     const cpumask_t *cpumask = &cpu_online_map;
+    struct irq_desc *desc;
 
     if ( node < MAX_NUMNODES && node_online(node) &&
          cpumask_intersects(&node_to_cpumask(node), cpumask) )
         cpumask = &node_to_cpumask(node);
-    dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask);
+
+    desc = irq_to_desc(drhd->iommu->msi.irq);
+    spin_lock_irq(&desc->lock);
+    dma_msi_set_affinity(desc, cpumask);
+    spin_unlock_irq(&desc->lock);
 }
 
 static int adjust_vtd_irq_affinities(void)
x86/IOMMU: don't restrict IRQ affinities to online CPUs

In line with "x86/IRQ: desc->affinity should strictly represent the
requested value" the internally used IRQ(s) also shouldn't be restricted
to online ones. Make set_desc_affinity() (set_msi_affinity() then does
by implication) cope with a NULL mask being passed (just like
assign_irq_vector() does), and have IOMMU code pass NULL instead of
&cpu_online_map (when, for VT-d, there's no NUMA node information
available).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -796,18 +796,26 @@ unsigned int set_desc_affinity(struct ir
     unsigned long flags;
     cpumask_t dest_mask;
 
-    if (!cpumask_intersects(mask, &cpu_online_map))
+    if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
         return BAD_APICID;
 
     spin_lock_irqsave(&vector_lock, flags);
-    ret = _assign_irq_vector(desc, mask);
+    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
     spin_unlock_irqrestore(&vector_lock, flags);
 
-    if (ret < 0)
+    if ( ret < 0 )
         return BAD_APICID;
 
-    cpumask_copy(desc->affinity, mask);
-    cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+    if ( mask )
+    {
+        cpumask_copy(desc->affinity, mask);
+        cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+    }
+    else
+    {
+        cpumask_setall(desc->affinity);
+        cpumask_copy(&dest_mask, desc->arch.cpu_mask);
+    }
     cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
 
     return cpu_mask_to_apicid(&dest_mask);
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -888,7 +888,7 @@ static void enable_iommu(struct amd_iomm
 
     desc = irq_to_desc(iommu->msi.irq);
     spin_lock(&desc->lock);
-    set_msi_affinity(desc, &cpu_online_map);
+    set_msi_affinity(desc, NULL);
     spin_unlock(&desc->lock);
 
     amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2133,11 +2133,11 @@ static void adjust_irq_affinity(struct a
     const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
     unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
                              : NUMA_NO_NODE;
-    const cpumask_t *cpumask = &cpu_online_map;
+    const cpumask_t *cpumask = NULL;
     struct irq_desc *desc;
 
     if ( node < MAX_NUMNODES && node_online(node) &&
-         cpumask_intersects(&node_to_cpumask(node), cpumask) )
+         cpumask_intersects(&node_to_cpumask(node), &cpu_online_map) )
         cpumask = &node_to_cpumask(node);
 
     desc = irq_to_desc(drhd->iommu->msi.irq);
x86/IRQ: target online CPUs when binding guest IRQ

fixup_irqs() skips interrupts without action. Hence such interrupts can
retain affinity to just offline CPUs. With "noirqbalance" in effect,
pirq_guest_bind() so far would have left them alone, resulting in a non-
working interrupt.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3: New.
---
I've not observed this problem in practice - the change is just the
result of code inspection after having noticed action-less IRQs in 'i'
debug key output pointing at all parked/offline CPUs.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1703,9 +1703,27 @@ int pirq_guest_bind(struct vcpu *v, stru
 
         desc->status |= IRQ_GUEST;
 
-        /* Attempt to bind the interrupt target to the correct CPU. */
-        if ( !opt_noirqbalance && (desc->handler->set_affinity != NULL) )
-            desc->handler->set_affinity(desc, cpumask_of(v->processor));
+        /*
+         * Attempt to bind the interrupt target to the correct (or at least
+         * some online) CPU.
+         */
+        if ( desc->handler->set_affinity )
+        {
+            const cpumask_t *affinity = NULL;
+
+            if ( !opt_noirqbalance )
+                affinity = cpumask_of(v->processor);
+            else if ( !cpumask_intersects(desc->affinity, &cpu_online_map) )
+            {
+                cpumask_setall(desc->affinity);
+                affinity = &cpumask_all;
+            }
+            else if ( !cpumask_intersects(desc->arch.cpu_mask,
+                                          &cpu_online_map) )
+                affinity = desc->affinity;
+            if ( affinity )
+                desc->handler->set_affinity(desc, affinity);
+        }
 
         desc->status &= ~IRQ_DISABLED;
         desc->handler->startup(desc);
x86/IRQs: correct/tighten vector check in _clear_irq_vector()

If any particular value was to be checked against, it would need to be
IRQ_VECTOR_UNASSIGNED.

Reported-by: Roger Pau Monné <roger.pau@citrix.com>

Be more strict though and use valid_irq_vector() instead.

Take the opportunity and also convert local variables to unsigned int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v2: New.
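
To see why "> 0" is looser than the range check, a small standalone sketch
(the range constants are assumed example values, not necessarily Xen's real
ones):

    #include <stdbool.h>
    #include <stdio.h>

    /* Assumed stand-ins for Xen's vector range constants. */
    #define FIRST_DYNAMIC_VECTOR   0x21
    #define LAST_HIPRIORITY_VECTOR 0xf7
    #define IRQ_VECTOR_UNASSIGNED  (-1)

    static bool valid_irq_vector(int vector)
    {
        return vector >= FIRST_DYNAMIC_VECTOR && vector <= LAST_HIPRIORITY_VECTOR;
    }

    int main(void)
    {
        const int samples[] = { IRQ_VECTOR_UNASSIGNED, 0, 0x10, 0x30 };
        unsigned int i;

        /* 0x10 passes a "> 0" test but is outside the valid dynamic range. */
        for ( i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i )
            printf("vector %3d: \">0\" accepts it: %d, valid_irq_vector(): %d\n",
                   samples[i], samples[i] > 0, valid_irq_vector(samples[i]));

        return 0;
    }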

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -283,14 +283,13 @@ void destroy_irq(unsigned int irq)
 
 static void _clear_irq_vector(struct irq_desc *desc)
 {
-    unsigned int cpu;
-    int vector, old_vector, irq = desc->irq;
+    unsigned int cpu, old_vector, irq = desc->irq;
+    unsigned int vector = desc->arch.vector;
     cpumask_t tmp_mask;
 
-    BUG_ON(!desc->arch.vector);
+    BUG_ON(!valid_irq_vector(vector));
 
     /* Always clear desc->arch.vector */
-    vector = desc->arch.vector;
     cpumask_and(&tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
 
     for_each_cpu(cpu, &tmp_mask) {
x86/IRQ: make fixup_irqs() skip unconnected internally used interrupts

Since the "Cannot set affinity ..." warning is a one time one, avoid
triggering it already at boot time when parking secondary threads and
the serial console uses a (still unconnected at that time) PCI IRQ.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2472,8 +2472,20 @@ void fixup_irqs(const cpumask_t *mask, b
         vector = irq_to_vector(irq);
         if ( vector >= FIRST_HIPRIORITY_VECTOR &&
              vector <= LAST_HIPRIORITY_VECTOR )
+        {
             cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
 
+            /*
+             * This can in particular happen when parking secondary threads
+             * during boot and when the serial console wants to use a PCI IRQ.
+             */
+            if ( desc->handler == &no_irq_type )
+            {
+                spin_unlock(&desc->lock);
+                continue;
+            }
+        }
+
         if ( desc->arch.move_cleanup_count )
         {
             /* The cleanup IPI may have got sent while we were still online. */
x86/IRQ: drop redundant cpumask_empty() from move_masked_irq()

The subsequent cpumask_intersects() covers the "empty" case quite fine.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -658,9 +658,6 @@ void move_masked_irq(struct irq_desc *de
     
     desc->status &= ~IRQ_MOVE_PENDING;
 
-    if (unlikely(cpumask_empty(pending_mask)))
-        return;
-
     if (!desc->handler->set_affinity)
         return;
 
x86/IRQ: tighten vector checks

Use valid_irq_vector() rather than "> 0".

Also replace an open-coded use of IRQ_VECTOR_UNASSIGNED.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -342,7 +342,7 @@ void clear_irq_vector(int irq)
 
 int irq_to_vector(int irq)
 {
-    int vector = -1;
+    int vector = IRQ_VECTOR_UNASSIGNED;
 
     BUG_ON(irq >= nr_irqs || irq < 0);
 
@@ -452,15 +452,18 @@ static vmask_t *irq_get_used_vector_mask
             int vector;
             
             vector = irq_to_vector(irq);
-            if ( vector > 0 )
+            if ( valid_irq_vector(vector) )
             {
-                printk(XENLOG_INFO "IRQ %d already assigned vector %d\n",
+                printk(XENLOG_INFO "IRQ%d already assigned vector %02x\n",
                        irq, vector);
                 
                 ASSERT(!test_bit(vector, ret));
 
                 set_bit(vector, ret);
             }
+            else if ( vector != IRQ_VECTOR_UNASSIGNED )
+                printk(XENLOG_WARNING "IRQ%d mapped to bogus vector %02x\n",
+                       irq, vector);
         }
     }
     else if ( IO_APIC_IRQ(irq) &&
@@ -491,7 +494,7 @@ static int _assign_irq_vector(struct irq
     vmask_t *irq_used_vectors = NULL;
 
     old_vector = irq_to_vector(irq);
-    if ( old_vector > 0 )
+    if ( valid_irq_vector(old_vector) )
     {
         cpumask_t tmp_mask;
 
@@ -555,7 +558,7 @@ next:
         current_vector = vector;
         current_offset = offset;
 
-        if ( old_vector > 0 )
+        if ( valid_irq_vector(old_vector) )
         {
             cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask,
                         &cpu_online_map);
x86/IRQ: eliminate some on-stack cpumask_t instances

Use scratch_cpumask where possible, to avoid creating these possibly
large stack objects. We can't use it in _assign_irq_vector() and
set_desc_affinity(), as these get called in IRQ context.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v4: Re-base over changes earlier in the series.
v3: New.
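
The size concern can be made concrete with a back-of-the-envelope
calculation (the CPU counts below are assumed examples; Xen's NR_CPUS is a
build-time option): a cpumask_t needs one bit per possible CPU, so large
configurations make on-stack instances unwelcome. The per-CPU scratch mask
avoids that, but can't be used on paths reachable in IRQ context, since an
interrupt could reuse it while the interrupted code still has live data in
it.

    #include <stdio.h>

    int main(void)
    {
        const unsigned int nr_cpus[] = { 256, 1024, 4096 };
        unsigned int i;

        /* One bit per possible CPU, rounded up to whole bytes. */
        for ( i = 0; i < sizeof(nr_cpus) / sizeof(nr_cpus[0]); ++i )
            printf("NR_CPUS=%4u -> on-stack cpumask_t of %u bytes\n",
                   nr_cpus[i], (nr_cpus[i] + 7) / 8);

        return 0;
    }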

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -285,14 +285,15 @@ static void _clear_irq_vector(struct irq
 {
     unsigned int cpu, old_vector, irq = desc->irq;
     unsigned int vector = desc->arch.vector;
-    cpumask_t tmp_mask;
+    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
 
     BUG_ON(!valid_irq_vector(vector));
 
     /* Always clear desc->arch.vector */
-    cpumask_and(&tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
+    cpumask_and(tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
 
-    for_each_cpu(cpu, &tmp_mask) {
+    for_each_cpu(cpu, tmp_mask)
+    {
         ASSERT( per_cpu(vector_irq, cpu)[vector] == irq );
         per_cpu(vector_irq, cpu)[vector] = ~irq;
     }
@@ -308,16 +309,17 @@ static void _clear_irq_vector(struct irq
 
     desc->arch.used = IRQ_UNUSED;
 
-    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, &tmp_mask);
+    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
 
     if ( likely(!desc->arch.move_in_progress) )
         return;
 
     /* If we were in motion, also clear desc->arch.old_vector */
     old_vector = desc->arch.old_vector;
-    cpumask_and(&tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
+    cpumask_and(tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
 
-    for_each_cpu(cpu, &tmp_mask) {
+    for_each_cpu(cpu, tmp_mask)
+    {
         ASSERT( per_cpu(vector_irq, cpu)[old_vector] == irq );
         TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu);
         per_cpu(vector_irq, cpu)[old_vector] = ~irq;
@@ -1169,7 +1171,6 @@ static void irq_guest_eoi_timer_fn(void
     struct irq_desc *desc = data;
     unsigned int i, irq = desc - irq_desc;
     irq_guest_action_t *action;
-    cpumask_t cpu_eoi_map;
 
     spin_lock_irq(&desc->lock);
     
@@ -1206,14 +1207,18 @@ static void irq_guest_eoi_timer_fn(void
 
     switch ( action->ack_type )
     {
+        cpumask_t *cpu_eoi_map;
+
     case ACKTYPE_UNMASK:
         if ( desc->handler->end )
             desc->handler->end(desc, 0);
         break;
+
     case ACKTYPE_EOI:
-        cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map);
+        cpu_eoi_map = this_cpu(scratch_cpumask);
+        cpumask_copy(cpu_eoi_map, action->cpu_eoi_map);
         spin_unlock_irq(&desc->lock);
-        on_selected_cpus(&cpu_eoi_map, set_eoi_ready, desc, 0);
+        on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 0);
         return;
     }
 
@@ -2458,7 +2463,7 @@ void fixup_irqs(const cpumask_t *mask, b
     {
         bool break_affinity = false, set_affinity = true;
         unsigned int vector;
-        cpumask_t affinity;
+        cpumask_t *affinity = this_cpu(scratch_cpumask);
 
         if ( irq == 2 )
             continue;
@@ -2489,9 +2494,9 @@ void fixup_irqs(const cpumask_t *mask, b
         if ( desc->arch.move_cleanup_count )
         {
             /* The cleanup IPI may have got sent while we were still online. */
-            cpumask_andnot(&affinity, desc->arch.old_cpu_mask,
+            cpumask_andnot(affinity, desc->arch.old_cpu_mask,
                            &cpu_online_map);
-            desc->arch.move_cleanup_count -= cpumask_weight(&affinity);
+            desc->arch.move_cleanup_count -= cpumask_weight(affinity);
             if ( !desc->arch.move_cleanup_count )
                 release_old_vec(desc);
         }
@@ -2518,10 +2523,10 @@ void fixup_irqs(const cpumask_t *mask, b
         {
             unsigned int cpu;
 
-            cpumask_and(&affinity, desc->arch.old_cpu_mask, &cpu_online_map);
+            cpumask_and(affinity, desc->arch.old_cpu_mask, &cpu_online_map);
 
             spin_lock(&vector_lock);
-            for_each_cpu(cpu, &affinity)
+            for_each_cpu(cpu, affinity)
                 per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
             spin_unlock(&vector_lock);
 
@@ -2532,23 +2537,23 @@ void fixup_irqs(const cpumask_t *mask, b
         if ( !cpumask_intersects(mask, desc->affinity) )
         {
             break_affinity = true;
-            cpumask_setall(&affinity);
+            cpumask_setall(affinity);
         }
         else
-            cpumask_copy(&affinity, desc->affinity);
+            cpumask_copy(affinity, desc->affinity);
 
         if ( desc->handler->disable )
             desc->handler->disable(desc);
 
         if ( desc->handler->set_affinity )
-            desc->handler->set_affinity(desc, &affinity);
+            desc->handler->set_affinity(desc, affinity);
         else if ( !(warned++) )
             set_affinity = false;
 
         if ( desc->handler->enable )
             desc->handler->enable(desc);
 
-        cpumask_copy(&affinity, desc->affinity);
+        cpumask_copy(affinity, desc->affinity);
 
         spin_unlock(&desc->lock);
 
@@ -2559,7 +2564,7 @@ void fixup_irqs(const cpumask_t *mask, b
             printk("Cannot set affinity for IRQ%u\n", irq);
         else if ( break_affinity )
             printk("Broke affinity for IRQ%u, new: %*pb\n",
-                   irq, nr_cpu_ids, cpumask_bits(&affinity));
+                   irq, nr_cpu_ids, cpumask_bits(affinity));
     }
 
     /* That doesn't seem sufficient.  Give it 1ms. */
x86/IRQ: move {,_}clear_irq_vector()

This is largely to drop a forward declaration. There's one functional
change - clear_irq_vector() gets marked __init, as its only caller is
check_timer(). Beyond this only a few stray blanks get removed.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -27,7 +27,6 @@
 #include <public/physdev.h>
 
 static int parse_irq_vector_map_param(const char *s);
-static void _clear_irq_vector(struct irq_desc *desc);
 
 /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */
 bool __read_mostly opt_noirqbalance;
@@ -191,6 +190,67 @@ int __init bind_irq_vector(int irq, int
     return ret;
 }
 
+static void _clear_irq_vector(struct irq_desc *desc)
+{
+    unsigned int cpu, old_vector, irq = desc->irq;
+    unsigned int vector = desc->arch.vector;
+    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
+
+    BUG_ON(!valid_irq_vector(vector));
+
+    /* Always clear desc->arch.vector */
+    cpumask_and(tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
+
+    for_each_cpu(cpu, tmp_mask)
+    {
+        ASSERT(per_cpu(vector_irq, cpu)[vector] == irq);
+        per_cpu(vector_irq, cpu)[vector] = ~irq;
+    }
+
+    desc->arch.vector = IRQ_VECTOR_UNASSIGNED;
+    cpumask_clear(desc->arch.cpu_mask);
+
+    if ( desc->arch.used_vectors )
+    {
+        ASSERT(test_bit(vector, desc->arch.used_vectors));
+        clear_bit(vector, desc->arch.used_vectors);
+    }
+
+    desc->arch.used = IRQ_UNUSED;
+
+    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
+
+    if ( likely(!desc->arch.move_in_progress) )
+        return;
+
+    /* If we were in motion, also clear desc->arch.old_vector */
+    old_vector = desc->arch.old_vector;
+    cpumask_and(tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
+
+    for_each_cpu(cpu, tmp_mask)
+    {
+        ASSERT(per_cpu(vector_irq, cpu)[old_vector] == irq);
+        TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu);
+        per_cpu(vector_irq, cpu)[old_vector] = ~irq;
+    }
+
+    release_old_vec(desc);
+
+    desc->arch.move_in_progress = 0;
+}
+
+void __init clear_irq_vector(int irq)
+{
+    struct irq_desc *desc = irq_to_desc(irq);
+    unsigned long flags;
+
+    spin_lock_irqsave(&desc->lock, flags);
+    spin_lock(&vector_lock);
+    _clear_irq_vector(desc);
+    spin_unlock(&vector_lock);
+    spin_unlock_irqrestore(&desc->lock, flags);
+}
+
 /*
  * Dynamic irq allocate and deallocation for MSI
  */
@@ -281,67 +341,6 @@ void destroy_irq(unsigned int irq)
     xfree(action);
 }
 
-static void _clear_irq_vector(struct irq_desc *desc)
-{
-    unsigned int cpu, old_vector, irq = desc->irq;
-    unsigned int vector = desc->arch.vector;
-    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
-
-    BUG_ON(!valid_irq_vector(vector));
-
-    /* Always clear desc->arch.vector */
-    cpumask_and(tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
-
-    for_each_cpu(cpu, tmp_mask)
-    {
-        ASSERT( per_cpu(vector_irq, cpu)[vector] == irq );
-        per_cpu(vector_irq, cpu)[vector] = ~irq;
-    }
-
-    desc->arch.vector = IRQ_VECTOR_UNASSIGNED;
-    cpumask_clear(desc->arch.cpu_mask);
-
-    if ( desc->arch.used_vectors )
-    {
-        ASSERT(test_bit(vector, desc->arch.used_vectors));
-        clear_bit(vector, desc->arch.used_vectors);
-    }
-
-    desc->arch.used = IRQ_UNUSED;
-
-    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
-
-    if ( likely(!desc->arch.move_in_progress) )
-        return;
-
-    /* If we were in motion, also clear desc->arch.old_vector */
-    old_vector = desc->arch.old_vector;
-    cpumask_and(tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
-
-    for_each_cpu(cpu, tmp_mask)
-    {
-        ASSERT( per_cpu(vector_irq, cpu)[old_vector] == irq );
-        TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu);
-        per_cpu(vector_irq, cpu)[old_vector] = ~irq;
-    }
-
-    release_old_vec(desc);
-
-    desc->arch.move_in_progress = 0;
-}
-
-void clear_irq_vector(int irq)
-{
-    struct irq_desc *desc = irq_to_desc(irq);
-    unsigned long flags;
-
-    spin_lock_irqsave(&desc->lock, flags);
-    spin_lock(&vector_lock);
-    _clear_irq_vector(desc);
-    spin_unlock(&vector_lock);
-    spin_unlock_irqrestore(&desc->lock, flags);
-}
-
 int irq_to_vector(int irq)
 {
     int vector = IRQ_VECTOR_UNASSIGNED;
Re: [Xen-devel] [PATCH v4 00/13] x86: IRQ management adjustments
Posted by Andrew Cooper 4 years, 9 months ago
On 16/07/2019 08:24, Jan Beulich wrote:
> For v4 specific information please see the individual patches.
>
> Full set of patches attached here due to still unresolved email
> issues over here.

In future, can you please number the patches in their correct order.

There is no correlation between the names of the patches, and either the
commit title, or the order in which they should be applied.

I'm currently having an exceedingly "fun" time trying to turn this back
into a git tree.  (No point resending this series, because I've done it
now.)

~Andrew

P.S. Of course, `git format-patch` is the recommended way to do this,
but then again, so is `git send-email` in the first place.

Re: [Xen-devel] [PATCH v4 00/13] x86: IRQ management adjustments
Posted by Jan Beulich 4 years, 9 months ago
On 19.07.2019 14:36, Andrew Cooper wrote:
> On 16/07/2019 08:24, Jan Beulich wrote:
>> For v4 specific information please see the individual patches.
>>
>> Full set of patches attached here due to still unresolved email
>> issues over here.
> 
> In future, can you please number the patches in their correct order.
> 
> There is no correlation between the names of the patches, and either the
> commit title, or the order in which they should be applied.

I'd specifically gone and sorted the attachments, and as per the
list archives this sorting has indeed been preserved.

> I'm currently having an exceedingly "fun" time trying to turn this back
> into a git tree.  (No point resending this series, because I've done it
> now.)

I'm sorry for this.

> P.S. Of course, `git format-patch` is the recommended way to do this,
> but then again, so is `git send-email` in the first place.

Sure - for people using git for their development work. I can only
state again that I'm happy with quilt instead.

Jan
Re: [Xen-devel] [PATCH v4 00/13] x86: IRQ management adjustments
Posted by Andrew Cooper 4 years, 9 months ago
On 19/07/2019 14:04, Jan Beulich wrote:
> On 19.07.2019 14:36, Andrew Cooper wrote:
>> On 16/07/2019 08:24, Jan Beulich wrote:
>>> For v4 specific information please see the individual patches.
>>>
>>> Full set of patches attached here due to still unresolved email
>>> issues over here.
>> In future, can you please number the patches in their correct order.
>>
>> There is no correlation between the names of the patches, and either the
>> commit title, or the order in which they should be applied.
> I'd specifically gone and sorted the attachments, and as per the
> list archives this sorting has indeed been preserved.

Right, but that doesn't hold when saving the patches as individual files
in order to apply them.

>
>> I'm currently having an exceedingly "fun" time trying to turn this back
>> into a git tree.  (No point resending this series, because I've done it
>> now.)
> I'm sorry for this.
>
>> P.S. Of course, `git format-patch` is the recommended way to do this,
>> but then again, so is `git send-email` in the first place.
> Sure - for people using git for their development work. I can only
> state again that I'm happy with quilt instead.

A different alternative would be to push the series to your git tree on
xenbits, which would be almost the same as pushing to staging.

I expect that would actually be more convenient for the people liable to
be dealing with series like this (It certainly would be for me).

~Andrew

+             !cpumask_intersects(mask, desc->arch.cpu_mask) )
+        {
+            unsigned int cpu;
+
+            cpumask_and(&affinity, desc->arch.old_cpu_mask, &cpu_online_map);
+
+            spin_lock(&vector_lock);
+            for_each_cpu(cpu, &affinity)
+                per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
+            spin_unlock(&vector_lock);
+
+            release_old_vec(desc);
+            desc->arch.move_in_progress = 0;
+        }
+
          cpumask_and(&affinity, &affinity, mask);
          if ( cpumask_empty(&affinity) )
          {
@@ -2427,15 +2471,18 @@ void fixup_irqs(const cpumask_t *mask, b
          if ( desc->handler->enable )
              desc->handler->enable(desc);
  
+        cpumask_copy(&affinity, desc->affinity);
+
          spin_unlock(&desc->lock);
  
          if ( !verbose )
              continue;
  
-        if ( break_affinity && set_affinity )
-            printk("Broke affinity for irq %i\n", irq);
-        else if ( !set_affinity )
-            printk("Cannot set affinity for irq %i\n", irq);
+        if ( !set_affinity )
+            printk("Cannot set affinity for IRQ%u\n", irq);
+        else if ( break_affinity )
+            printk("Broke affinity for IRQ%u, new: %*pb\n",
+                   irq, nr_cpu_ids, cpumask_bits(&affinity));
      }
  
      /* That doesn't seem sufficient.  Give it 1ms. */
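
As an aside for readers: the new "Broke affinity" message relies on the
"%*pb" bitmap format, where the field width gives the number of bits and the
pointer argument comes from cpumask_bits(). A minimal sketch of the pattern
(example_log_affinity() is just an illustrative name, not part of the patch):

static void example_log_affinity(unsigned int irq, const cpumask_t *mask)
{
    /* Print the first nr_cpu_ids bits of the mask as a bitmap. */
    printk("IRQ%u affinity: %*pb\n", irq, nr_cpu_ids, cpumask_bits(mask));
}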

Re: [Xen-devel] [PATCH v4 01/13] x86/IRQ: deal with move-in-progress state in fixup_irqs()
Posted by Andrew Cooper 4 years, 9 months ago
On 16/07/2019 08:37, Jan Beulich wrote:
> The flag being set may prevent affinity changes, as these often imply
> assignment of a new vector. When there's no possible destination left
> for the IRQ, the clearing of the flag needs to happen right from
> fixup_irqs().
>
> Additionally _assign_irq_vector() needs to avoid setting the flag when
> there's no online CPU left in what gets put into ->arch.old_cpu_mask.
> The old vector can be released right away in this case.
>
> Also extend the log message about broken affinity to include the new
> affinity as well, allowing to notice issues with affinity changes not
> actually having taken place. Swap the if/else-if order there at the
> same time to reduce the amount of conditions checked.
>
> At the same time replace two open coded instances of the new helper
> function.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH v4 02/13] x86/IRQ: deal with move cleanup count state in fixup_irqs()
Posted by Jan Beulich 4 years, 9 months ago
The cleanup IPI may get sent immediately before a CPU gets removed from
the online map. In such a case the IPI would get handled on the CPU
being offlined no earlier than in the interrupts disabled window after
fixup_irqs()' main loop. This is too late, however, because a possible
affinity change may incur the need for vector assignment, which will
fail when the IRQ's move cleanup count is still non-zero.

To fix this
- record the set of CPUs the cleanup IPI actually gets sent to alongside
  setting their count,
- adjust the count in fixup_irqs(), accounting for all CPUs that the
   cleanup IPI was sent to, but that are no longer online,
- bail early from the cleanup IPI handler when the CPU is no longer
   online, to prevent double accounting.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -675,6 +675,9 @@ void irq_move_cleanup_interrupt(struct c
      ack_APIC_irq();
  
      me = smp_processor_id();
+    if ( !cpu_online(me) )
+        return;
+
      for ( vector = FIRST_DYNAMIC_VECTOR;
            vector <= LAST_HIPRIORITY_VECTOR; vector++)
      {
@@ -735,11 +738,14 @@ unlock:
  
  static void send_cleanup_vector(struct irq_desc *desc)
  {
-    cpumask_t cleanup_mask;
+    cpumask_and(desc->arch.old_cpu_mask, desc->arch.old_cpu_mask,
+                &cpu_online_map);
+    desc->arch.move_cleanup_count = cpumask_weight(desc->arch.old_cpu_mask);
  
-    cpumask_and(&cleanup_mask, desc->arch.old_cpu_mask, &cpu_online_map);
-    desc->arch.move_cleanup_count = cpumask_weight(&cleanup_mask);
-    send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+    if ( desc->arch.move_cleanup_count )
+        send_IPI_mask(desc->arch.old_cpu_mask, IRQ_MOVE_CLEANUP_VECTOR);
+    else
+        release_old_vec(desc);
  
      desc->arch.move_in_progress = 0;
  }
@@ -2419,6 +2425,16 @@ void fixup_irqs(const cpumask_t *mask, b
               vector <= LAST_HIPRIORITY_VECTOR )
              cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
  
+        if ( desc->arch.move_cleanup_count )
+        {
+            /* The cleanup IPI may have got sent while we were still online. */
+            cpumask_andnot(&affinity, desc->arch.old_cpu_mask,
+                           &cpu_online_map);
+            desc->arch.move_cleanup_count -= cpumask_weight(&affinity);
+            if ( !desc->arch.move_cleanup_count )
+                release_old_vec(desc);
+        }
+
          cpumask_copy(&affinity, desc->affinity);
          if ( !desc->action || cpumask_subset(&affinity, mask) )
          {

[Xen-devel] [PATCH v4 03/13] x86/IRQ: desc->affinity should strictly represent the requested value
Posted by Jan Beulich 4 years, 9 months ago
desc->arch.cpu_mask reflects the actual set of target CPUs. Don't ever
fiddle with desc->affinity itself, except to store caller requested
values. Note that assign_irq_vector() now takes a NULL incoming CPU mask
to mean "all CPUs" now, rather than just "all currently online CPUs".
This way no further affinity adjustment is needed after onlining further
CPUs.

This renders both set_native_irq_info() uses (which weren't using proper
locking anyway) redundant - drop the function altogether.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
v4: Use %*pbl.
---
TBD: To reduce the bad effect on the so far tabular output of the 'i'
      debug key, shifting the two affinity values further to the right
      may be worthwhile to consider.

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -1039,7 +1039,6 @@ static void __init setup_IO_APIC_irqs(vo
              SET_DEST(entry, logical, cpu_mask_to_apicid(TARGET_CPUS));
              spin_lock_irqsave(&ioapic_lock, flags);
              __ioapic_write_entry(apic, pin, 0, entry);
-            set_native_irq_info(irq, TARGET_CPUS);
              spin_unlock_irqrestore(&ioapic_lock, flags);
          }
      }
@@ -2248,7 +2247,6 @@ int io_apic_set_pci_routing (int ioapic,
  
      spin_lock_irqsave(&ioapic_lock, flags);
      __ioapic_write_entry(ioapic, pin, 0, entry);
-    set_native_irq_info(irq, TARGET_CPUS);
      spin_unlock(&ioapic_lock);
  
      spin_lock(&desc->lock);
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -589,11 +589,16 @@ int assign_irq_vector(int irq, const cpu
  
      spin_lock_irqsave(&vector_lock, flags);
      ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
-    if (!ret) {
+    if ( !ret )
+    {
          ret = desc->arch.vector;
-        cpumask_copy(desc->affinity, desc->arch.cpu_mask);
+        if ( mask )
+            cpumask_copy(desc->affinity, mask);
+        else
+            cpumask_setall(desc->affinity);
      }
      spin_unlock_irqrestore(&vector_lock, flags);
+
      return ret;
  }
  
@@ -2345,9 +2350,10 @@ static void dump_irqs(unsigned char key)
  
          spin_lock_irqsave(&desc->lock, flags);
  
-        printk("   IRQ:%4d aff:%*pb vec:%02x %-15s status=%03x ",
-               irq, nr_cpu_ids, cpumask_bits(desc->affinity), desc->arch.vector,
-               desc->handler->typename, desc->status);
+        printk("   IRQ:%4d aff:{%*pbl}/{%*pbl} vec:%02x %-15s status=%03x ",
+               irq, nr_cpu_ids, cpumask_bits(desc->affinity),
+               nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+               desc->arch.vector, desc->handler->typename, desc->status);
  
          if ( ssid )
              printk("Z=%-25s ", ssid);
@@ -2435,8 +2441,7 @@ void fixup_irqs(const cpumask_t *mask, b
                  release_old_vec(desc);
          }
  
-        cpumask_copy(&affinity, desc->affinity);
-        if ( !desc->action || cpumask_subset(&affinity, mask) )
+        if ( !desc->action || cpumask_subset(desc->affinity, mask) )
          {
              spin_unlock(&desc->lock);
              continue;
@@ -2469,12 +2474,13 @@ void fixup_irqs(const cpumask_t *mask, b
              desc->arch.move_in_progress = 0;
          }
  
-        cpumask_and(&affinity, &affinity, mask);
-        if ( cpumask_empty(&affinity) )
+        if ( !cpumask_intersects(mask, desc->affinity) )
          {
              break_affinity = true;
-            cpumask_copy(&affinity, mask);
+            cpumask_setall(&affinity);
          }
+        else
+            cpumask_copy(&affinity, desc->affinity);
  
          if ( desc->handler->disable )
              desc->handler->disable(desc);
--- a/xen/include/xen/irq.h
+++ b/xen/include/xen/irq.h
@@ -162,11 +162,6 @@ extern irq_desc_t *domain_spin_lock_irq_
  extern irq_desc_t *pirq_spin_lock_irq_desc(
      const struct pirq *, unsigned long *pflags);
  
-static inline void set_native_irq_info(unsigned int irq, const cpumask_t *mask)
-{
-    cpumask_copy(irq_to_desc(irq)->affinity, mask);
-}
-
  unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *);
  
  #ifndef arch_hwdom_irqs
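
As an aside for readers: with this change a NULL mask passed to
assign_irq_vector() requests "all CPUs", and desc->affinity afterwards holds
exactly that request, while desc->arch.cpu_mask keeps the targets actually
chosen by the allocator. A minimal caller-side sketch under these assumptions
(example_assign() is an illustrative name, not part of the patch):

static int example_assign(unsigned int irq)
{
    /* NULL means "all CPUs"; desc->affinity gets fully set on success. */
    int vector = assign_irq_vector(irq, NULL);

    if ( vector < 0 )
        return vector;

    /* desc->arch.cpu_mask (not desc->affinity) holds the actual targets. */
    return 0;
}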

Re: [Xen-devel] [PATCH v4 03/13] x86/IRQ: desc->affinity should strictly represent the requested value
Posted by Andrew Cooper 4 years, 9 months ago
On 16/07/2019 08:38, Jan Beulich wrote:
> desc->arch.cpu_mask reflects the actual set of target CPUs. Don't ever
> fiddle with desc->affinity itself, except to store caller requested
> values. Note that assign_irq_vector() now takes a NULL incoming CPU mask
> to mean "all CPUs" now, rather than just "all currently online CPUs".
> This way no further affinity adjustment is needed after onlining further
> CPUs.
>
> This renders both set_native_irq_info() uses (which weren't using proper
> locking anyway) redundant - drop the function altogether.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

There are utf8 encoding problems here, but the patch in 0/$N does look
to be ok.

[Xen-devel] [PATCH v4 04/13] x86/IRQ: consolidate use of ->arch.cpu_mask
Posted by Jan Beulich 4 years, 9 months ago
Mixed meaning was implied so far by different pieces of code -
disagreement was in particular about whether to expect offline CPUs'
bits to possibly be set. Switch to a mostly consistent meaning
(exception being high priority interrupts, which would perhaps better
be switched to the same model as well in due course). Use the field to
record the vector allocation mask, i.e. potentially including bits of
offline (parked) CPUs. This implies that before passing the mask to
certain functions (most notably cpu_mask_to_apicid()) it needs to be
further reduced to the online subset.

The exception of high priority interrupts is also why for the moment
_bind_irq_vector() is left as is, despite looking wrong: It's used
exclusively for IRQ0, which isn't supposed to move off CPU0 at any time.

The prior lack of restricting to online CPUs in set_desc_affinity()
before calling cpu_mask_to_apicid() in particular allowed (in x2APIC
clustered mode) offlined CPUs to end up enabled in an IRQ's destination
field. (I wonder whether vector_allocation_cpumask_flat() shouldn't
follow a similar model, using cpu_present_map in favor of
cpu_online_map.)

For IO-APIC code it was definitely wrong to potentially store, as a
fallback, TARGET_CPUS (i.e. all online ones) into the field, as that
would have caused problems when determining on which CPUs to release
vectors when they've gone out of use. Disable interrupts instead when
no valid target CPU can be established (which code elsewhere should
guarantee to never happen), and log a message in such an unlikely event.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v2: New.

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -680,7 +680,7 @@ void /*__init*/ setup_ioapic_dest(void)
                  continue;
              irq = pin_2_irq(irq_entry, ioapic, pin);
              desc = irq_to_desc(irq);
-            BUG_ON(cpumask_empty(desc->arch.cpu_mask));
+            BUG_ON(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map));
              set_ioapic_affinity_irq(desc, desc->arch.cpu_mask);
          }
  
@@ -2194,7 +2194,6 @@ int io_apic_set_pci_routing (int ioapic,
  {
      struct irq_desc *desc = irq_to_desc(irq);
      struct IO_APIC_route_entry entry;
-    cpumask_t mask;
      unsigned long flags;
      int vector;
  
@@ -2229,11 +2228,17 @@ int io_apic_set_pci_routing (int ioapic,
          return vector;
      entry.vector = vector;
  
-    cpumask_copy(&mask, TARGET_CPUS);
-    /* Don't chance ending up with an empty mask. */
-    if (cpumask_intersects(&mask, desc->arch.cpu_mask))
-        cpumask_and(&mask, &mask, desc->arch.cpu_mask);
-    SET_DEST(entry, logical, cpu_mask_to_apicid(&mask));
+    if (cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS)) {
+        cpumask_t *mask = this_cpu(scratch_cpumask);
+
+        cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
+        SET_DEST(entry, logical, cpu_mask_to_apicid(mask));
+    } else {
+        printk(XENLOG_ERR "IRQ%d: no target CPU (%*pb vs %*pb)\n",
+               irq, nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+               nr_cpu_ids, cpumask_bits(TARGET_CPUS));
+        desc->status |= IRQ_DISABLED;
+    }
  
      apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
  		"(%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
@@ -2419,7 +2424,21 @@ int ioapic_guest_write(unsigned long phy
      /* Set the vector field to the real vector! */
      rte.vector = desc->arch.vector;
  
-    SET_DEST(rte, logical, cpu_mask_to_apicid(desc->arch.cpu_mask));
+    if ( cpumask_intersects(desc->arch.cpu_mask, TARGET_CPUS) )
+    {
+        cpumask_t *mask = this_cpu(scratch_cpumask);
+
+        cpumask_and(mask, desc->arch.cpu_mask, TARGET_CPUS);
+        SET_DEST(rte, logical, cpu_mask_to_apicid(mask));
+    }
+    else
+    {
+        gprintk(XENLOG_ERR, "IRQ%d: no target CPU (%*pb vs %*pb)\n",
+               irq, nr_cpu_ids, cpumask_bits(desc->arch.cpu_mask),
+               nr_cpu_ids, cpumask_bits(TARGET_CPUS));
+        desc->status |= IRQ_DISABLED;
+        rte.mask = 1;
+    }
  
      __ioapic_write_entry(apic, pin, 0, rte);
      
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -478,11 +478,13 @@ static int __assign_irq_vector(
       */
      static int current_vector = FIRST_DYNAMIC_VECTOR, current_offset = 0;
      int cpu, err, old_vector;
-    cpumask_t tmp_mask;
      vmask_t *irq_used_vectors = NULL;
  
      old_vector = irq_to_vector(irq);
-    if (old_vector > 0) {
+    if ( old_vector > 0 )
+    {
+        cpumask_t tmp_mask;
+
          cpumask_and(&tmp_mask, mask, &cpu_online_map);
          if (cpumask_intersects(&tmp_mask, desc->arch.cpu_mask)) {
              desc->arch.vector = old_vector;
@@ -505,7 +507,9 @@ static int __assign_irq_vector(
      else
          irq_used_vectors = irq_get_used_vector_mask(irq);
  
-    for_each_cpu(cpu, mask) {
+    for_each_cpu(cpu, mask)
+    {
+        const cpumask_t *vec_mask;
          int new_cpu;
          int vector, offset;
  
@@ -513,8 +517,7 @@ static int __assign_irq_vector(
          if (!cpu_online(cpu))
              continue;
  
-        cpumask_and(&tmp_mask, vector_allocation_cpumask(cpu),
-                    &cpu_online_map);
+        vec_mask = vector_allocation_cpumask(cpu);
  
          vector = current_vector;
          offset = current_offset;
@@ -535,7 +538,7 @@ next:
              && test_bit(vector, irq_used_vectors) )
              goto next;
  
-        for_each_cpu(new_cpu, &tmp_mask)
+        for_each_cpu(new_cpu, vec_mask)
              if (per_cpu(vector_irq, new_cpu)[vector] >= 0)
                  goto next;
          /* Found one! */
@@ -554,12 +557,12 @@ next:
                  release_old_vec(desc);
          }
  
-        trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
+        trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, vec_mask);
  
-        for_each_cpu(new_cpu, &tmp_mask)
+        for_each_cpu(new_cpu, vec_mask)
              per_cpu(vector_irq, new_cpu)[vector] = irq;
          desc->arch.vector = vector;
-        cpumask_copy(desc->arch.cpu_mask, &tmp_mask);
+        cpumask_copy(desc->arch.cpu_mask, vec_mask);
  
          desc->arch.used = IRQ_USED;
          ASSERT((desc->arch.used_vectors == NULL)
@@ -791,6 +794,7 @@ unsigned int set_desc_affinity(struct ir
  
      cpumask_copy(desc->affinity, mask);
      cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+    cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
  
      return cpu_mask_to_apicid(&dest_mask);
  }
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -32,6 +32,12 @@ struct irq_desc;
  struct arch_irq_desc {
          s16 vector;                  /* vector itself is only 8 bits, */
          s16 old_vector;              /* but we use -1 for unassigned  */
+        /*
+         * Except for high priority interrupts @cpu_mask may have bits set for
+         * offline CPUs.  Consumers need to be careful to mask this down to
+         * online ones as necessary.  There is supposed to always be a non-
+         * empty intersection with cpu_online_map.
+         */
          cpumask_var_t cpu_mask;
          cpumask_var_t old_cpu_mask;
          cpumask_var_t pending_mask;
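
As an aside, the new field comment implies a consumer-side rule:
desc->arch.cpu_mask may include offline (parked) CPUs, so it has to be
reduced to the online subset before deriving an APIC destination, as the
IO-APIC and set_desc_affinity() hunks above do. A minimal sketch of that
pattern (example_destination() is an illustrative name, not part of the
patch):

static unsigned int example_destination(const struct irq_desc *desc)
{
    /* scratch_cpumask is only usable outside IRQ context (see patch 12). */
    cpumask_t *mask = this_cpu(scratch_cpumask);

    /* Mask out parked/offline CPUs; a non-empty result is expected. */
    cpumask_and(mask, desc->arch.cpu_mask, &cpu_online_map);
    ASSERT(!cpumask_empty(mask));

    return cpu_mask_to_apicid(mask);
}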

[Xen-devel] [PATCH v4 05/13] x86/IRQ: fix locking around vector management
Posted by Jan Beulich 4 years, 9 months ago
All of __{assign,bind,clear}_irq_vector() manipulate struct irq_desc
fields, and hence ought to be called with the descriptor lock held in
addition to vector_lock. This is currently the case for only
set_desc_affinity() (in the common case) and destroy_irq(), which also
clarifies what the nesting behavior between the locks has to be.
Reflect the new expectation by having these functions all take a
descriptor as parameter instead of an interrupt number.

Also take care of the two special cases of calls to set_desc_affinity():
set_ioapic_affinity_irq() and VT-d's dma_msi_set_affinity() get called
directly as well, and in these cases the descriptor locks hadn't got
acquired till now. For set_ioapic_affinity_irq() this means acquiring /
releasing of the IO-APIC lock can be plain spin_{,un}lock() then.

Drop one of the two leading underscores from all three functions at
the same time.

There's one case left where descriptors get manipulated with just
vector_lock held: setup_vector_irq() expects its caller to have acquired
vector_lock, and hence can't itself acquire the descriptor locks (wrong
lock order). I don't currently see how to address this.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com> [VT-d]
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v4: Adjust comment ahead of setup_vector_irq().
v3: Also drop one leading underscore from a comment. Re-base.
v2: Also adjust set_ioapic_affinity_irq() and VT-d's
     dma_msi_set_affinity().

--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -550,14 +550,14 @@ static void clear_IO_APIC (void)
  static void
  set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask)
  {
-    unsigned long flags;
      unsigned int dest;
      int pin, irq;
      struct irq_pin_list *entry;
  
      irq = desc->irq;
  
-    spin_lock_irqsave(&ioapic_lock, flags);
+    spin_lock(&ioapic_lock);
+
      dest = set_desc_affinity(desc, mask);
      if (dest != BAD_APICID) {
          if ( !x2apic_enabled )
@@ -580,8 +580,8 @@ set_ioapic_affinity_irq(struct irq_desc
              entry = irq_2_pin + entry->next;
          }
      }
-    spin_unlock_irqrestore(&ioapic_lock, flags);
  
+    spin_unlock(&ioapic_lock);
  }
  
  /*
@@ -674,16 +674,19 @@ void /*__init*/ setup_ioapic_dest(void)
      for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
          for (pin = 0; pin < nr_ioapic_entries[ioapic]; pin++) {
              struct irq_desc *desc;
+            unsigned long flags;
  
              irq_entry = find_irq_entry(ioapic, pin, mp_INT);
              if (irq_entry == -1)
                  continue;
              irq = pin_2_irq(irq_entry, ioapic, pin);
              desc = irq_to_desc(irq);
+
+            spin_lock_irqsave(&desc->lock, flags);
              BUG_ON(!cpumask_intersects(desc->arch.cpu_mask, &cpu_online_map));
              set_ioapic_affinity_irq(desc, desc->arch.cpu_mask);
+            spin_unlock_irqrestore(&desc->lock, flags);
          }
-
      }
  }
  
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -27,6 +27,7 @@
  #include <public/physdev.h>
  
  static int parse_irq_vector_map_param(const char *s);
+static void _clear_irq_vector(struct irq_desc *desc);
  
  /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */
  bool __read_mostly opt_noirqbalance;
@@ -143,13 +144,12 @@ static void trace_irq_mask(uint32_t even
          _trace_irq_mask(event, irq, vector, mask);
  }
  
-static int __init __bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask)
+static int __init _bind_irq_vector(struct irq_desc *desc, int vector,
+                                   const cpumask_t *cpu_mask)
  {
      cpumask_t online_mask;
      int cpu;
-    struct irq_desc *desc = irq_to_desc(irq);
  
-    BUG_ON((unsigned)irq >= nr_irqs);
      BUG_ON((unsigned)vector >= NR_VECTORS);
  
      cpumask_and(&online_mask, cpu_mask, &cpu_online_map);
@@ -160,9 +160,9 @@ static int __init __bind_irq_vector(int
          return 0;
      if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED )
          return -EBUSY;
-    trace_irq_mask(TRC_HW_IRQ_BIND_VECTOR, irq, vector, &online_mask);
+    trace_irq_mask(TRC_HW_IRQ_BIND_VECTOR, desc->irq, vector, &online_mask);
      for_each_cpu(cpu, &online_mask)
-        per_cpu(vector_irq, cpu)[vector] = irq;
+        per_cpu(vector_irq, cpu)[vector] = desc->irq;
      desc->arch.vector = vector;
      cpumask_copy(desc->arch.cpu_mask, &online_mask);
      if ( desc->arch.used_vectors )
@@ -176,12 +176,18 @@ static int __init __bind_irq_vector(int
  
  int __init bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask)
  {
+    struct irq_desc *desc = irq_to_desc(irq);
      unsigned long flags;
      int ret;
  
-    spin_lock_irqsave(&vector_lock, flags);
-    ret = __bind_irq_vector(irq, vector, cpu_mask);
-    spin_unlock_irqrestore(&vector_lock, flags);
+    BUG_ON((unsigned)irq >= nr_irqs);
+
+    spin_lock_irqsave(&desc->lock, flags);
+    spin_lock(&vector_lock);
+    ret = _bind_irq_vector(desc, vector, cpu_mask);
+    spin_unlock(&vector_lock);
+    spin_unlock_irqrestore(&desc->lock, flags);
+
      return ret;
  }
  
@@ -266,18 +272,20 @@ void destroy_irq(unsigned int irq)
  
      spin_lock_irqsave(&desc->lock, flags);
      desc->handler = &no_irq_type;
-    clear_irq_vector(irq);
+    spin_lock(&vector_lock);
+    _clear_irq_vector(desc);
+    spin_unlock(&vector_lock);
      desc->arch.used_vectors = NULL;
      spin_unlock_irqrestore(&desc->lock, flags);
  
      xfree(action);
  }
  
-static void __clear_irq_vector(int irq)
+static void _clear_irq_vector(struct irq_desc *desc)
  {
-    int cpu, vector, old_vector;
+    unsigned int cpu;
+    int vector, old_vector, irq = desc->irq;
      cpumask_t tmp_mask;
-    struct irq_desc *desc = irq_to_desc(irq);
  
      BUG_ON(!desc->arch.vector);
  
@@ -323,11 +331,14 @@ static void __clear_irq_vector(int irq)
  
  void clear_irq_vector(int irq)
  {
+    struct irq_desc *desc = irq_to_desc(irq);
      unsigned long flags;
  
-    spin_lock_irqsave(&vector_lock, flags);
-    __clear_irq_vector(irq);
-    spin_unlock_irqrestore(&vector_lock, flags);
+    spin_lock_irqsave(&desc->lock, flags);
+    spin_lock(&vector_lock);
+    _clear_irq_vector(desc);
+    spin_unlock(&vector_lock);
+    spin_unlock_irqrestore(&desc->lock, flags);
  }
  
  int irq_to_vector(int irq)
@@ -462,8 +473,7 @@ static vmask_t *irq_get_used_vector_mask
      return ret;
  }
  
-static int __assign_irq_vector(
-    int irq, struct irq_desc *desc, const cpumask_t *mask)
+static int _assign_irq_vector(struct irq_desc *desc, const cpumask_t *mask)
  {
      /*
       * NOTE! The local APIC isn't very good at handling
@@ -477,7 +487,8 @@ static int __assign_irq_vector(
       * 0x80, because int 0x80 is hm, kind of importantish. ;)
       */
      static int current_vector = FIRST_DYNAMIC_VECTOR, current_offset = 0;
-    int cpu, err, old_vector;
+    unsigned int cpu;
+    int err, old_vector, irq = desc->irq;
      vmask_t *irq_used_vectors = NULL;
  
      old_vector = irq_to_vector(irq);
@@ -590,8 +601,12 @@ int assign_irq_vector(int irq, const cpu
      
      BUG_ON(irq >= nr_irqs || irq <0);
  
-    spin_lock_irqsave(&vector_lock, flags);
-    ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
+    spin_lock_irqsave(&desc->lock, flags);
+
+    spin_lock(&vector_lock);
+    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
+    spin_unlock(&vector_lock);
+
      if ( !ret )
      {
          ret = desc->arch.vector;
@@ -600,14 +615,16 @@ int assign_irq_vector(int irq, const cpu
          else
              cpumask_setall(desc->affinity);
      }
-    spin_unlock_irqrestore(&vector_lock, flags);
+
+    spin_unlock_irqrestore(&desc->lock, flags);
  
      return ret;
  }
  
  /*
   * Initialize vector_irq on a new cpu. This function must be called
- * with vector_lock held.
+ * with vector_lock held.  For this reason it may not itself acquire
+ * the IRQ descriptor locks, as lock nesting is the other way around.
   */
  void setup_vector_irq(unsigned int cpu)
  {
@@ -775,7 +792,6 @@ void irq_complete_move(struct irq_desc *
  
  unsigned int set_desc_affinity(struct irq_desc *desc, const cpumask_t *mask)
  {
-    unsigned int irq;
      int ret;
      unsigned long flags;
      cpumask_t dest_mask;
@@ -783,10 +799,8 @@ unsigned int set_desc_affinity(struct ir
      if (!cpumask_intersects(mask, &cpu_online_map))
          return BAD_APICID;
  
-    irq = desc->irq;
-
      spin_lock_irqsave(&vector_lock, flags);
-    ret = __assign_irq_vector(irq, desc, mask);
+    ret = _assign_irq_vector(desc, mask);
      spin_unlock_irqrestore(&vector_lock, flags);
  
      if (ret < 0)
@@ -2453,7 +2467,7 @@ void fixup_irqs(const cpumask_t *mask, b
  
          /*
           * In order for the affinity adjustment below to be successful, we
-         * need __assign_irq_vector() to succeed. This in particular means
+         * need _assign_irq_vector() to succeed. This in particular means
           * clearing desc->arch.move_in_progress if this would otherwise
           * prevent the function from succeeding. Since there's no way for the
           * flag to get cleared anymore when there's no possible destination
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2134,11 +2134,16 @@ static void adjust_irq_affinity(struct a
      unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
                               : NUMA_NO_NODE;
      const cpumask_t *cpumask = &cpu_online_map;
+    struct irq_desc *desc;
  
      if ( node < MAX_NUMNODES && node_online(node) &&
           cpumask_intersects(&node_to_cpumask(node), cpumask) )
          cpumask = &node_to_cpumask(node);
-    dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask);
+
+    desc = irq_to_desc(drhd->iommu->msi.irq);
+    spin_lock_irq(&desc->lock);
+    dma_msi_set_affinity(desc, cpumask);
+    spin_unlock_irq(&desc->lock);
  }
  
  static int adjust_vtd_irq_affinities(void)
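
To make the nesting explicit: after this change the per-IRQ descriptor lock
is the outer lock and vector_lock the inner one, as in destroy_irq(),
bind_irq_vector() and clear_irq_vector() above. A minimal sketch of the
convention (example_vector_update() is an illustrative name, not part of the
patch):

static void example_vector_update(struct irq_desc *desc)
{
    unsigned long flags;

    spin_lock_irqsave(&desc->lock, flags);  /* outer: per-IRQ descriptor */
    spin_lock(&vector_lock);                /* inner: global vector state */

    /* ... manipulate desc->arch.vector / per-CPU vector_irq[] here ... */

    spin_unlock(&vector_lock);
    spin_unlock_irqrestore(&desc->lock, flags);
}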

[Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Jan Beulich 4 years, 9 months ago
In line with "x86/IRQ: desc->affinity should strictly represent the
requested value" the internally used IRQ(s) also shouldn't be restricted
to online ones. Make set_desc_affinity() (set_msi_affinity() then does
by implication) cope with a NULL mask being passed (just like
assign_irq_vector() does), and have IOMMU code pass NULL instead of
&cpu_online_map (when, for VT-d, there's no NUMA node information
available).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -796,18 +796,26 @@ unsigned int set_desc_affinity(struct ir
      unsigned long flags;
      cpumask_t dest_mask;
  
-    if (!cpumask_intersects(mask, &cpu_online_map))
+    if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
          return BAD_APICID;
  
      spin_lock_irqsave(&vector_lock, flags);
-    ret = _assign_irq_vector(desc, mask);
+    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
      spin_unlock_irqrestore(&vector_lock, flags);
  
-    if (ret < 0)
+    if ( ret < 0 )
          return BAD_APICID;
  
-    cpumask_copy(desc->affinity, mask);
-    cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+    if ( mask )
+    {
+        cpumask_copy(desc->affinity, mask);
+        cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
+    }
+    else
+    {
+        cpumask_setall(desc->affinity);
+        cpumask_copy(&dest_mask, desc->arch.cpu_mask);
+    }
      cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
  
      return cpu_mask_to_apicid(&dest_mask);
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -888,7 +888,7 @@ static void enable_iommu(struct amd_iomm
  
      desc = irq_to_desc(iommu->msi.irq);
      spin_lock(&desc->lock);
-    set_msi_affinity(desc, &cpu_online_map);
+    set_msi_affinity(desc, NULL);
      spin_unlock(&desc->lock);
  
      amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2133,11 +2133,11 @@ static void adjust_irq_affinity(struct a
      const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
      unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
                               : NUMA_NO_NODE;
-    const cpumask_t *cpumask = &cpu_online_map;
+    const cpumask_t *cpumask = NULL;
      struct irq_desc *desc;
  
      if ( node < MAX_NUMNODES && node_online(node) &&
-         cpumask_intersects(&node_to_cpumask(node), cpumask) )
+         cpumask_intersects(&node_to_cpumask(node), &cpu_online_map) )
          cpumask = &node_to_cpumask(node);
  
      desc = irq_to_desc(drhd->iommu->msi.irq);
Re: [Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Roger Pau Monné 4 years, 9 months ago
On Tue, Jul 16, 2019 at 07:40:57AM +0000, Jan Beulich wrote:
> In line with "x86/IRQ: desc->affinity should strictly represent the
> requested value" the internally used IRQ(s) also shouldn't be restricted
> to online ones. Make set_desc_affinity() (set_msi_affinity() then does
> by implication) cope with a NULL mask being passed (just like
> assign_irq_vector() does), and have IOMMU code pass NULL instead of
> &cpu_online_map (when, for VT-d, there's no NUMA node information
> available).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

LGTM, just one patch style comment and one code comment:

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

> ---
> v4: New.
> 
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -796,18 +796,26 @@ unsigned int set_desc_affinity(struct ir
>       unsigned long flags;
>       cpumask_t dest_mask;
>   
> -    if (!cpumask_intersects(mask, &cpu_online_map))
> +    if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
>           return BAD_APICID;
>   
>       spin_lock_irqsave(&vector_lock, flags);
> -    ret = _assign_irq_vector(desc, mask);
> +    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
>       spin_unlock_irqrestore(&vector_lock, flags);

I think the patch is somehow mangled at least on my end, there's one
prepended extra space in the non-modified lines AFAICT.

>   
> -    if (ret < 0)
> +    if ( ret < 0 )
>           return BAD_APICID;
>   
> -    cpumask_copy(desc->affinity, mask);

AFAICT you could also avoid the if and just do the same as in the
assign_irq_vector call above and pass TARGET_CPUS if mask is NULL?

Thanks, Roger.

Re: [Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Jan Beulich 4 years, 9 months ago
On 16.07.2019 11:12, Roger Pau Monné  wrote:
> On Tue, Jul 16, 2019 at 07:40:57AM +0000, Jan Beulich wrote:
>> In line with "x86/IRQ: desc->affinity should strictly represent the
>> requested value" the internally used IRQ(s) also shouldn't be restricted
>> to online ones. Make set_desc_affinity() (set_msi_affinity() then does
>> by implication) cope with a NULL mask being passed (just like
>> assign_irq_vector() does), and have IOMMU code pass NULL instead of
>> &cpu_online_map (when, for VT-d, there's no NUMA node information
>> available).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> LGTM, just one patch style comment and one code comment:
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

>> --- a/xen/arch/x86/irq.c
>> +++ b/xen/arch/x86/irq.c
>> @@ -796,18 +796,26 @@ unsigned int set_desc_affinity(struct ir
>>        unsigned long flags;
>>        cpumask_t dest_mask;
>>    
>> -    if (!cpumask_intersects(mask, &cpu_online_map))
>> +    if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
>>            return BAD_APICID;
>>    
>>        spin_lock_irqsave(&vector_lock, flags);
>> -    ret = _assign_irq_vector(desc, mask);
>> +    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
>>        spin_unlock_irqrestore(&vector_lock, flags);
> 
> I think the patch is somehow mangled at least on my end, there's one
> prepended extra space in the non-modified lines AFAICT.

Well, yes, hence the last sentence in the cover letter and the attached
patches there. It is the mail system (more likely server than client)
over here which causes this issue (everywhere for me).

>>    
>> -    if (ret < 0)
>> +    if ( ret < 0 )
>>            return BAD_APICID;
>>    
>> -    cpumask_copy(desc->affinity, mask);
> 
> AFAICT you could also avoid the if and just do the same as in the
> assign_irq_vector call above and pass TARGET_CPUS if mask is NULL?

Are you talking about the if() in context above, or the one you've
stripped (immediately following the last quoted line of the patch)?
For the one in context I don't see how the rest of your remark is
related. For the other one - no, strictly not, as that would be
against the purpose of this change: We specifically want to _not_
restrict desc->affinity to online CPUs only (yet that's what
TARGET_CPUS resolves to).

Jan
Re: [Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Roger Pau Monné 4 years, 9 months ago
On Tue, Jul 16, 2019 at 10:20:10AM +0000, Jan Beulich wrote:
> On 16.07.2019 11:12, Roger Pau Monné  wrote:
> > On Tue, Jul 16, 2019 at 07:40:57AM +0000, Jan Beulich wrote:
> >> In line with "x86/IRQ: desc->affinity should strictly represent the
> >> requested value" the internally used IRQ(s) also shouldn't be restricted
> >> to online ones. Make set_desc_affinity() (set_msi_affinity() then does
> >> by implication) cope with a NULL mask being passed (just like
> >> assign_irq_vector() does), and have IOMMU code pass NULL instead of
> >> &cpu_online_map (when, for VT-d, there's no NUMA node information
> >> available).
> >>
> >> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> > 
> > LGTM, just one patch style comment and one code comment:
> > 
> > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> Thanks.
> 
> >> --- a/xen/arch/x86/irq.c
> >> +++ b/xen/arch/x86/irq.c
> >> @@ -796,18 +796,26 @@ unsigned int set_desc_affinity(struct ir
> >>        unsigned long flags;
> >>        cpumask_t dest_mask;
> >>    
> >> -    if (!cpumask_intersects(mask, &cpu_online_map))
> >> +    if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
> >>            return BAD_APICID;
> >>    
> >>        spin_lock_irqsave(&vector_lock, flags);
> >> -    ret = _assign_irq_vector(desc, mask);
> >> +    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
> >>        spin_unlock_irqrestore(&vector_lock, flags);
> > 
> > I think the patch is somehow mangled at least on my end, there's one
> > prepended extra space in the non-modified lines AFAICT.
> 
> Well, yes, hence the last sentence in the cover letter and the attached
> patches there. It is the mail system (more likely server than client)
> over here which causes this issue (everywhere for me).

Oh, sorry to hear that. Hope you get that sorted out; I guess it's
causing quite a lot of pain for more people at SUSE as well.

> >>    
> >> -    if (ret < 0)
> >> +    if ( ret < 0 )
> >>            return BAD_APICID;
> >>    
> >> -    cpumask_copy(desc->affinity, mask);
> > 
> > AFAICT you could also avoid the if and just do the same as in the
> > assign_irq_vector call above and pass TARGET_CPUS if mask is NULL?
> 
> Are you talking about the if() in context above, or the one you've
> stripped (immediately following the last quoted line of the patch)?
> For the one in context I don't see how the rest of your remark is
> related. For the other one - no, strictly not, as that would be
> against the purpose of this change: We specifically want to _not_
> restrict desc->affinity to online CPUs only (yet that's what
> TARGET_CPUS resolves to).

Yes, that was my remark - which is wrong as you pointed out. I guess
you could use cpu_possible_map, but anyway the current approach is OK
IMO.

Thanks, Roger.

Re: [Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Andrew Cooper 4 years, 9 months ago
On 16/07/2019 08:40, Jan Beulich wrote:
> In line with "x86/IRQ: desc->affinity should strictly represent the
> requested value" the internally used IRQ(s) also shouldn't be restricted
> to online ones. Make set_desc_affinity() (set_msi_affinity() then does
> by implication) cope with a NULL mask being passed (just like
> assign_irq_vector() does), and have IOMMU code pass NULL instead of
> &cpu_online_map (when, for VT-d, there's no NUMA node information
> available).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Re: [Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Tian, Kevin 4 years, 9 months ago
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: Tuesday, July 16, 2019 3:41 PM
> 
> In line with "x86/IRQ: desc->affinity should strictly represent the
> requested value" the internally used IRQ(s) also shouldn't be restricted
> to online ones. Make set_desc_affinity() (set_msi_affinity() then does
> by implication) cope with a NULL mask being passed (just like
> assign_irq_vector() does), and have IOMMU code pass NULL instead of
> &cpu_online_map (when, for VT-d, there's no NUMA node information
> available).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Re: [Xen-devel] [PATCH v4 06/13] x86/IOMMU: don't restrict IRQ affinities to online CPUs
Posted by Woods, Brian 4 years, 9 months ago
On Tue, Jul 16, 2019 at 07:40:57AM +0000, Jan Beulich wrote:
> In line with "x86/IRQ: desc->affinity should strictly represent the
> requested value" the internally used IRQ(s) also shouldn't be restricted
> to online ones. Make set_desc_affinity() (set_msi_affinity() then does
> by implication) cope with a NULL mask being passed (just like
> assign_irq_vector() does), and have IOMMU code pass NULL instead of
> &cpu_online_map (when, for VT-d, there's no NUMA node information
> available).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Brian Woods <brian.woods@amd.com>

> ---
> v4: New.
> 
> --- a/xen/arch/x86/irq.c
> +++ b/xen/arch/x86/irq.c
> @@ -796,18 +796,26 @@ unsigned int set_desc_affinity(struct ir
>       unsigned long flags;
>       cpumask_t dest_mask;
>   
> -    if (!cpumask_intersects(mask, &cpu_online_map))
> +    if ( mask && !cpumask_intersects(mask, &cpu_online_map) )
>           return BAD_APICID;
>   
>       spin_lock_irqsave(&vector_lock, flags);
> -    ret = _assign_irq_vector(desc, mask);
> +    ret = _assign_irq_vector(desc, mask ?: TARGET_CPUS);
>       spin_unlock_irqrestore(&vector_lock, flags);
>   
> -    if (ret < 0)
> +    if ( ret < 0 )
>           return BAD_APICID;
>   
> -    cpumask_copy(desc->affinity, mask);
> -    cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
> +    if ( mask )
> +    {
> +        cpumask_copy(desc->affinity, mask);
> +        cpumask_and(&dest_mask, mask, desc->arch.cpu_mask);
> +    }
> +    else
> +    {
> +        cpumask_setall(desc->affinity);
> +        cpumask_copy(&dest_mask, desc->arch.cpu_mask);
> +    }
>       cpumask_and(&dest_mask, &dest_mask, &cpu_online_map);
>   
>       return cpu_mask_to_apicid(&dest_mask);
> --- a/xen/drivers/passthrough/amd/iommu_init.c
> +++ b/xen/drivers/passthrough/amd/iommu_init.c
> @@ -888,7 +888,7 @@ static void enable_iommu(struct amd_iomm
>   
>       desc = irq_to_desc(iommu->msi.irq);
>       spin_lock(&desc->lock);
> -    set_msi_affinity(desc, &cpu_online_map);
> +    set_msi_affinity(desc, NULL);
>       spin_unlock(&desc->lock);
>   
>       amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED);
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -2133,11 +2133,11 @@ static void adjust_irq_affinity(struct a
>       const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
>       unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
>                                : NUMA_NO_NODE;
> -    const cpumask_t *cpumask = &cpu_online_map;
> +    const cpumask_t *cpumask = NULL;
>       struct irq_desc *desc;
>   
>       if ( node < MAX_NUMNODES && node_online(node) &&
> -         cpumask_intersects(&node_to_cpumask(node), cpumask) )
> +         cpumask_intersects(&node_to_cpumask(node), &cpu_online_map) )
>           cpumask = &node_to_cpumask(node);
>   
>       desc = irq_to_desc(drhd->iommu->msi.irq);

-- 
Brian Woods

[Xen-devel] [PATCH v4 07/13] x86/IRQ: target online CPUs when binding guest IRQ
Posted by Jan Beulich 4 years, 9 months ago
fixup_irqs() skips interrupts without action. Hence such interrupts can
retain affinity to just offline CPUs. With "noirqbalance" in effect,
pirq_guest_bind() so far would have left them alone, resulting in a non-
working interrupt.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3: New.
---
I've not observed this problem in practice - the change is just the
result of code inspection after having noticed action-less IRQs in 'i'
debug key output pointing at all parked/offline CPUs.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1703,9 +1703,27 @@ int pirq_guest_bind(struct vcpu *v, stru
  
          desc->status |= IRQ_GUEST;
  
-        /* Attempt to bind the interrupt target to the correct CPU. */
-        if ( !opt_noirqbalance && (desc->handler->set_affinity != NULL) )
-            desc->handler->set_affinity(desc, cpumask_of(v->processor));
+        /*
+         * Attempt to bind the interrupt target to the correct (or at least
+         * some online) CPU.
+         */
+        if ( desc->handler->set_affinity )
+        {
+            const cpumask_t *affinity = NULL;
+
+            if ( !opt_noirqbalance )
+                affinity = cpumask_of(v->processor);
+            else if ( !cpumask_intersects(desc->affinity, &cpu_online_map) )
+            {
+                cpumask_setall(desc->affinity);
+                affinity = &cpumask_all;
+            }
+            else if ( !cpumask_intersects(desc->arch.cpu_mask,
+                                          &cpu_online_map) )
+                affinity = desc->affinity;
+            if ( affinity )
+                desc->handler->set_affinity(desc, affinity);
+        }
  
          desc->status &= ~IRQ_DISABLED;
          desc->handler->startup(desc);

[Xen-devel] [PATCH v4 08/13] x86/IRQs: correct/tighten vector check in _clear_irq_vector()
Posted by Jan Beulich 4 years, 9 months ago
If any particular value was to be checked against, it would need to be
IRQ_VECTOR_UNASSIGNED.

Reported-by: Roger Pau Monné <roger.pau@citrix.com>

Be more strict though and use valid_irq_vector() instead.

Take the opportunity and also convert local variables to unsigned int.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v2: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -283,14 +283,13 @@ void destroy_irq(unsigned int irq)
  
  static void _clear_irq_vector(struct irq_desc *desc)
  {
-    unsigned int cpu;
-    int vector, old_vector, irq = desc->irq;
+    unsigned int cpu, old_vector, irq = desc->irq;
+    unsigned int vector = desc->arch.vector;
      cpumask_t tmp_mask;
  
-    BUG_ON(!desc->arch.vector);
+    BUG_ON(!valid_irq_vector(vector));
  
      /* Always clear desc->arch.vector */
-    vector = desc->arch.vector;
      cpumask_and(&tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
  
      for_each_cpu(cpu, &tmp_mask) {
[Xen-devel] [PATCH v4 09/13] x86/IRQ: make fixup_irqs() skip unconnected internally used interrupts
Posted by Jan Beulich 4 years, 9 months ago
Since the "Cannot set affinity ..." warning is a one time one, avoid
triggering it already at boot time when parking secondary threads and
the serial console uses a (still unconnected at that time) PCI IRQ.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -2472,8 +2472,20 @@ void fixup_irqs(const cpumask_t *mask, b
          vector = irq_to_vector(irq);
          if ( vector >= FIRST_HIPRIORITY_VECTOR &&
               vector <= LAST_HIPRIORITY_VECTOR )
+        {
              cpumask_and(desc->arch.cpu_mask, desc->arch.cpu_mask, mask);
  
+            /*
+             * This can in particular happen when parking secondary threads
+             * during boot and when the serial console wants to use a PCI IRQ.
+             */
+            if ( desc->handler == &no_irq_type )
+            {
+                spin_unlock(&desc->lock);
+                continue;
+            }
+        }
+
          if ( desc->arch.move_cleanup_count )
          {
              /* The cleanup IPI may have got sent while we were still online. */

[Xen-devel] [PATCH v4 10/13] x86/IRQ: drop redundant cpumask_empty() from move_masked_irq()
Posted by Jan Beulich 4 years, 9 months ago
The subsequent cpumask_intersects() covers the "empty" case quite fine.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -658,9 +658,6 @@ void move_masked_irq(struct irq_desc *de
      
      desc->status &= ~IRQ_MOVE_PENDING;
  
-    if (unlikely(cpumask_empty(pending_mask)))
-        return;
-
      if (!desc->handler->set_affinity)
          return;
  

[Xen-devel] [PATCH v4 11/13] x86/IRQ: tighten vector checks
Posted by Jan Beulich 4 years, 9 months ago
Use valid_irq_vector() rather than "> 0".

Also replace an open-coded use of IRQ_VECTOR_UNASSIGNED.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -342,7 +342,7 @@ void clear_irq_vector(int irq)
  
  int irq_to_vector(int irq)
  {
-    int vector = -1;
+    int vector = IRQ_VECTOR_UNASSIGNED;
  
      BUG_ON(irq >= nr_irqs || irq < 0);
  
@@ -452,15 +452,18 @@ static vmask_t *irq_get_used_vector_mask
              int vector;
              
              vector = irq_to_vector(irq);
-            if ( vector > 0 )
+            if ( valid_irq_vector(vector) )
              {
-                printk(XENLOG_INFO "IRQ %d already assigned vector %d\n",
+                printk(XENLOG_INFO "IRQ%d already assigned vector %02x\n",
                         irq, vector);
                  
                  ASSERT(!test_bit(vector, ret));
  
                  set_bit(vector, ret);
              }
+            else if ( vector != IRQ_VECTOR_UNASSIGNED )
+                printk(XENLOG_WARNING "IRQ%d mapped to bogus vector %02x\n",
+                       irq, vector);
          }
      }
      else if ( IO_APIC_IRQ(irq) &&
@@ -491,7 +494,7 @@ static int _assign_irq_vector(struct irq
      vmask_t *irq_used_vectors = NULL;
  
      old_vector = irq_to_vector(irq);
-    if ( old_vector > 0 )
+    if ( valid_irq_vector(old_vector) )
      {
          cpumask_t tmp_mask;
  
@@ -555,7 +558,7 @@ next:
          current_vector = vector;
          current_offset = offset;
  
-        if ( old_vector > 0 )
+        if ( valid_irq_vector(old_vector) )
          {
              cpumask_and(desc->arch.old_cpu_mask, desc->arch.cpu_mask,
                          &cpu_online_map);
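
As an aside, the tightened checks distinguish three states a stored vector
value can be in. A minimal sketch under the definitions introduced in patch
01 (example_vector_is_bogus() is an illustrative name, not part of the
patch):

static bool example_vector_is_bogus(int vector)
{
    if ( valid_irq_vector(vector) )
        return false;   /* in [FIRST_DYNAMIC_VECTOR, LAST_HIPRIORITY_VECTOR] */
    if ( vector == IRQ_VECTOR_UNASSIGNED )
        return false;   /* legitimately unassigned */
    return true;        /* anything else is what the new warning reports */
}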

[Xen-devel] [PATCH v4 12/13] x86/IRQ: eliminate some on-stack cpumask_t instances
Posted by Jan Beulich 4 years, 9 months ago
Use scratch_cpumask where possible, to avoid creating these possibly
large stack objects. We can't use it in _assign_irq_vector() and
set_desc_affinity(), as these get called in IRQ context.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v4: Re-base over changes earlier in the series.
v3: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -285,14 +285,15 @@ static void _clear_irq_vector(struct irq
  {
      unsigned int cpu, old_vector, irq = desc->irq;
      unsigned int vector = desc->arch.vector;
-    cpumask_t tmp_mask;
+    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
  
      BUG_ON(!valid_irq_vector(vector));
  
      /* Always clear desc->arch.vector */
-    cpumask_and(&tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
+    cpumask_and(tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
  
-    for_each_cpu(cpu, &tmp_mask) {
+    for_each_cpu(cpu, tmp_mask)
+    {
          ASSERT( per_cpu(vector_irq, cpu)[vector] == irq );
          per_cpu(vector_irq, cpu)[vector] = ~irq;
      }
@@ -308,16 +309,17 @@ static void _clear_irq_vector(struct irq
  
      desc->arch.used = IRQ_UNUSED;
  
-    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, &tmp_mask);
+    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
  
      if ( likely(!desc->arch.move_in_progress) )
          return;
  
      /* If we were in motion, also clear desc->arch.old_vector */
      old_vector = desc->arch.old_vector;
-    cpumask_and(&tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
+    cpumask_and(tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
  
-    for_each_cpu(cpu, &tmp_mask) {
+    for_each_cpu(cpu, tmp_mask)
+    {
          ASSERT( per_cpu(vector_irq, cpu)[old_vector] == irq );
          TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu);
          per_cpu(vector_irq, cpu)[old_vector] = ~irq;
@@ -1169,7 +1171,6 @@ static void irq_guest_eoi_timer_fn(void
      struct irq_desc *desc = data;
      unsigned int i, irq = desc - irq_desc;
      irq_guest_action_t *action;
-    cpumask_t cpu_eoi_map;
  
      spin_lock_irq(&desc->lock);
      
@@ -1206,14 +1207,18 @@ static void irq_guest_eoi_timer_fn(void
  
      switch ( action->ack_type )
      {
+        cpumask_t *cpu_eoi_map;
+
      case ACKTYPE_UNMASK:
          if ( desc->handler->end )
              desc->handler->end(desc, 0);
          break;
+
      case ACKTYPE_EOI:
-        cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map);
+        cpu_eoi_map = this_cpu(scratch_cpumask);
+        cpumask_copy(cpu_eoi_map, action->cpu_eoi_map);
          spin_unlock_irq(&desc->lock);
-        on_selected_cpus(&cpu_eoi_map, set_eoi_ready, desc, 0);
+        on_selected_cpus(cpu_eoi_map, set_eoi_ready, desc, 0);
          return;
      }
  
@@ -2458,7 +2463,7 @@ void fixup_irqs(const cpumask_t *mask, b
      {
          bool break_affinity = false, set_affinity = true;
          unsigned int vector;
-        cpumask_t affinity;
+        cpumask_t *affinity = this_cpu(scratch_cpumask);
  
          if ( irq == 2 )
              continue;
@@ -2489,9 +2494,9 @@ void fixup_irqs(const cpumask_t *mask, b
          if ( desc->arch.move_cleanup_count )
          {
              /* The cleanup IPI may have got sent while we were still online. */
-            cpumask_andnot(&affinity, desc->arch.old_cpu_mask,
+            cpumask_andnot(affinity, desc->arch.old_cpu_mask,
                             &cpu_online_map);
-            desc->arch.move_cleanup_count -= cpumask_weight(&affinity);
+            desc->arch.move_cleanup_count -= cpumask_weight(affinity);
              if ( !desc->arch.move_cleanup_count )
                  release_old_vec(desc);
          }
@@ -2518,10 +2523,10 @@ void fixup_irqs(const cpumask_t *mask, b
          {
              unsigned int cpu;
  
-            cpumask_and(&affinity, desc->arch.old_cpu_mask, &cpu_online_map);
+            cpumask_and(affinity, desc->arch.old_cpu_mask, &cpu_online_map);
  
              spin_lock(&vector_lock);
-            for_each_cpu(cpu, &affinity)
+            for_each_cpu(cpu, affinity)
                  per_cpu(vector_irq, cpu)[desc->arch.old_vector] = ~irq;
              spin_unlock(&vector_lock);
  
@@ -2532,23 +2537,23 @@ void fixup_irqs(const cpumask_t *mask, b
          if ( !cpumask_intersects(mask, desc->affinity) )
          {
              break_affinity = true;
-            cpumask_setall(&affinity);
+            cpumask_setall(affinity);
          }
          else
-            cpumask_copy(&affinity, desc->affinity);
+            cpumask_copy(affinity, desc->affinity);
  
          if ( desc->handler->disable )
              desc->handler->disable(desc);
  
          if ( desc->handler->set_affinity )
-            desc->handler->set_affinity(desc, &affinity);
+            desc->handler->set_affinity(desc, affinity);
          else if ( !(warned++) )
              set_affinity = false;
  
          if ( desc->handler->enable )
              desc->handler->enable(desc);
  
-        cpumask_copy(&affinity, desc->affinity);
+        cpumask_copy(affinity, desc->affinity);
  
          spin_unlock(&desc->lock);
  
@@ -2559,7 +2564,7 @@ void fixup_irqs(const cpumask_t *mask, b
              printk("Cannot set affinity for IRQ%u\n", irq);
          else if ( break_affinity )
              printk("Broke affinity for IRQ%u, new: %*pb\n",
-                   irq, nr_cpu_ids, cpumask_bits(&affinity));
+                   irq, nr_cpu_ids, cpumask_bits(affinity));
      }
  
      /* That doesn't seem sufficient.  Give it 1ms. */

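For context, a rough standalone model of the trade-off (NR_CPUS and the mask
layout below are illustrative, not the hypervisor's): with a few thousand
possible CPUs a cpumask_t is hundreds of bytes, which is why on-stack
instances are worth avoiding, while a per-CPU scratch mask costs the call
site nothing; the price is that it cannot be used where a nested IRQ-context
caller could still be using it.

    /* Rough size model only; NR_CPUS and the layout are illustrative. */
    #include <limits.h>
    #include <stdio.h>

    #define NR_CPUS 4096   /* hypothetical build-time limit */
    #define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))
    #define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

    /* Stand-in for cpumask_t: one bit per possible CPU. */
    typedef struct { unsigned long bits[BITS_TO_LONGS(NR_CPUS)]; } cpumask_t;

    int main(void)
    {
        /* Every function declaring an on-stack instance pays this per frame. */
        printf("sizeof(cpumask_t) = %zu bytes\n", sizeof(cpumask_t));

        /* A per-CPU scratch mask (modelled here as a static) is allocated once
         * and merely borrowed, so the call site adds nothing to its stack;
         * the caveat is that a nested user in IRQ context would clobber it. */
        static cpumask_t scratch;
        scratch.bits[0] = 1;
        printf("scratch mask lives at %p\n", (void *)&scratch);
        return 0;
    }
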
[Xen-devel] [PATCH v4 13/13] x86/IRQ: move {,_}clear_irq_vector()
Posted by Jan Beulich 4 years, 9 months ago
This is largely to drop a forward declaration. There's one functional
change: clear_irq_vector() gets marked __init, as its only caller is
check_timer(). Beyond this only a few stray blanks get removed.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3: New.

--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -27,7 +27,6 @@
  #include <public/physdev.h>
  
  static int parse_irq_vector_map_param(const char *s);
-static void _clear_irq_vector(struct irq_desc *desc);
  
  /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */
  bool __read_mostly opt_noirqbalance;
@@ -191,6 +190,67 @@ int __init bind_irq_vector(int irq, int
      return ret;
  }
  
+static void _clear_irq_vector(struct irq_desc *desc)
+{
+    unsigned int cpu, old_vector, irq = desc->irq;
+    unsigned int vector = desc->arch.vector;
+    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
+
+    BUG_ON(!valid_irq_vector(vector));
+
+    /* Always clear desc->arch.vector */
+    cpumask_and(tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
+
+    for_each_cpu(cpu, tmp_mask)
+    {
+        ASSERT(per_cpu(vector_irq, cpu)[vector] == irq);
+        per_cpu(vector_irq, cpu)[vector] = ~irq;
+    }
+
+    desc->arch.vector = IRQ_VECTOR_UNASSIGNED;
+    cpumask_clear(desc->arch.cpu_mask);
+
+    if ( desc->arch.used_vectors )
+    {
+        ASSERT(test_bit(vector, desc->arch.used_vectors));
+        clear_bit(vector, desc->arch.used_vectors);
+    }
+
+    desc->arch.used = IRQ_UNUSED;
+
+    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
+
+    if ( likely(!desc->arch.move_in_progress) )
+        return;
+
+    /* If we were in motion, also clear desc->arch.old_vector */
+    old_vector = desc->arch.old_vector;
+    cpumask_and(tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
+
+    for_each_cpu(cpu, tmp_mask)
+    {
+        ASSERT(per_cpu(vector_irq, cpu)[old_vector] == irq);
+        TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu);
+        per_cpu(vector_irq, cpu)[old_vector] = ~irq;
+    }
+
+    release_old_vec(desc);
+
+    desc->arch.move_in_progress = 0;
+}
+
+void __init clear_irq_vector(int irq)
+{
+    struct irq_desc *desc = irq_to_desc(irq);
+    unsigned long flags;
+
+    spin_lock_irqsave(&desc->lock, flags);
+    spin_lock(&vector_lock);
+    _clear_irq_vector(desc);
+    spin_unlock(&vector_lock);
+    spin_unlock_irqrestore(&desc->lock, flags);
+}
+
  /*
   * Dynamic irq allocate and deallocation for MSI
   */
@@ -281,67 +341,6 @@ void destroy_irq(unsigned int irq)
      xfree(action);
  }
  
-static void _clear_irq_vector(struct irq_desc *desc)
-{
-    unsigned int cpu, old_vector, irq = desc->irq;
-    unsigned int vector = desc->arch.vector;
-    cpumask_t *tmp_mask = this_cpu(scratch_cpumask);
-
-    BUG_ON(!valid_irq_vector(vector));
-
-    /* Always clear desc->arch.vector */
-    cpumask_and(tmp_mask, desc->arch.cpu_mask, &cpu_online_map);
-
-    for_each_cpu(cpu, tmp_mask)
-    {
-        ASSERT( per_cpu(vector_irq, cpu)[vector] == irq );
-        per_cpu(vector_irq, cpu)[vector] = ~irq;
-    }
-
-    desc->arch.vector = IRQ_VECTOR_UNASSIGNED;
-    cpumask_clear(desc->arch.cpu_mask);
-
-    if ( desc->arch.used_vectors )
-    {
-        ASSERT(test_bit(vector, desc->arch.used_vectors));
-        clear_bit(vector, desc->arch.used_vectors);
-    }
-
-    desc->arch.used = IRQ_UNUSED;
-
-    trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, tmp_mask);
-
-    if ( likely(!desc->arch.move_in_progress) )
-        return;
-
-    /* If we were in motion, also clear desc->arch.old_vector */
-    old_vector = desc->arch.old_vector;
-    cpumask_and(tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map);
-
-    for_each_cpu(cpu, tmp_mask)
-    {
-        ASSERT( per_cpu(vector_irq, cpu)[old_vector] == irq );
-        TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu);
-        per_cpu(vector_irq, cpu)[old_vector] = ~irq;
-    }
-
-    release_old_vec(desc);
-
-    desc->arch.move_in_progress = 0;
-}
-
-void clear_irq_vector(int irq)
-{
-    struct irq_desc *desc = irq_to_desc(irq);
-    unsigned long flags;
-
-    spin_lock_irqsave(&desc->lock, flags);
-    spin_lock(&vector_lock);
-    _clear_irq_vector(desc);
-    spin_unlock(&vector_lock);
-    spin_unlock_irqrestore(&desc->lock, flags);
-}
-
  int irq_to_vector(int irq)
  {
      int vector = IRQ_VECTOR_UNASSIGNED;

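Purely as illustration of the reordering (generic C, not the real file
layout): once the definition of the static helper precedes its only in-file
users, the forward declaration becomes unnecessary, and the __init annotation
on the wrapper is valid precisely because every caller runs during boot-time
initialisation.

    /* Generic sketch of "define before use", unrelated to the real layout. */
    #include <stdio.h>

    /* Definition now sits above its callers, so no forward declaration. */
    static void _do_clear(int irq)
    {
        printf("clearing vector state for IRQ%d\n", irq);
    }

    /* In the hypervisor the wrapper additionally carries __init, placing it in
     * a section that is discarded after boot; that is only correct because its
     * sole caller (check_timer(), per the description) runs at init time. */
    void clear_wrapper(int irq)
    {
        _do_clear(irq);
    }

    int main(void)
    {
        clear_wrapper(3);
        return 0;
    }
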