[v4] evtchn: (not so) recent XSAs follow-on

[PATCH v4 00/10] evtchn: (not so) recent XSAs follow-on

Posted by Jan Beulich 5 years, 1 month ago

These are grouped into a series largely because of their origin,
not so much because there are heavy dependencies among them.
Compared to v3, there are several a new patches (in particular
the simultaneous locking of two domains' event locks gets done
away with) and movement of a controversial one to the end. See
also the individual patches.

1: use per-channel lock where possible
2: bind-interdomain doesn't need to hold both domains' event locks
3: convert domain event lock to an r/w one
4: don't call Xen consumer callback with per-channel lock held
5: closing of vIRQ-s doesn't require looping over all vCPU-s
6: slightly defer lock acquire where possible
7: add helper for port_is_valid() + evtchn_from_port()
8: closing of ports doesn't need to hold both domains' event locks
9: type adjustments
10: drop acquiring of per-channel lock from send_guest_{global,vcpu}_virq()

Jan

[PATCH v4 01/10] evtchn: use per-channel lock where possible

Posted by Jan Beulich 5 years, 1 month ago

Neither evtchn_status() nor domain_dump_evtchn_info() nor
flask_get_peer_sid() need to hold the per-domain lock - they all only
read a single channel's state (at a time, in the dump case).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -968,15 +968,16 @@ int evtchn_status(evtchn_status_t *statu
     if ( d == NULL )
         return -ESRCH;
 
-    spin_lock(&d->event_lock);
-
     if ( !port_is_valid(d, port) )
     {
-        rc = -EINVAL;
-        goto out;
+        rcu_unlock_domain(d);
+        return -EINVAL;
     }
 
     chn = evtchn_from_port(d, port);
+
+    evtchn_read_lock(chn);
+
     if ( consumer_is_xen(chn) )
     {
         rc = -EACCES;
@@ -1021,7 +1022,7 @@ int evtchn_status(evtchn_status_t *statu
     status->vcpu = chn->notify_vcpu_id;
 
  out:
-    spin_unlock(&d->event_lock);
+    evtchn_read_unlock(chn);
     rcu_unlock_domain(d);
 
     return rc;
@@ -1576,22 +1577,32 @@ void evtchn_move_pirqs(struct vcpu *v)
 static void domain_dump_evtchn_info(struct domain *d)
 {
     unsigned int port;
-    int irq;
 
     printk("Event channel information for domain %d:\n"
            "Polling vCPUs: {%*pbl}\n"
            "    port [p/m/s]\n", d->domain_id, d->max_vcpus, d->poll_mask);
 
-    spin_lock(&d->event_lock);
-
     for ( port = 1; port_is_valid(d, port); ++port )
     {
-        const struct evtchn *chn;
+        struct evtchn *chn;
         char *ssid;
 
+        if ( !(port & 0x3f) )
+            process_pending_softirqs();
+
         chn = evtchn_from_port(d, port);
+
+        if ( !evtchn_read_trylock(chn) )
+        {
+            printk("    %4u in flux\n", port);
+            continue;
+        }
+
         if ( chn->state == ECS_FREE )
+        {
+            evtchn_read_unlock(chn);
             continue;
+        }
 
         printk("    %4u [%d/%d/",
                port,
@@ -1601,26 +1612,49 @@ static void domain_dump_evtchn_info(stru
         printk("]: s=%d n=%d x=%d",
                chn->state, chn->notify_vcpu_id, chn->xen_consumer);
 
+        ssid = xsm_show_security_evtchn(d, chn);
+
         switch ( chn->state )
         {
         case ECS_UNBOUND:
             printk(" d=%d", chn->u.unbound.remote_domid);
             break;
+
         case ECS_INTERDOMAIN:
             printk(" d=%d p=%d",
                    chn->u.interdomain.remote_dom->domain_id,
                    chn->u.interdomain.remote_port);
             break;
-        case ECS_PIRQ:
-            irq = domain_pirq_to_irq(d, chn->u.pirq.irq);
-            printk(" p=%d i=%d", chn->u.pirq.irq, irq);
+
+        case ECS_PIRQ: {
+            unsigned int pirq = chn->u.pirq.irq;
+
+            /*
+             * The per-channel locks nest inside the per-domain one, so we
+             * can't acquire the latter without first letting go of the former.
+             */
+            evtchn_read_unlock(chn);
+            chn = NULL;
+            if ( spin_trylock(&d->event_lock) )
+            {
+                int irq = domain_pirq_to_irq(d, pirq);
+
+                spin_unlock(&d->event_lock);
+                printk(" p=%u i=%d", pirq, irq);
+            }
+            else
+                printk(" p=%u i=?", pirq);
             break;
+        }
+
         case ECS_VIRQ:
             printk(" v=%d", chn->u.virq);
             break;
         }
 
-        ssid = xsm_show_security_evtchn(d, chn);
+        if ( chn )
+            evtchn_read_unlock(chn);
+
         if (ssid) {
             printk(" Z=%s\n", ssid);
             xfree(ssid);
@@ -1628,8 +1662,6 @@ static void domain_dump_evtchn_info(stru
             printk("\n");
         }
     }
-
-    spin_unlock(&d->event_lock);
 }
 
 static void dump_evtchn_info(unsigned char key)
--- a/xen/xsm/flask/flask_op.c
+++ b/xen/xsm/flask/flask_op.c
@@ -555,12 +555,13 @@ static int flask_get_peer_sid(struct xen
     struct evtchn *chn;
     struct domain_security_struct *dsec;
 
-    spin_lock(&d->event_lock);
-
     if ( !port_is_valid(d, arg->evtchn) )
-        goto out;
+        return -EINVAL;
 
     chn = evtchn_from_port(d, arg->evtchn);
+
+    evtchn_read_lock(chn);
+
     if ( chn->state != ECS_INTERDOMAIN )
         goto out;
 
@@ -573,7 +574,7 @@ static int flask_get_peer_sid(struct xen
     rv = 0;
 
  out:
-    spin_unlock(&d->event_lock);
+    evtchn_read_unlock(chn);
     return rv;
 }

Re: [PATCH v4 01/10] evtchn: use per-channel lock where possible

Posted by Julien Grall 5 years, 1 month ago

Hi Jan,

On 05/01/2021 13:09, Jan Beulich wrote:
> Neither evtchn_status() nor domain_dump_evtchn_info() nor
> flask_get_peer_sid() need to hold the per-domain lock - they all only
> read a single channel's state (at a time, in the dump case).
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v4: New.
> 
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -968,15 +968,16 @@ int evtchn_status(evtchn_status_t *statu
>       if ( d == NULL )
>           return -ESRCH;
>   
> -    spin_lock(&d->event_lock);
> -
>       if ( !port_is_valid(d, port) )

There is one issue that is now becoming more apparent. To be clear, the 
problem is not in this patch, but I think it is the best place to 
discuss it as d->event_lock may be part of the solution.

After XSA-344, evtchn_destroy() will end up to decrement d->valid_evtchns.

Given that evtchn_status() can work on the non-current domain, it would 
be possible to run it concurrently with evtchn_destroy(). As a 
consequence, port_is_valid() will be unstable as a valid event channel 
may turn invalid.

AFAICT, we are getting away so far, as the memory is not freed until the 
domain is fully destroyed. However, we re-introduced XSA-338 in a 
different way.

To be clear this is not the fault of this patch. But I don't think this 
is sane to re-introduce a behavior that lead us to an XSA.

I can see two solutions:
   1) Use d->event_lock to protect port_is_valid() when d != 
current->domain. This would require evtchn_destroy() to grab the lock 
when updating d->valid_evtchns.
   2) Never decrement d->valid_evtchns and use a different field for 
closing ports

I am not a big fan of 1) because this is muddying the already complex 
locking situation in the event channel code. But I suggested it because 
I wasn't sure whether you would be happy with 2).

If you are happy with 2), then the lock can be dropped here. I would be 
happy to send the patch for either 1) or 2) depending on the agreement here.

Cheers,


>       {
> -        rc = -EINVAL;
> -        goto out;
> +        rcu_unlock_domain(d);
> +        return -EINVAL;
>       }
>   
>       chn = evtchn_from_port(d, port);
> +
> +    evtchn_read_lock(chn);
> +
>       if ( consumer_is_xen(chn) )
>       {
>           rc = -EACCES;
> @@ -1021,7 +1022,7 @@ int evtchn_status(evtchn_status_t *statu
>       status->vcpu = chn->notify_vcpu_id;
>   
>    out:
> -    spin_unlock(&d->event_lock);
> +    evtchn_read_unlock(chn);
>       rcu_unlock_domain(d);
>   
>       return rc;
> @@ -1576,22 +1577,32 @@ void evtchn_move_pirqs(struct vcpu *v)
>   static void domain_dump_evtchn_info(struct domain *d)
>   {
>       unsigned int port;
> -    int irq;
>   
>       printk("Event channel information for domain %d:\n"
>              "Polling vCPUs: {%*pbl}\n"
>              "    port [p/m/s]\n", d->domain_id, d->max_vcpus, d->poll_mask);
>   
> -    spin_lock(&d->event_lock);
> -
>       for ( port = 1; port_is_valid(d, port); ++port )
>       {
> -        const struct evtchn *chn;
> +        struct evtchn *chn;
>           char *ssid;
>   
> +        if ( !(port & 0x3f) )
> +            process_pending_softirqs();
> +
>           chn = evtchn_from_port(d, port);
> +
> +        if ( !evtchn_read_trylock(chn) )
> +        {
> +            printk("    %4u in flux\n", port);
> +            continue;
> +        }
> +
>           if ( chn->state == ECS_FREE )
> +        {
> +            evtchn_read_unlock(chn);
>               continue;
> +        }
>   
>           printk("    %4u [%d/%d/",
>                  port,
> @@ -1601,26 +1612,49 @@ static void domain_dump_evtchn_info(stru
>           printk("]: s=%d n=%d x=%d",
>                  chn->state, chn->notify_vcpu_id, chn->xen_consumer);
>   
> +        ssid = xsm_show_security_evtchn(d, chn);
> +
>           switch ( chn->state )
>           {
>           case ECS_UNBOUND:
>               printk(" d=%d", chn->u.unbound.remote_domid);
>               break;
> +
>           case ECS_INTERDOMAIN:
>               printk(" d=%d p=%d",
>                      chn->u.interdomain.remote_dom->domain_id,
>                      chn->u.interdomain.remote_port);
>               break;
> -        case ECS_PIRQ:
> -            irq = domain_pirq_to_irq(d, chn->u.pirq.irq);
> -            printk(" p=%d i=%d", chn->u.pirq.irq, irq);
> +
> +        case ECS_PIRQ: {
> +            unsigned int pirq = chn->u.pirq.irq;
> +
> +            /*
> +             * The per-channel locks nest inside the per-domain one, so we
> +             * can't acquire the latter without first letting go of the former.
> +             */
> +            evtchn_read_unlock(chn);
> +            chn = NULL;
> +            if ( spin_trylock(&d->event_lock) )
> +            {
> +                int irq = domain_pirq_to_irq(d, pirq);
> +
> +                spin_unlock(&d->event_lock);
> +                printk(" p=%u i=%d", pirq, irq);
> +            }
> +            else
> +                printk(" p=%u i=?", pirq);
>               break;
> +        }
> +
>           case ECS_VIRQ:
>               printk(" v=%d", chn->u.virq);
>               break;
>           }
>   
> -        ssid = xsm_show_security_evtchn(d, chn);
> +        if ( chn )
> +            evtchn_read_unlock(chn);
> +
>           if (ssid) {
>               printk(" Z=%s\n", ssid);
>               xfree(ssid);
> @@ -1628,8 +1662,6 @@ static void domain_dump_evtchn_info(stru
>               printk("\n");
>           }
>       }
> -
> -    spin_unlock(&d->event_lock);
>   }
>   
>   static void dump_evtchn_info(unsigned char key)
> --- a/xen/xsm/flask/flask_op.c
> +++ b/xen/xsm/flask/flask_op.c
> @@ -555,12 +555,13 @@ static int flask_get_peer_sid(struct xen
>       struct evtchn *chn;
>       struct domain_security_struct *dsec;
>   
> -    spin_lock(&d->event_lock);
> -
>       if ( !port_is_valid(d, arg->evtchn) )
> -        goto out;
> +        return -EINVAL;
>   
>       chn = evtchn_from_port(d, arg->evtchn);
> +
> +    evtchn_read_lock(chn);
> +
>       if ( chn->state != ECS_INTERDOMAIN )
>           goto out;
>   
> @@ -573,7 +574,7 @@ static int flask_get_peer_sid(struct xen
>       rv = 0;
>   
>    out:
> -    spin_unlock(&d->event_lock);
> +    evtchn_read_unlock(chn);
>       return rv;
>   }
>   
> 

-- 
Julien Grall

Re: [PATCH v4 01/10] evtchn: use per-channel lock where possible

Posted by Jan Beulich 5 years ago

On 08.01.2021 21:32, Julien Grall wrote:
> Hi Jan,
> 
> On 05/01/2021 13:09, Jan Beulich wrote:
>> Neither evtchn_status() nor domain_dump_evtchn_info() nor
>> flask_get_peer_sid() need to hold the per-domain lock - they all only
>> read a single channel's state (at a time, in the dump case).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v4: New.
>>
>> --- a/xen/common/event_channel.c
>> +++ b/xen/common/event_channel.c
>> @@ -968,15 +968,16 @@ int evtchn_status(evtchn_status_t *statu
>>       if ( d == NULL )
>>           return -ESRCH;
>>   
>> -    spin_lock(&d->event_lock);
>> -
>>       if ( !port_is_valid(d, port) )
> 
> There is one issue that is now becoming more apparent. To be clear, the 
> problem is not in this patch, but I think it is the best place to 
> discuss it as d->event_lock may be part of the solution.
> 
> After XSA-344, evtchn_destroy() will end up to decrement d->valid_evtchns.
> 
> Given that evtchn_status() can work on the non-current domain, it would 
> be possible to run it concurrently with evtchn_destroy(). As a 
> consequence, port_is_valid() will be unstable as a valid event channel 
> may turn invalid.
> 
> AFAICT, we are getting away so far, as the memory is not freed until the 
> domain is fully destroyed. However, we re-introduced XSA-338 in a 
> different way.
> 
> To be clear this is not the fault of this patch. But I don't think this 
> is sane to re-introduce a behavior that lead us to an XSA.

I'm getting confused, I'm afraid, from the varying statements above:
Are you suggesting this patch does re-introduce bad behavior or not?

Yes, the decrementing of ->valid_evtchns has a similar effect, but
I'm not convinced it gets us into XSA territory again. The problem
wasn't the reducing of ->max_evtchns as such, but the derived
assumptions elsewhere in the code. If there were any such again, I
suppose we'd have reason to issue another XSA.

Furthermore there are other paths already using port_is_valid()
without holding the domain's event lock; I've not been able to spot
a problem with this though, so far.

> I can see two solutions:
>    1) Use d->event_lock to protect port_is_valid() when d != 
> current->domain. This would require evtchn_destroy() to grab the lock 
> when updating d->valid_evtchns.
>    2) Never decrement d->valid_evtchns and use a different field for 
> closing ports
> 
> I am not a big fan of 1) because this is muddying the already complex 
> locking situation in the event channel code. But I suggested it because 
> I wasn't sure whether you would be happy with 2).

I agree 1) wouldn't be very nice, and you're right in assuming I
wouldn't like 2) very much. For the moment I'm not (yet) convinced
we need to do anything at all - as you say yourself, while the
result of port_is_valid() is potentially unstable when a domain is
in the process of being cleaned up, the state guarded by such
checks remains usable in (I think) a race free manner.

Jan

Re: [PATCH v4 01/10] evtchn: use per-channel lock where possible

Posted by Julien Grall 5 years ago

Hi Jan,

On 11/01/2021 10:14, Jan Beulich wrote:
> On 08.01.2021 21:32, Julien Grall wrote:
>> Hi Jan,
>>
>> On 05/01/2021 13:09, Jan Beulich wrote:
>>> Neither evtchn_status() nor domain_dump_evtchn_info() nor
>>> flask_get_peer_sid() need to hold the per-domain lock - they all only
>>> read a single channel's state (at a time, in the dump case).
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> ---
>>> v4: New.
>>>
>>> --- a/xen/common/event_channel.c
>>> +++ b/xen/common/event_channel.c
>>> @@ -968,15 +968,16 @@ int evtchn_status(evtchn_status_t *statu
>>>        if ( d == NULL )
>>>            return -ESRCH;
>>>    
>>> -    spin_lock(&d->event_lock);
>>> -
>>>        if ( !port_is_valid(d, port) )
>>
>> There is one issue that is now becoming more apparent. To be clear, the
>> problem is not in this patch, but I think it is the best place to
>> discuss it as d->event_lock may be part of the solution.
>>
>> After XSA-344, evtchn_destroy() will end up to decrement d->valid_evtchns.
>>
>> Given that evtchn_status() can work on the non-current domain, it would
>> be possible to run it concurrently with evtchn_destroy(). As a
>> consequence, port_is_valid() will be unstable as a valid event channel
>> may turn invalid.
>>
>> AFAICT, we are getting away so far, as the memory is not freed until the
>> domain is fully destroyed. However, we re-introduced XSA-338 in a
>> different way.
>>
>> To be clear this is not the fault of this patch. But I don't think this
>> is sane to re-introduce a behavior that lead us to an XSA.
> 
> I'm getting confused, I'm afraid, from the varying statements above:
> Are you suggesting this patch does re-introduce bad behavior or not?

No. I am pointing out that this is widening the bad behavior (again).

> 
> Yes, the decrementing of ->valid_evtchns has a similar effect, but
> I'm not convinced it gets us into XSA territory again. The problem
> wasn't the reducing of ->max_evtchns as such, but the derived
> assumptions elsewhere in the code. If there were any such again, I
> suppose we'd have reason to issue another XSA.

I don't think it get us to the XSA territory yet. However, the 
locking/interaction in the event channel code is quite complex.

To give a concrete example, below the current implementation of 
free_xen_event_channel():

     if ( !port_is_valid(d, port) )
     {
         /*
          * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
          * with the spin_barrier() and BUG_ON() in evtchn_destroy().
          */
         smp_rmb();
         BUG_ON(!d->is_dying);
         return;
     }

     evtchn_close(d, port, 0);

It would be fair for a developer to assume that after the check above, 
port_is_valid() would return true. However, this is not the case...

I am not aware of any issue so far... But I am not ready to be this is 
not going to be missed out. How about you?

 > If there were any such again, I
 > suppose we'd have reason to issue another XSA.

The point of my e-mail is to prevent this XSA to happen. I am pretty 
sure you want the same.

> 
> Furthermore there are other paths already using port_is_valid()
> without holding the domain's event lock; I've not been able to spot
> a problem with this though, so far.

Right. Most of the fine are fine because d == current. Therefore, the 
domain must be running and evtchn_destroy() couldn't happen concurrently.

> 
>> I can see two solutions:
>>     1) Use d->event_lock to protect port_is_valid() when d !=
>> current->domain. This would require evtchn_destroy() to grab the lock
>> when updating d->valid_evtchns.
>>     2) Never decrement d->valid_evtchns and use a different field for
>> closing ports
>>
>> I am not a big fan of 1) because this is muddying the already complex
>> locking situation in the event channel code. But I suggested it because
>> I wasn't sure whether you would be happy with 2).
> 
> I agree 1) wouldn't be very nice, and you're right in assuming I
> wouldn't like 2) very much. For the moment I'm not (yet) convinced
> we need to do anything at all - as you say yourself, while the
> result of port_is_valid() is potentially unstable when a domain is
> in the process of being cleaned up, the state guarded by such
> checks remains usable in (I think) a race free manner.

It remains usable *today*, the question is how long this will last?

All the recent XSAs in the event channel taught me that the 
locking/interaction is extremely complex. This series is another proof.

We would save us quite a bit of trouble by making port_is_valid() stable 
no matter the state of the domain.

I think an extra field (option 2) is quite a good compromise with space 
use, maintenance, speed.

I am would be interested to hear from others.

Cheers,

-- 
Julien Grall

Re: [PATCH v4 01/10] evtchn: use per-channel lock where possible

Posted by Jan Beulich 5 years ago

On 11.01.2021 12:06, Julien Grall wrote:
> On 11/01/2021 10:14, Jan Beulich wrote:
>> On 08.01.2021 21:32, Julien Grall wrote:
>>> On 05/01/2021 13:09, Jan Beulich wrote:
>>>> Neither evtchn_status() nor domain_dump_evtchn_info() nor
>>>> flask_get_peer_sid() need to hold the per-domain lock - they all only
>>>> read a single channel's state (at a time, in the dump case).
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>>> ---
>>>> v4: New.
>>>>
>>>> --- a/xen/common/event_channel.c
>>>> +++ b/xen/common/event_channel.c
>>>> @@ -968,15 +968,16 @@ int evtchn_status(evtchn_status_t *statu
>>>>        if ( d == NULL )
>>>>            return -ESRCH;
>>>>    
>>>> -    spin_lock(&d->event_lock);
>>>> -
>>>>        if ( !port_is_valid(d, port) )
>>>
>>> There is one issue that is now becoming more apparent. To be clear, the
>>> problem is not in this patch, but I think it is the best place to
>>> discuss it as d->event_lock may be part of the solution.
>>>
>>> After XSA-344, evtchn_destroy() will end up to decrement d->valid_evtchns.
>>>
>>> Given that evtchn_status() can work on the non-current domain, it would
>>> be possible to run it concurrently with evtchn_destroy(). As a
>>> consequence, port_is_valid() will be unstable as a valid event channel
>>> may turn invalid.
>>>
>>> AFAICT, we are getting away so far, as the memory is not freed until the
>>> domain is fully destroyed. However, we re-introduced XSA-338 in a
>>> different way.
>>>
>>> To be clear this is not the fault of this patch. But I don't think this
>>> is sane to re-introduce a behavior that lead us to an XSA.
>>
>> I'm getting confused, I'm afraid, from the varying statements above:
>> Are you suggesting this patch does re-introduce bad behavior or not?
> 
> No. I am pointing out that this is widening the bad behavior (again).

Since I'd really like to get in some more of this series before
the full freeze, and hence I want (need) to re-post, I thought
I'd reply here despite (or in light of) your request for input
from others not having been met.

I don't view this as "bad" behaviour, btw. The situation is
quite different to that which had led to the XSA: Here we
only deal with the "illusion" of a port having become invalid.
IOW yes, ...

>> Yes, the decrementing of ->valid_evtchns has a similar effect, but
>> I'm not convinced it gets us into XSA territory again. The problem
>> wasn't the reducing of ->max_evtchns as such, but the derived
>> assumptions elsewhere in the code. If there were any such again, I
>> suppose we'd have reason to issue another XSA.
> 
> I don't think it get us to the XSA territory yet. However, the 
> locking/interaction in the event channel code is quite complex.
> 
> To give a concrete example, below the current implementation of 
> free_xen_event_channel():
> 
>      if ( !port_is_valid(d, port) )
>      {
>          /*
>           * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
>           * with the spin_barrier() and BUG_ON() in evtchn_destroy().
>           */
>          smp_rmb();
>          BUG_ON(!d->is_dying);
>          return;
>      }
> 
>      evtchn_close(d, port, 0);
> 
> It would be fair for a developer to assume that after the check above, 
> port_is_valid() would return true. However, this is not the case...

... there needs to be awareness that putting e.g.

    ASSERT(port_is_valid(d, port));

anywhere past the if() cannot be done without considering domain
cleanup logic.

> I am not aware of any issue so far... But I am not ready to be this is 
> not going to be missed out. How about you?

There is a risk of this being overlooked, yes. But I'm unconvinced
this absolutely requires measures to be taken beyond, maybe, the
addition of a comment somewhere. I do, in particular, not think
this should stand in the way of the locking relaxation done by
this patch, even more so that (just to repeat) it merely introduces
more instances of a pattern found elsewhere already.

Jan

[PATCH v4 02/10] evtchn: bind-interdomain doesn't need to hold both domains' event locks

Posted by Jan Beulich 5 years, 1 month ago

The local domain's lock is needed for the port allocation, but for the
remote side the per-channel lock is sufficient. The per-channel locks
then need to be acquired slightly earlier, though.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -355,18 +355,7 @@ static long evtchn_bind_interdomain(evtc
     if ( (rd = rcu_lock_domain_by_id(rdom)) == NULL )
         return -ESRCH;
 
-    /* Avoid deadlock by first acquiring lock of domain with smaller id. */
-    if ( ld < rd )
-    {
-        spin_lock(&ld->event_lock);
-        spin_lock(&rd->event_lock);
-    }
-    else
-    {
-        if ( ld != rd )
-            spin_lock(&rd->event_lock);
-        spin_lock(&ld->event_lock);
-    }
+    spin_lock(&ld->event_lock);
 
     if ( (lport = get_free_port(ld)) < 0 )
         ERROR_EXIT(lport);
@@ -375,15 +364,19 @@ static long evtchn_bind_interdomain(evtc
     if ( !port_is_valid(rd, rport) )
         ERROR_EXIT_DOM(-EINVAL, rd);
     rchn = evtchn_from_port(rd, rport);
+
+    double_evtchn_lock(lchn, rchn);
+
     if ( (rchn->state != ECS_UNBOUND) ||
          (rchn->u.unbound.remote_domid != ld->domain_id) )
         ERROR_EXIT_DOM(-EINVAL, rd);
 
     rc = xsm_evtchn_interdomain(XSM_HOOK, ld, lchn, rd, rchn);
     if ( rc )
+    {
+        double_evtchn_unlock(lchn, rchn);
         goto out;
-
-    double_evtchn_lock(lchn, rchn);
+    }
 
     lchn->u.interdomain.remote_dom  = rd;
     lchn->u.interdomain.remote_port = rport;
@@ -407,8 +400,6 @@ static long evtchn_bind_interdomain(evtc
  out:
     check_free_port(ld, lport);
     spin_unlock(&ld->event_lock);
-    if ( ld != rd )
-        spin_unlock(&rd->event_lock);
     
     rcu_unlock_domain(rd);

Re: [PATCH v4 02/10] evtchn: bind-interdomain doesn't need to hold both domains' event locks

Posted by Julien Grall 5 years ago

Hi Jan,

On 05/01/2021 13:09, Jan Beulich wrote:
> The local domain's lock is needed for the port allocation, but for the
> remote side the per-channel lock is sufficient. The per-channel locks
> then need to be acquired slightly earlier, though.

I was expecting is little bit more information in the commit message 
because there are a few changes in behavior with this change:

  1) AFAICT, evtchn_allocate_port() rely on rchn->state to be protected 
by the rd->event_lock. Now that you dropped the rd->event_lock, 
rchn->state may be accessed while it is updated in 
evtchn_bind_interdomain(). The port cannot go back to ECS_FREE here, but 
I think the access needs to be switched to {read, write}_atomic() or 
ACCESS_ONCE.

   2) xsm_evtchn_interdomain() is now going to be called without the 
rd->event_lock. Can you confirm that the lock is not needed by XSM?

> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v4: New.
> 
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -355,18 +355,7 @@ static long evtchn_bind_interdomain(evtc
>       if ( (rd = rcu_lock_domain_by_id(rdom)) == NULL )
>           return -ESRCH;
>   
> -    /* Avoid deadlock by first acquiring lock of domain with smaller id. */
> -    if ( ld < rd )
> -    {
> -        spin_lock(&ld->event_lock);
> -        spin_lock(&rd->event_lock);
> -    }
> -    else
> -    {
> -        if ( ld != rd )
> -            spin_lock(&rd->event_lock);
> -        spin_lock(&ld->event_lock);
> -    }
> +    spin_lock(&ld->event_lock);
>   
>       if ( (lport = get_free_port(ld)) < 0 )
>           ERROR_EXIT(lport);
> @@ -375,15 +364,19 @@ static long evtchn_bind_interdomain(evtc
>       if ( !port_is_valid(rd, rport) )
>           ERROR_EXIT_DOM(-EINVAL, rd);
>       rchn = evtchn_from_port(rd, rport);
> +
> +    double_evtchn_lock(lchn, rchn);
> +
>       if ( (rchn->state != ECS_UNBOUND) ||
>            (rchn->u.unbound.remote_domid != ld->domain_id) )

You want to unlock the event channels here.

>           ERROR_EXIT_DOM(-EINVAL, rd);
>   
>       rc = xsm_evtchn_interdomain(XSM_HOOK, ld, lchn, rd, rchn);
>       if ( rc )
> +    {
> +        double_evtchn_unlock(lchn, rchn);
>           goto out;
> -
> -    double_evtchn_lock(lchn, rchn);
> +    }
>   
>       lchn->u.interdomain.remote_dom  = rd;
>       lchn->u.interdomain.remote_port = rport;
> @@ -407,8 +400,6 @@ static long evtchn_bind_interdomain(evtc
>    out:
>       check_free_port(ld, lport);
>       spin_unlock(&ld->event_lock);
> -    if ( ld != rd )
> -        spin_unlock(&rd->event_lock);
>       
>       rcu_unlock_domain(rd);
>   
> 

Cheers,

-- 
Julien Grall

Re: [PATCH v4 02/10] evtchn: bind-interdomain doesn't need to hold both domains' event locks

Posted by Julien Grall 5 years ago

On 09/01/2021 15:41, Julien Grall wrote:
> Hi Jan,
> 
> On 05/01/2021 13:09, Jan Beulich wrote:
>> The local domain's lock is needed for the port allocation, but for the
>> remote side the per-channel lock is sufficient. The per-channel locks
>> then need to be acquired slightly earlier, though.
> 
> I was expecting is little bit more information in the commit message 
> because there are a few changes in behavior with this change:
> 
>   1) AFAICT, evtchn_allocate_port() rely on rchn->state to be protected 
> by the rd->event_lock. Now that you dropped the rd->event_lock, 
> rchn->state may be accessed while it is updated in 
> evtchn_bind_interdomain(). The port cannot go back to ECS_FREE here, but 
> I think the access needs to be switched to {read, write}_atomic() or 
> ACCESS_ONCE.
> 
>    2) xsm_evtchn_interdomain() is now going to be called without the 
> rd->event_lock. Can you confirm that the lock is not needed by XSM?

Actually, I think there is a bigger issue. evtchn_close() will check 
chn1->state with just d1->event_lock held (IOW, there chn1->lock is not 
held).

If the remote domain happen to close the unbound port at the same time 
the local domain bound it, then you may end up in the following situation:

evtchn_bind_interdomain()        | evtchn_close()
                                  |
                                  |  switch ( chn1->state )
                                  |  case ECS_UNBOUND:
                                  |      /* nothing to do */
    double_evtchn_lock()          |
    rchn->state = ECS_INTERDOMAIN |
    double_evtchn_unlock()        |
                                  |  evtchn_write_lock(chn1)
                                  |  evtchn_free(d1, chn1)
                                  |  evtchn_write_unlock(chn1)

When the local domain will try to close the port, it will hit the 
BUG_ON(chn2->state != ECS_INTERDOMAIN) because the remote port were 
already freed.

I think this can be solved by acquiring the event lock earlier on in 
evtchn_close(). Although, this may become a can of worms as it would be 
more complex to prevent lock inversion because chn1->lock and chn2->lock.

Cheers,

-- 
Julien Grall

Re: [PATCH v4 02/10] evtchn: bind-interdomain doesn't need to hold both domains' event locks

Posted by Jan Beulich 5 years ago

On 09.01.2021 17:14, Julien Grall wrote:
> On 09/01/2021 15:41, Julien Grall wrote:
>> On 05/01/2021 13:09, Jan Beulich wrote:
>>> The local domain's lock is needed for the port allocation, but for the
>>> remote side the per-channel lock is sufficient. The per-channel locks
>>> then need to be acquired slightly earlier, though.
>>
>> I was expecting is little bit more information in the commit message 
>> because there are a few changes in behavior with this change:
>>
>>   1) AFAICT, evtchn_allocate_port() rely on rchn->state to be protected 
>> by the rd->event_lock. Now that you dropped the rd->event_lock, 
>> rchn->state may be accessed while it is updated in 
>> evtchn_bind_interdomain(). The port cannot go back to ECS_FREE here, but 
>> I think the access needs to be switched to {read, write}_atomic() or 
>> ACCESS_ONCE.
>>
>>    2) xsm_evtchn_interdomain() is now going to be called without the 
>> rd->event_lock. Can you confirm that the lock is not needed by XSM?
> 
> Actually, I think there is a bigger issue. evtchn_close() will check 
> chn1->state with just d1->event_lock held (IOW, there chn1->lock is not 
> held).
> 
> If the remote domain happen to close the unbound port at the same time 
> the local domain bound it, then you may end up in the following situation:
> 
> 
> evtchn_bind_interdomain()        | evtchn_close()
>                                   |
>                                   |  switch ( chn1->state )
>                                   |  case ECS_UNBOUND:
>                                   |      /* nothing to do */
>     double_evtchn_lock()          |
>     rchn->state = ECS_INTERDOMAIN |
>     double_evtchn_unlock()        |
>                                   |  evtchn_write_lock(chn1)
>                                   |  evtchn_free(d1, chn1)
>                                   |  evtchn_write_unlock(chn1)
> 
> When the local domain will try to close the port, it will hit the 
> BUG_ON(chn2->state != ECS_INTERDOMAIN) because the remote port were 
> already freed.

Hmm, yes, thanks for spotting (and sorry for taking a while to
reply).

> I think this can be solved by acquiring the event lock earlier on in 
> evtchn_close(). Although, this may become a can of worms as it would be 
> more complex to prevent lock inversion because chn1->lock and chn2->lock.

Indeed. I think I'll give up on trying to eliminate the double
per-domain event locking for the time being, and resubmit with
both patches dropped.

Jan

[PATCH v4 03/10] evtchn: convert domain event lock to an r/w one

Posted by Jan Beulich 5 years, 1 month ago

Especially for the use in evtchn_move_pirqs() (called when moving a vCPU
across pCPU-s) and the ones in EOI handling in PCI pass-through code,
serializing perhaps an entire domain isn't helpful when no state (which
isn't e.g. further protected by the per-channel lock) changes.

Unfortunately this implies dropping of lock profiling for this lock,
until r/w locks may get enabled for such functionality.

While ->notify_vcpu_id is now meant to be consistently updated with the
per-channel lock held, an extension applies to ECS_PIRQ: The field is
also guaranteed to not change with the per-domain event lock held for
writing. Therefore the link_pirq_port() call from evtchn_bind_pirq()
could in principle be moved out of the per-channel locked regions, but
this further code churn didn't seem worth it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Re-base, in particular over new earlier patches. Acquire both
    per-domain locks for writing in evtchn_close(). Adjust
    spin_barrier() related comments.
v3: Re-base.
v2: Consistently lock for writing in evtchn_reset(). Fix error path in
    pci_clean_dpci_irqs(). Lock for writing in pt_irq_time_out(),
    hvm_dirq_assist(), hvm_dpci_eoi(), and hvm_dpci_isairq_eoi(). Move
    rw_barrier() introduction here. Re-base over changes earlier in the
    series.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -903,7 +903,7 @@ int arch_domain_soft_reset(struct domain
     if ( !is_hvm_domain(d) )
         return -EINVAL;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     for ( i = 0; i < d->nr_pirqs ; i++ )
     {
         if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND )
@@ -913,7 +913,7 @@ int arch_domain_soft_reset(struct domain
                 break;
         }
     }
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     if ( ret )
         return ret;
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -528,9 +528,9 @@ void hvm_migrate_pirqs(struct vcpu *v)
     if ( !is_iommu_enabled(d) || !hvm_domain_irq(d)->dpci )
        return;
 
-    spin_lock(&d->event_lock);
+    read_lock(&d->event_lock);
     pt_pirq_iterate(d, migrate_pirq, v);
-    spin_unlock(&d->event_lock);
+    read_unlock(&d->event_lock);
 }
 
 static bool hvm_get_pending_event(struct vcpu *v, struct x86_event *info)
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -404,9 +404,9 @@ int hvm_inject_msi(struct domain *d, uin
             {
                 int rc;
 
-                spin_lock(&d->event_lock);
+                write_lock(&d->event_lock);
                 rc = map_domain_emuirq_pirq(d, pirq, IRQ_MSI_EMU);
-                spin_unlock(&d->event_lock);
+                write_unlock(&d->event_lock);
                 if ( rc )
                     return rc;
                 info = pirq_info(d, pirq);
--- a/xen/arch/x86/hvm/vioapic.c
+++ b/xen/arch/x86/hvm/vioapic.c
@@ -203,9 +203,9 @@ static int vioapic_hwdom_map_gsi(unsigne
     {
         gprintk(XENLOG_WARNING, "vioapic: error binding GSI %u: %d\n",
                 gsi, ret);
-        spin_lock(&currd->event_lock);
+        write_lock(&currd->event_lock);
         unmap_domain_pirq(currd, pirq);
-        spin_unlock(&currd->event_lock);
+        write_unlock(&currd->event_lock);
     }
     pcidevs_unlock();
 
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -465,7 +465,7 @@ int msixtbl_pt_register(struct domain *d
     int r = -EINVAL;
 
     ASSERT(pcidevs_locked());
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     if ( !msixtbl_initialised(d) )
         return -ENODEV;
@@ -535,7 +535,7 @@ void msixtbl_pt_unregister(struct domain
     struct msixtbl_entry *entry;
 
     ASSERT(pcidevs_locked());
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     if ( !msixtbl_initialised(d) )
         return;
@@ -589,13 +589,13 @@ void msixtbl_pt_cleanup(struct domain *d
     if ( !msixtbl_initialised(d) )
         return;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     list_for_each_entry_safe( entry, temp,
                               &d->arch.hvm.msixtbl_list, list )
         del_msixtbl_entry(entry);
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 }
 
 void msix_write_completion(struct vcpu *v)
@@ -726,9 +726,9 @@ int vpci_msi_arch_update(struct vpci_msi
                          msi->arch.pirq, msi->mask);
     if ( rc )
     {
-        spin_lock(&pdev->domain->event_lock);
+        write_lock(&pdev->domain->event_lock);
         unmap_domain_pirq(pdev->domain, msi->arch.pirq);
-        spin_unlock(&pdev->domain->event_lock);
+        write_unlock(&pdev->domain->event_lock);
         pcidevs_unlock();
         msi->arch.pirq = INVALID_PIRQ;
         return rc;
@@ -767,9 +767,9 @@ static int vpci_msi_enable(const struct
     rc = vpci_msi_update(pdev, data, address, vectors, pirq, mask);
     if ( rc )
     {
-        spin_lock(&pdev->domain->event_lock);
+        write_lock(&pdev->domain->event_lock);
         unmap_domain_pirq(pdev->domain, pirq);
-        spin_unlock(&pdev->domain->event_lock);
+        write_unlock(&pdev->domain->event_lock);
         pcidevs_unlock();
         return rc;
     }
@@ -814,9 +814,9 @@ static void vpci_msi_disable(const struc
         ASSERT(!rc);
     }
 
-    spin_lock(&pdev->domain->event_lock);
+    write_lock(&pdev->domain->event_lock);
     unmap_domain_pirq(pdev->domain, pirq);
-    spin_unlock(&pdev->domain->event_lock);
+    write_unlock(&pdev->domain->event_lock);
     pcidevs_unlock();
 }
 
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -2413,10 +2413,10 @@ int ioapic_guest_write(unsigned long phy
     }
     if ( pirq >= 0 )
     {
-        spin_lock(&hardware_domain->event_lock);
+        write_lock(&hardware_domain->event_lock);
         ret = map_domain_pirq(hardware_domain, pirq, irq,
                               MAP_PIRQ_TYPE_GSI, NULL);
-        spin_unlock(&hardware_domain->event_lock);
+        write_unlock(&hardware_domain->event_lock);
         if ( ret < 0 )
             return ret;
     }
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1547,7 +1547,7 @@ int pirq_guest_bind(struct vcpu *v, stru
     unsigned int        max_nr_guests = will_share ? irq_max_guests : 1;
     int                 rc = 0;
 
-    WARN_ON(!spin_is_locked(&v->domain->event_lock));
+    WARN_ON(!rw_is_write_locked(&v->domain->event_lock));
     BUG_ON(!local_irq_is_enabled());
 
  retry:
@@ -1761,7 +1761,7 @@ void pirq_guest_unbind(struct domain *d,
     struct irq_desc *desc;
     int irq = 0;
 
-    WARN_ON(!spin_is_locked(&d->event_lock));
+    WARN_ON(!rw_is_write_locked(&d->event_lock));
 
     BUG_ON(!local_irq_is_enabled());
     desc = pirq_spin_lock_irq_desc(pirq, NULL);
@@ -1798,7 +1798,7 @@ static bool pirq_guest_force_unbind(stru
     unsigned int i;
     bool bound = false;
 
-    WARN_ON(!spin_is_locked(&d->event_lock));
+    WARN_ON(!rw_is_write_locked(&d->event_lock));
 
     BUG_ON(!local_irq_is_enabled());
     desc = pirq_spin_lock_irq_desc(pirq, NULL);
@@ -2040,7 +2040,7 @@ int get_free_pirq(struct domain *d, int
 {
     int i;
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     if ( type == MAP_PIRQ_TYPE_GSI )
     {
@@ -2065,7 +2065,7 @@ int get_free_pirqs(struct domain *d, uns
 {
     unsigned int i, found = 0;
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i )
         if ( is_free_pirq(d, pirq_info(d, i)) )
@@ -2093,7 +2093,7 @@ int map_domain_pirq(
     DECLARE_BITMAP(prepared, MAX_MSI_IRQS) = {};
     DECLARE_BITMAP(granted, MAX_MSI_IRQS) = {};
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     if ( !irq_access_permitted(current->domain, irq))
         return -EPERM;
@@ -2312,7 +2312,7 @@ int unmap_domain_pirq(struct domain *d,
         return -EINVAL;
 
     ASSERT(pcidevs_locked());
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     info = pirq_info(d, pirq);
     if ( !info || (irq = info->arch.irq) <= 0 )
@@ -2439,13 +2439,13 @@ void free_domain_pirqs(struct domain *d)
     int i;
 
     pcidevs_lock();
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     for ( i = 0; i < d->nr_pirqs; i++ )
         if ( domain_pirq_to_irq(d, i) > 0 )
             unmap_domain_pirq(d, i);
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
     pcidevs_unlock();
 }
 
@@ -2688,7 +2688,7 @@ int map_domain_emuirq_pirq(struct domain
     int old_emuirq = IRQ_UNBOUND, old_pirq = IRQ_UNBOUND;
     struct pirq *info;
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     if ( !is_hvm_domain(d) )
         return -EINVAL;
@@ -2754,7 +2754,7 @@ int unmap_domain_pirq_emuirq(struct doma
     if ( (pirq < 0) || (pirq >= d->nr_pirqs) )
         return -EINVAL;
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     emuirq = domain_pirq_to_emuirq(d, pirq);
     if ( emuirq == IRQ_UNBOUND )
@@ -2802,7 +2802,7 @@ static int allocate_pirq(struct domain *
 {
     int current_pirq;
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
     current_pirq = domain_irq_to_pirq(d, irq);
     if ( pirq < 0 )
     {
@@ -2874,7 +2874,7 @@ int allocate_and_map_gsi_pirq(struct dom
     }
 
     /* Verify or get pirq. */
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     pirq = allocate_pirq(d, index, *pirq_p, irq, MAP_PIRQ_TYPE_GSI, NULL);
     if ( pirq < 0 )
     {
@@ -2887,7 +2887,7 @@ int allocate_and_map_gsi_pirq(struct dom
         *pirq_p = pirq;
 
  done:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return ret;
 }
@@ -2928,7 +2928,7 @@ int allocate_and_map_msi_pirq(struct dom
 
     pcidevs_lock();
     /* Verify or get pirq. */
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     pirq = allocate_pirq(d, index, *pirq_p, irq, type, &msi->entry_nr);
     if ( pirq < 0 )
     {
@@ -2941,7 +2941,7 @@ int allocate_and_map_msi_pirq(struct dom
         *pirq_p = pirq;
 
  done:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
     pcidevs_unlock();
     if ( ret )
     {
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -34,7 +34,7 @@ static int physdev_hvm_map_pirq(
 
     ASSERT(!is_hardware_domain(d));
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     switch ( type )
     {
     case MAP_PIRQ_TYPE_GSI: {
@@ -84,7 +84,7 @@ static int physdev_hvm_map_pirq(
         break;
     }
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
     return ret;
 }
 
@@ -154,18 +154,18 @@ int physdev_unmap_pirq(domid_t domid, in
 
     if ( is_hvm_domain(d) && has_pirq(d) )
     {
-        spin_lock(&d->event_lock);
+        write_lock(&d->event_lock);
         if ( domain_pirq_to_emuirq(d, pirq) != IRQ_UNBOUND )
             ret = unmap_domain_pirq_emuirq(d, pirq);
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
         if ( domid == DOMID_SELF || ret )
             goto free_domain;
     }
 
     pcidevs_lock();
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     ret = unmap_domain_pirq(d, pirq);
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
     pcidevs_unlock();
 
  free_domain:
@@ -192,10 +192,10 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         ret = -EINVAL;
         if ( eoi.irq >= currd->nr_pirqs )
             break;
-        spin_lock(&currd->event_lock);
+        read_lock(&currd->event_lock);
         pirq = pirq_info(currd, eoi.irq);
         if ( !pirq ) {
-            spin_unlock(&currd->event_lock);
+            read_unlock(&currd->event_lock);
             break;
         }
         if ( currd->arch.auto_unmask )
@@ -214,7 +214,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
                     && hvm_irq->gsi_assert_count[gsi] )
                 send_guest_pirq(currd, pirq);
         }
-        spin_unlock(&currd->event_lock);
+        read_unlock(&currd->event_lock);
         ret = 0;
         break;
     }
@@ -626,7 +626,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
         if ( copy_from_guest(&out, arg, 1) != 0 )
             break;
 
-        spin_lock(&currd->event_lock);
+        write_lock(&currd->event_lock);
 
         ret = get_free_pirq(currd, out.type);
         if ( ret >= 0 )
@@ -639,7 +639,7 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
                 ret = -ENOMEM;
         }
 
-        spin_unlock(&currd->event_lock);
+        write_unlock(&currd->event_lock);
 
         if ( ret >= 0 )
         {
--- a/xen/arch/x86/pv/shim.c
+++ b/xen/arch/x86/pv/shim.c
@@ -448,7 +448,7 @@ static long pv_shim_event_channel_op(int
         if ( rc )                                                           \
             break;                                                          \
                                                                             \
-        spin_lock(&d->event_lock);                                          \
+        write_lock(&d->event_lock);                                         \
         rc = evtchn_allocate_port(d, op.port_field);                        \
         if ( rc )                                                           \
         {                                                                   \
@@ -457,7 +457,7 @@ static long pv_shim_event_channel_op(int
         }                                                                   \
         else                                                                \
             evtchn_reserve(d, op.port_field);                               \
-        spin_unlock(&d->event_lock);                                        \
+        write_unlock(&d->event_lock);                                       \
                                                                             \
         if ( !rc && __copy_to_guest(arg, &op, 1) )                          \
             rc = -EFAULT;                                                   \
@@ -585,11 +585,11 @@ static long pv_shim_event_channel_op(int
         if ( rc )
             break;
 
-        spin_lock(&d->event_lock);
+        write_lock(&d->event_lock);
         rc = evtchn_allocate_port(d, ipi.port);
         if ( rc )
         {
-            spin_unlock(&d->event_lock);
+            write_unlock(&d->event_lock);
 
             close.port = ipi.port;
             BUG_ON(xen_hypercall_event_channel_op(EVTCHNOP_close, &close));
@@ -598,7 +598,7 @@ static long pv_shim_event_channel_op(int
 
         evtchn_assign_vcpu(d, ipi.port, ipi.vcpu);
         evtchn_reserve(d, ipi.port);
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
 
         if ( __copy_to_guest(arg, &ipi, 1) )
             rc = -EFAULT;
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -294,7 +294,7 @@ static long evtchn_alloc_unbound(evtchn_
     if ( d == NULL )
         return -ESRCH;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     if ( (port = get_free_port(d)) < 0 )
         ERROR_EXIT_DOM(port, d);
@@ -317,7 +317,7 @@ static long evtchn_alloc_unbound(evtchn_
 
  out:
     check_free_port(d, port);
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
     rcu_unlock_domain(d);
 
     return rc;
@@ -355,7 +355,7 @@ static long evtchn_bind_interdomain(evtc
     if ( (rd = rcu_lock_domain_by_id(rdom)) == NULL )
         return -ESRCH;
 
-    spin_lock(&ld->event_lock);
+    write_lock(&ld->event_lock);
 
     if ( (lport = get_free_port(ld)) < 0 )
         ERROR_EXIT(lport);
@@ -399,7 +399,7 @@ static long evtchn_bind_interdomain(evtc
 
  out:
     check_free_port(ld, lport);
-    spin_unlock(&ld->event_lock);
+    write_unlock(&ld->event_lock);
     
     rcu_unlock_domain(rd);
 
@@ -430,7 +430,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t
     if ( (v = domain_vcpu(d, vcpu)) == NULL )
         return -ENOENT;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     if ( read_atomic(&v->virq_to_evtchn[virq]) )
         ERROR_EXIT(-EEXIST);
@@ -471,7 +471,7 @@ int evtchn_bind_virq(evtchn_bind_virq_t
     write_atomic(&v->virq_to_evtchn[virq], port);
 
  out:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 }
@@ -487,7 +487,7 @@ static long evtchn_bind_ipi(evtchn_bind_
     if ( domain_vcpu(d, vcpu) == NULL )
         return -ENOENT;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     if ( (port = get_free_port(d)) < 0 )
         ERROR_EXIT(port);
@@ -505,7 +505,7 @@ static long evtchn_bind_ipi(evtchn_bind_
     bind->port = port;
 
  out:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 }
@@ -551,7 +551,7 @@ static long evtchn_bind_pirq(evtchn_bind
     if ( !is_hvm_domain(d) && !pirq_access_permitted(d, pirq) )
         return -EPERM;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     if ( pirq_to_evtchn(d, pirq) != 0 )
         ERROR_EXIT(-EEXIST);
@@ -591,7 +591,7 @@ static long evtchn_bind_pirq(evtchn_bind
 
  out:
     check_free_port(d, port);
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 }
@@ -606,7 +606,7 @@ int evtchn_close(struct domain *d1, int
     long           rc = 0;
 
  again:
-    spin_lock(&d1->event_lock);
+    write_lock(&d1->event_lock);
 
     if ( !port_is_valid(d1, port1) )
     {
@@ -676,13 +676,11 @@ int evtchn_close(struct domain *d1, int
                 BUG();
 
             if ( d1 < d2 )
-            {
-                spin_lock(&d2->event_lock);
-            }
+                write_lock(&d2->event_lock);
             else if ( d1 != d2 )
             {
-                spin_unlock(&d1->event_lock);
-                spin_lock(&d2->event_lock);
+                write_unlock(&d1->event_lock);
+                write_lock(&d2->event_lock);
                 goto again;
             }
         }
@@ -729,11 +727,11 @@ int evtchn_close(struct domain *d1, int
     if ( d2 != NULL )
     {
         if ( d1 != d2 )
-            spin_unlock(&d2->event_lock);
+            write_unlock(&d2->event_lock);
         put_domain(d2);
     }
 
-    spin_unlock(&d1->event_lock);
+    write_unlock(&d1->event_lock);
 
     return rc;
 }
@@ -1031,7 +1029,7 @@ long evtchn_bind_vcpu(unsigned int port,
     if ( (v = domain_vcpu(d, vcpu_id)) == NULL )
         return -ENOENT;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     if ( !port_is_valid(d, port) )
     {
@@ -1075,7 +1073,7 @@ long evtchn_bind_vcpu(unsigned int port,
     }
 
  out:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 }
@@ -1121,7 +1119,7 @@ int evtchn_reset(struct domain *d, bool
     if ( d != current->domain && !d->controller_pause_count )
         return -EINVAL;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     /*
      * If we are resuming, then start where we stopped. Otherwise, check
@@ -1132,7 +1130,7 @@ int evtchn_reset(struct domain *d, bool
     if ( i > d->next_evtchn )
         d->next_evtchn = i;
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     if ( !i )
         return -EBUSY;
@@ -1144,14 +1142,14 @@ int evtchn_reset(struct domain *d, bool
         /* NB: Choice of frequency is arbitrary. */
         if ( !(i & 0x3f) && hypercall_preempt_check() )
         {
-            spin_lock(&d->event_lock);
+            write_lock(&d->event_lock);
             d->next_evtchn = i;
-            spin_unlock(&d->event_lock);
+            write_unlock(&d->event_lock);
             return -ERESTART;
         }
     }
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     d->next_evtchn = 0;
 
@@ -1164,7 +1162,7 @@ int evtchn_reset(struct domain *d, bool
         evtchn_2l_init(d);
     }
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 }
@@ -1354,7 +1352,7 @@ int alloc_unbound_xen_event_channel(
     struct evtchn *chn;
     int            port, rc;
 
-    spin_lock(&ld->event_lock);
+    write_lock(&ld->event_lock);
 
     port = rc = get_free_port(ld);
     if ( rc < 0 )
@@ -1382,7 +1380,7 @@ int alloc_unbound_xen_event_channel(
 
  out:
     check_free_port(ld, port);
-    spin_unlock(&ld->event_lock);
+    write_unlock(&ld->event_lock);
 
     return rc < 0 ? rc : port;
 }
@@ -1393,7 +1391,7 @@ void free_xen_event_channel(struct domai
     {
         /*
          * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
-         * with the spin_barrier() and BUG_ON() in evtchn_destroy().
+         * with the kind-of-barrier and BUG_ON() in evtchn_destroy().
          */
         smp_rmb();
         BUG_ON(!d->is_dying);
@@ -1413,7 +1411,7 @@ void notify_via_xen_event_channel(struct
     {
         /*
          * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
-         * with the spin_barrier() and BUG_ON() in evtchn_destroy().
+         * with the kind-of-barrier and BUG_ON() in evtchn_destroy().
          */
         smp_rmb();
         ASSERT(ld->is_dying);
@@ -1470,7 +1468,8 @@ int evtchn_init(struct domain *d, unsign
         return -ENOMEM;
     d->valid_evtchns = EVTCHNS_PER_BUCKET;
 
-    spin_lock_init_prof(d, event_lock);
+    rwlock_init(&d->event_lock);
+
     if ( get_free_port(d) != 0 )
     {
         free_evtchn_bucket(d, d->evtchn);
@@ -1495,9 +1494,10 @@ int evtchn_destroy(struct domain *d)
 {
     unsigned int i;
 
-    /* After this barrier no new event-channel allocations can occur. */
+    /* After this kind-of-barrier no new event-channel allocations can occur. */
     BUG_ON(!d->is_dying);
-    spin_barrier(&d->event_lock);
+    read_lock(&d->event_lock);
+    read_unlock(&d->event_lock);
 
     /* Close all existing event channels. */
     for ( i = d->valid_evtchns; --i; )
@@ -1555,13 +1555,13 @@ void evtchn_move_pirqs(struct vcpu *v)
     unsigned int port;
     struct evtchn *chn;
 
-    spin_lock(&d->event_lock);
+    read_lock(&d->event_lock);
     for ( port = v->pirq_evtchn_head; port; port = chn->u.pirq.next_port )
     {
         chn = evtchn_from_port(d, port);
         pirq_set_affinity(d, chn->u.pirq.irq, mask);
     }
-    spin_unlock(&d->event_lock);
+    read_unlock(&d->event_lock);
 }
 
 
@@ -1626,11 +1626,11 @@ static void domain_dump_evtchn_info(stru
              */
             evtchn_read_unlock(chn);
             chn = NULL;
-            if ( spin_trylock(&d->event_lock) )
+            if ( read_trylock(&d->event_lock) )
             {
                 int irq = domain_pirq_to_irq(d, pirq);
 
-                spin_unlock(&d->event_lock);
+                read_unlock(&d->event_lock);
                 printk(" p=%u i=%d", pirq, irq);
             }
             else
--- a/xen/common/event_fifo.c
+++ b/xen/common/event_fifo.c
@@ -600,7 +600,7 @@ int evtchn_fifo_init_control(struct evtc
     if ( offset & (8 - 1) )
         return -EINVAL;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     /*
      * If this is the first control block, setup an empty event array
@@ -636,13 +636,13 @@ int evtchn_fifo_init_control(struct evtc
     else
         rc = map_control_block(v, gfn, offset);
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 
  error:
     evtchn_fifo_destroy(d);
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
     return rc;
 }
 
@@ -695,9 +695,9 @@ int evtchn_fifo_expand_array(const struc
     if ( !d->evtchn_fifo )
         return -EOPNOTSUPP;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     rc = add_page_to_event_array(d, expand_array->array_gfn);
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return rc;
 }
--- a/xen/drivers/passthrough/vtd/x86/hvm.c
+++ b/xen/drivers/passthrough/vtd/x86/hvm.c
@@ -54,7 +54,7 @@ void hvm_dpci_isairq_eoi(struct domain *
     if ( !is_iommu_enabled(d) )
         return;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     dpci = domain_get_irq_dpci(d);
 
@@ -63,5 +63,5 @@ void hvm_dpci_isairq_eoi(struct domain *
         /* Multiple mirq may be mapped to one isa irq */
         pt_pirq_iterate(d, _hvm_dpci_isairq_eoi, (void *)(long)isairq);
     }
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 }
--- a/xen/drivers/passthrough/x86/hvm.c
+++ b/xen/drivers/passthrough/x86/hvm.c
@@ -105,7 +105,7 @@ static void pt_pirq_softirq_reset(struct
 {
     struct domain *d = pirq_dpci->dom;
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_write_locked(&d->event_lock));
 
     switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
     {
@@ -162,7 +162,7 @@ static void pt_irq_time_out(void *data)
     const struct hvm_irq_dpci *dpci;
     const struct dev_intx_gsi_link *digl;
 
-    spin_lock(&irq_map->dom->event_lock);
+    write_lock(&irq_map->dom->event_lock);
 
     if ( irq_map->flags & HVM_IRQ_DPCI_IDENTITY_GSI )
     {
@@ -177,7 +177,7 @@ static void pt_irq_time_out(void *data)
         hvm_gsi_deassert(irq_map->dom, dpci_pirq(irq_map)->pirq);
         irq_map->flags |= HVM_IRQ_DPCI_EOI_LATCH;
         pt_irq_guest_eoi(irq_map->dom, irq_map, NULL);
-        spin_unlock(&irq_map->dom->event_lock);
+        write_unlock(&irq_map->dom->event_lock);
         return;
     }
 
@@ -185,7 +185,7 @@ static void pt_irq_time_out(void *data)
     if ( unlikely(!dpci) )
     {
         ASSERT_UNREACHABLE();
-        spin_unlock(&irq_map->dom->event_lock);
+        write_unlock(&irq_map->dom->event_lock);
         return;
     }
     list_for_each_entry ( digl, &irq_map->digl_list, list )
@@ -204,7 +204,7 @@ static void pt_irq_time_out(void *data)
 
     pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL);
 
-    spin_unlock(&irq_map->dom->event_lock);
+    write_unlock(&irq_map->dom->event_lock);
 }
 
 struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d)
@@ -288,7 +288,7 @@ int pt_irq_create_bind(
         return -EINVAL;
 
  restart:
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     hvm_irq_dpci = domain_get_irq_dpci(d);
     if ( !hvm_irq_dpci && !is_hardware_domain(d) )
@@ -304,7 +304,7 @@ int pt_irq_create_bind(
         hvm_irq_dpci = xzalloc(struct hvm_irq_dpci);
         if ( hvm_irq_dpci == NULL )
         {
-            spin_unlock(&d->event_lock);
+            write_unlock(&d->event_lock);
             return -ENOMEM;
         }
         for ( i = 0; i < NR_HVM_DOMU_IRQS; i++ )
@@ -316,7 +316,7 @@ int pt_irq_create_bind(
     info = pirq_get_info(d, pirq);
     if ( !info )
     {
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
         return -ENOMEM;
     }
     pirq_dpci = pirq_dpci(info);
@@ -331,7 +331,7 @@ int pt_irq_create_bind(
      */
     if ( pt_pirq_softirq_active(pirq_dpci) )
     {
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
         cpu_relax();
         goto restart;
     }
@@ -389,7 +389,7 @@ int pt_irq_create_bind(
                 pirq_dpci->dom = NULL;
                 pirq_dpci->flags = 0;
                 pirq_cleanup_check(info, d);
-                spin_unlock(&d->event_lock);
+                write_unlock(&d->event_lock);
                 return rc;
             }
         }
@@ -399,7 +399,7 @@ int pt_irq_create_bind(
 
             if ( (pirq_dpci->flags & mask) != mask )
             {
-                spin_unlock(&d->event_lock);
+                write_unlock(&d->event_lock);
                 return -EBUSY;
             }
 
@@ -423,7 +423,7 @@ int pt_irq_create_bind(
 
         dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode);
         pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id;
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
 
         pirq_dpci->gmsi.posted = false;
         vcpu = (dest_vcpu_id >= 0) ? d->vcpu[dest_vcpu_id] : NULL;
@@ -483,7 +483,7 @@ int pt_irq_create_bind(
 
             if ( !digl || !girq )
             {
-                spin_unlock(&d->event_lock);
+                write_unlock(&d->event_lock);
                 xfree(girq);
                 xfree(digl);
                 return -ENOMEM;
@@ -510,7 +510,7 @@ int pt_irq_create_bind(
             if ( pt_irq_bind->irq_type != PT_IRQ_TYPE_PCI ||
                  pirq >= hvm_domain_irq(d)->nr_gsis )
             {
-                spin_unlock(&d->event_lock);
+                write_unlock(&d->event_lock);
 
                 return -EINVAL;
             }
@@ -546,7 +546,7 @@ int pt_irq_create_bind(
 
                     if ( mask < 0 || trigger_mode < 0 )
                     {
-                        spin_unlock(&d->event_lock);
+                        write_unlock(&d->event_lock);
 
                         ASSERT_UNREACHABLE();
                         return -EINVAL;
@@ -594,14 +594,14 @@ int pt_irq_create_bind(
                 }
                 pirq_dpci->flags = 0;
                 pirq_cleanup_check(info, d);
-                spin_unlock(&d->event_lock);
+                write_unlock(&d->event_lock);
                 xfree(girq);
                 xfree(digl);
                 return rc;
             }
         }
 
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
 
         if ( iommu_verbose )
         {
@@ -619,7 +619,7 @@ int pt_irq_create_bind(
     }
 
     default:
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
         return -EOPNOTSUPP;
     }
 
@@ -672,13 +672,13 @@ int pt_irq_destroy_bind(
         return -EOPNOTSUPP;
     }
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
 
     hvm_irq_dpci = domain_get_irq_dpci(d);
 
     if ( !hvm_irq_dpci && !is_hardware_domain(d) )
     {
-        spin_unlock(&d->event_lock);
+        write_unlock(&d->event_lock);
         return -EINVAL;
     }
 
@@ -711,7 +711,7 @@ int pt_irq_destroy_bind(
 
         if ( girq )
         {
-            spin_unlock(&d->event_lock);
+            write_unlock(&d->event_lock);
             return -EINVAL;
         }
 
@@ -755,7 +755,7 @@ int pt_irq_destroy_bind(
         pirq_cleanup_check(pirq, d);
     }
 
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     if ( what && iommu_verbose )
     {
@@ -799,7 +799,7 @@ int pt_pirq_iterate(struct domain *d,
     unsigned int pirq = 0, n, i;
     struct pirq *pirqs[8];
 
-    ASSERT(spin_is_locked(&d->event_lock));
+    ASSERT(rw_is_locked(&d->event_lock));
 
     do {
         n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq,
@@ -880,9 +880,9 @@ void hvm_dpci_msi_eoi(struct domain *d,
          (!hvm_domain_irq(d)->dpci && !is_hardware_domain(d)) )
        return;
 
-    spin_lock(&d->event_lock);
+    read_lock(&d->event_lock);
     pt_pirq_iterate(d, _hvm_dpci_msi_eoi, (void *)(long)vector);
-    spin_unlock(&d->event_lock);
+    read_unlock(&d->event_lock);
 }
 
 static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci)
@@ -893,7 +893,7 @@ static void hvm_dirq_assist(struct domai
         return;
     }
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     if ( test_and_clear_bool(pirq_dpci->masked) )
     {
         struct pirq *pirq = dpci_pirq(pirq_dpci);
@@ -947,7 +947,7 @@ static void hvm_dirq_assist(struct domai
     }
 
  out:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 }
 
 static void hvm_pirq_eoi(struct pirq *pirq,
@@ -1012,7 +1012,7 @@ void hvm_dpci_eoi(struct domain *d, unsi
 
     if ( is_hardware_domain(d) )
     {
-        spin_lock(&d->event_lock);
+        write_lock(&d->event_lock);
         hvm_gsi_eoi(d, guest_gsi, ent);
         goto unlock;
     }
@@ -1023,7 +1023,7 @@ void hvm_dpci_eoi(struct domain *d, unsi
         return;
     }
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     hvm_irq_dpci = domain_get_irq_dpci(d);
 
     if ( !hvm_irq_dpci )
@@ -1033,7 +1033,7 @@ void hvm_dpci_eoi(struct domain *d, unsi
         __hvm_dpci_eoi(d, girq, ent);
 
 unlock:
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 }
 
 static int pci_clean_dpci_irq(struct domain *d,
@@ -1072,7 +1072,7 @@ int arch_pci_clean_pirqs(struct domain *
     if ( !is_hvm_domain(d) )
         return 0;
 
-    spin_lock(&d->event_lock);
+    write_lock(&d->event_lock);
     hvm_irq_dpci = domain_get_irq_dpci(d);
     if ( hvm_irq_dpci != NULL )
     {
@@ -1090,14 +1090,14 @@ int arch_pci_clean_pirqs(struct domain *
             ret = pt_pirq_iterate(d, pci_clean_dpci_irq, NULL);
         if ( ret )
         {
-            spin_unlock(&d->event_lock);
+            write_unlock(&d->event_lock);
             return ret;
         }
 
         hvm_domain_irq(d)->dpci = NULL;
         free_hvm_irq_dpci(hvm_irq_dpci);
     }
-    spin_unlock(&d->event_lock);
+    write_unlock(&d->event_lock);
 
     return 0;
 }
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -377,7 +377,7 @@ struct domain
     unsigned int     xen_evtchns;
     /* Port to resume from in evtchn_reset(), when in a continuation. */
     unsigned int     next_evtchn;
-    spinlock_t       event_lock;
+    rwlock_t         event_lock;
     const struct evtchn_port_ops *evtchn_port_ops;
     struct evtchn_fifo_domain *evtchn_fifo;

[PATCH v4 04/10] evtchn: don't call Xen consumer callback with per-channel lock held

Posted by Jan Beulich 5 years, 1 month ago

While there don't look to be any problems with this right now, the lock
order implications from holding the lock can be very difficult to follow
(and may be easy to violate unknowingly). The present callbacks don't
(and no such callback should) have any need for the lock to be held.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Go back to v2.
v3: Drain callbacks before proceeding with closing. Re-base.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -767,9 +767,18 @@ int evtchn_send(struct domain *ld, unsig
         rport = lchn->u.interdomain.remote_port;
         rchn  = evtchn_from_port(rd, rport);
         if ( consumer_is_xen(rchn) )
-            xen_notification_fn(rchn)(rd->vcpu[rchn->notify_vcpu_id], rport);
-        else
-            evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
+        {
+            /* Don't keep holding the lock for the call below. */
+            xen_event_channel_notification_t fn = xen_notification_fn(rchn);
+            struct vcpu *rv = rd->vcpu[rchn->notify_vcpu_id];
+
+            rcu_lock_domain(rd);
+            evtchn_read_unlock(lchn);
+            fn(rv, rport);
+            rcu_unlock_domain(rd);
+            return 0;
+        }
+        evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
         break;
     case ECS_IPI:
         evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn);

Re: [PATCH v4 04/10] evtchn: don't call Xen consumer callback with per-channel lock held

Posted by Julien Grall 5 years ago

Hi Jan,

On 05/01/2021 13:10, Jan Beulich wrote:
> While there don't look to be any problems with this right now, the lock
> order implications from holding the lock can be very difficult to follow
> (and may be easy to violate unknowingly). The present callbacks don't
> (and no such callback should) have any need for the lock to be held.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Julien Grall <jgrall@amazon.com>

> ---
> v4: Go back to v2.
> v3: Drain callbacks before proceeding with closing. Re-base.
> 
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -767,9 +767,18 @@ int evtchn_send(struct domain *ld, unsig
>           rport = lchn->u.interdomain.remote_port;
>           rchn  = evtchn_from_port(rd, rport);
>           if ( consumer_is_xen(rchn) )
> -            xen_notification_fn(rchn)(rd->vcpu[rchn->notify_vcpu_id], rport);
> -        else
> -            evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
> +        {
> +            /* Don't keep holding the lock for the call below. */
> +            xen_event_channel_notification_t fn = xen_notification_fn(rchn);
> +            struct vcpu *rv = rd->vcpu[rchn->notify_vcpu_id];
> +
> +            rcu_lock_domain(rd);
> +            evtchn_read_unlock(lchn);
> +            fn(rv, rport);
> +            rcu_unlock_domain(rd);
> +            return 0;
> +        }
> +        evtchn_port_set_pending(rd, rchn->notify_vcpu_id, rchn);
>           break;
>       case ECS_IPI:
>           evtchn_port_set_pending(ld, lchn->notify_vcpu_id, lchn);
> 

Cheers,

-- 
Julien Grall

[PATCH v4 05/10] evtchn: closing of vIRQ-s doesn't require looping over all vCPU-s

Posted by Jan Beulich 5 years, 1 month ago

Global vIRQ-s have their event channel association tracked on vCPU 0.
Per-vCPU vIRQ-s can't have their notify_vcpu_id changed. Hence it is
well-known which vCPU's virq_to_evtchn[] needs updating.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -600,7 +600,6 @@ static long evtchn_bind_pirq(evtchn_bind
 int evtchn_close(struct domain *d1, int port1, bool guest)
 {
     struct domain *d2 = NULL;
-    struct vcpu   *v;
     struct evtchn *chn1, *chn2;
     int            port2;
     long           rc = 0;
@@ -651,17 +650,19 @@ int evtchn_close(struct domain *d1, int
         break;
     }
 
-    case ECS_VIRQ:
-        for_each_vcpu ( d1, v )
-        {
-            unsigned long flags;
+    case ECS_VIRQ: {
+        struct vcpu *v;
+        unsigned long flags;
+
+        v = d1->vcpu[virq_is_global(chn1->u.virq) ? 0 : chn1->notify_vcpu_id];
+
+        write_lock_irqsave(&v->virq_lock, flags);
+        ASSERT(read_atomic(&v->virq_to_evtchn[chn1->u.virq]) == port1);
+        write_atomic(&v->virq_to_evtchn[chn1->u.virq], 0);
+        write_unlock_irqrestore(&v->virq_lock, flags);
 
-            write_lock_irqsave(&v->virq_lock, flags);
-            if ( read_atomic(&v->virq_to_evtchn[chn1->u.virq]) == port1 )
-                write_atomic(&v->virq_to_evtchn[chn1->u.virq], 0);
-            write_unlock_irqrestore(&v->virq_lock, flags);
-        }
         break;
+    }
 
     case ECS_IPI:
         break;

Re: [PATCH v4 05/10] evtchn: closing of vIRQ-s doesn't require looping over all vCPU-s

Posted by Julien Grall 5 years ago

Hi Jan,

On 05/01/2021 13:10, Jan Beulich wrote:
> Global vIRQ-s have their event channel association tracked on vCPU 0.
> Per-vCPU vIRQ-s can't have their notify_vcpu_id changed. Hence it is
> well-known which vCPU's virq_to_evtchn[] needs updating.

I went through the history and couldn't find a reason for looping the vCPUs.

I have also looked at the code and agree with the analysis provided in 
the commit message.

Reviewed-by: Julien Grall <jgrall@amazon.com>

Cheers,

-- 
Julien Grall

[PATCH v4 06/10] evtchn: slightly defer lock acquire where possible

Posted by Jan Beulich 5 years, 1 month ago

port_is_valid() and evtchn_from_port() are fine to use without holding
any locks. Accordingly acquire the per-domain lock slightly later in
evtchn_close() and evtchn_bind_vcpu().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -604,17 +604,14 @@ int evtchn_close(struct domain *d1, int
     int            port2;
     long           rc = 0;
 
- again:
-    write_lock(&d1->event_lock);
-
     if ( !port_is_valid(d1, port1) )
-    {
-        rc = -EINVAL;
-        goto out;
-    }
+        return -EINVAL;
 
     chn1 = evtchn_from_port(d1, port1);
 
+ again:
+    write_lock(&d1->event_lock);
+
     /* Guest cannot close a Xen-attached event channel. */
     if ( unlikely(consumer_is_xen(chn1)) && guest )
     {
@@ -1039,16 +1036,13 @@ long evtchn_bind_vcpu(unsigned int port,
     if ( (v = domain_vcpu(d, vcpu_id)) == NULL )
         return -ENOENT;
 
-    write_lock(&d->event_lock);
-
     if ( !port_is_valid(d, port) )
-    {
-        rc = -EINVAL;
-        goto out;
-    }
+        return -EINVAL;
 
     chn = evtchn_from_port(d, port);
 
+    write_lock(&d->event_lock);
+
     /* Guest cannot re-bind a Xen-attached event channel. */
     if ( unlikely(consumer_is_xen(chn)) )
     {

Re: [PATCH v4 06/10] evtchn: slightly defer lock acquire where possible

Posted by Julien Grall 5 years ago

Hi Jan,

On 05/01/2021 13:11, Jan Beulich wrote:
> port_is_valid() and evtchn_from_port() are fine to use without holding
> any locks.

Per my remark in patch #1, currently, this only holds as long as we are 
checking the port for the current domain.

If it were a different domain, then the risk a risk that port_is_valid() 
may return valid and then invalid.

AFAICT, evtchn_close() may be called with d != current. So there is some 
racyness in the code as well.

Therefore, I will defer my ack until we agree on whether port_is_valid() 
can be used locklessly when current != domain.

Cheers,

[PATCH v4 07/10] evtchn: add helper for port_is_valid() + evtchn_from_port()

Posted by Jan Beulich 5 years, 1 month ago

The combination is pretty common, so adding a simple local helper seems
worthwhile. Make it const- and type-correct, in turn requiring the
two called function to also be const-correct (and at this occasion also
make them type-correct).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -147,6 +147,11 @@ static bool virq_is_global(unsigned int
     return true;
 }
 
+static struct evtchn *_evtchn_from_port(const struct domain *d,
+                                        evtchn_port_t port)
+{
+    return port_is_valid(d, port) ? evtchn_from_port(d, port) : NULL;
+}
 
 static struct evtchn *alloc_evtchn_bucket(struct domain *d, unsigned int port)
 {
@@ -361,9 +366,9 @@ static long evtchn_bind_interdomain(evtc
         ERROR_EXIT(lport);
     lchn = evtchn_from_port(ld, lport);
 
-    if ( !port_is_valid(rd, rport) )
+    rchn = _evtchn_from_port(rd, rport);
+    if ( !rchn )
         ERROR_EXIT_DOM(-EINVAL, rd);
-    rchn = evtchn_from_port(rd, rport);
 
     double_evtchn_lock(lchn, rchn);
 
@@ -600,15 +605,12 @@ static long evtchn_bind_pirq(evtchn_bind
 int evtchn_close(struct domain *d1, int port1, bool guest)
 {
     struct domain *d2 = NULL;
-    struct evtchn *chn1, *chn2;
-    int            port2;
+    struct evtchn *chn1 = _evtchn_from_port(d1, port1), *chn2;
     long           rc = 0;
 
-    if ( !port_is_valid(d1, port1) )
+    if ( !chn1 )
         return -EINVAL;
 
-    chn1 = evtchn_from_port(d1, port1);
-
  again:
     write_lock(&d1->event_lock);
 
@@ -695,10 +697,8 @@ int evtchn_close(struct domain *d1, int
             goto out;
         }
 
-        port2 = chn1->u.interdomain.remote_port;
-        BUG_ON(!port_is_valid(d2, port2));
-
-        chn2 = evtchn_from_port(d2, port2);
+        chn2 = _evtchn_from_port(d2, chn1->u.interdomain.remote_port);
+        BUG_ON(!chn2);
         BUG_ON(chn2->state != ECS_INTERDOMAIN);
         BUG_ON(chn2->u.interdomain.remote_dom != d1);
 
@@ -736,15 +736,13 @@ int evtchn_close(struct domain *d1, int
 
 int evtchn_send(struct domain *ld, unsigned int lport)
 {
-    struct evtchn *lchn, *rchn;
+    struct evtchn *lchn = _evtchn_from_port(ld, lport), *rchn;
     struct domain *rd;
     int            rport, ret = 0;
 
-    if ( !port_is_valid(ld, lport) )
+    if ( !lchn )
         return -EINVAL;
 
-    lchn = evtchn_from_port(ld, lport);
-
     evtchn_read_lock(lchn);
 
     /* Guest cannot send via a Xen-attached event channel. */
@@ -956,7 +954,6 @@ int evtchn_status(evtchn_status_t *statu
 {
     struct domain   *d;
     domid_t          dom = status->dom;
-    int              port = status->port;
     struct evtchn   *chn;
     long             rc = 0;
 
@@ -964,14 +961,13 @@ int evtchn_status(evtchn_status_t *statu
     if ( d == NULL )
         return -ESRCH;
 
-    if ( !port_is_valid(d, port) )
+    chn = _evtchn_from_port(d, status->port);
+    if ( !chn )
     {
         rcu_unlock_domain(d);
         return -EINVAL;
     }
 
-    chn = evtchn_from_port(d, port);
-
     evtchn_read_lock(chn);
 
     if ( consumer_is_xen(chn) )
@@ -1036,11 +1032,10 @@ long evtchn_bind_vcpu(unsigned int port,
     if ( (v = domain_vcpu(d, vcpu_id)) == NULL )
         return -ENOENT;
 
-    if ( !port_is_valid(d, port) )
+    chn = _evtchn_from_port(d, port);
+    if ( !chn )
         return -EINVAL;
 
-    chn = evtchn_from_port(d, port);
-
     write_lock(&d->event_lock);
 
     /* Guest cannot re-bind a Xen-attached event channel. */
@@ -1086,13 +1081,11 @@ long evtchn_bind_vcpu(unsigned int port,
 int evtchn_unmask(unsigned int port)
 {
     struct domain *d = current->domain;
-    struct evtchn *evtchn;
+    struct evtchn *evtchn = _evtchn_from_port(d, port);
 
-    if ( unlikely(!port_is_valid(d, port)) )
+    if ( unlikely(!evtchn) )
         return -EINVAL;
 
-    evtchn = evtchn_from_port(d, port);
-
     evtchn_read_lock(evtchn);
 
     evtchn_port_unmask(d, evtchn);
@@ -1175,14 +1168,12 @@ static long evtchn_set_priority(const st
 {
     struct domain *d = current->domain;
     unsigned int port = set_priority->port;
-    struct evtchn *chn;
+    struct evtchn *chn = _evtchn_from_port(d, port);
     long ret;
 
-    if ( !port_is_valid(d, port) )
+    if ( !chn )
         return -EINVAL;
 
-    chn = evtchn_from_port(d, port);
-
     evtchn_read_lock(chn);
 
     ret = evtchn_port_set_priority(d, chn, set_priority->priority);
@@ -1408,10 +1399,10 @@ void free_xen_event_channel(struct domai
 
 void notify_via_xen_event_channel(struct domain *ld, int lport)
 {
-    struct evtchn *lchn, *rchn;
+    struct evtchn *lchn = _evtchn_from_port(ld, lport), *rchn;
     struct domain *rd;
 
-    if ( !port_is_valid(ld, lport) )
+    if ( !lchn )
     {
         /*
          * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
@@ -1422,8 +1413,6 @@ void notify_via_xen_event_channel(struct
         return;
     }
 
-    lchn = evtchn_from_port(ld, lport);
-
     if ( !evtchn_read_trylock(lchn) )
         return;
 
@@ -1577,16 +1566,17 @@ static void domain_dump_evtchn_info(stru
            "Polling vCPUs: {%*pbl}\n"
            "    port [p/m/s]\n", d->domain_id, d->max_vcpus, d->poll_mask);
 
-    for ( port = 1; port_is_valid(d, port); ++port )
+    for ( port = 1; ; ++port )
     {
-        struct evtchn *chn;
+        struct evtchn *chn = _evtchn_from_port(d, port);
         char *ssid;
 
+        if ( !chn )
+            break;
+
         if ( !(port & 0x3f) )
             process_pending_softirqs();
 
-        chn = evtchn_from_port(d, port);
-
         if ( !evtchn_read_trylock(chn) )
         {
             printk("    %4u in flux\n", port);
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -120,7 +120,7 @@ static inline void evtchn_read_unlock(st
     read_unlock(&evtchn->lock);
 }
 
-static inline bool_t port_is_valid(struct domain *d, unsigned int p)
+static inline bool port_is_valid(const struct domain *d, evtchn_port_t p)
 {
     if ( p >= read_atomic(&d->valid_evtchns) )
         return false;
@@ -135,7 +135,8 @@ static inline bool_t port_is_valid(struc
     return true;
 }
 
-static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
+static inline struct evtchn *evtchn_from_port(const struct domain *d,
+                                              evtchn_port_t p)
 {
     if ( p < EVTCHNS_PER_BUCKET )
         return &d->evtchn[array_index_nospec(p, EVTCHNS_PER_BUCKET)];

Re: [PATCH v4 07/10] evtchn: add helper for port_is_valid() + evtchn_from_port()

Posted by Julien Grall 5 years ago

Hi Jan,

On 05/01/2021 13:12, Jan Beulich wrote:
> The combination is pretty common, so adding a simple local helper seems
> worthwhile. Make it const- and type-correct, in turn requiring the
> two called function to also be const-correct (and at this occasion also
> make them type-correct).

Acked-by: Julien Grall <jgrall@amazon.com>

Cheers,

> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v4: New.
> 
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -147,6 +147,11 @@ static bool virq_is_global(unsigned int
>       return true;
>   }
>   
> +static struct evtchn *_evtchn_from_port(const struct domain *d,
> +                                        evtchn_port_t port)
> +{
> +    return port_is_valid(d, port) ? evtchn_from_port(d, port) : NULL;
> +}
>   
>   static struct evtchn *alloc_evtchn_bucket(struct domain *d, unsigned int port)
>   {
> @@ -361,9 +366,9 @@ static long evtchn_bind_interdomain(evtc
>           ERROR_EXIT(lport);
>       lchn = evtchn_from_port(ld, lport);
>   
> -    if ( !port_is_valid(rd, rport) )
> +    rchn = _evtchn_from_port(rd, rport);
> +    if ( !rchn )
>           ERROR_EXIT_DOM(-EINVAL, rd);
> -    rchn = evtchn_from_port(rd, rport);
>   
>       double_evtchn_lock(lchn, rchn);
>   
> @@ -600,15 +605,12 @@ static long evtchn_bind_pirq(evtchn_bind
>   int evtchn_close(struct domain *d1, int port1, bool guest)
>   {
>       struct domain *d2 = NULL;
> -    struct evtchn *chn1, *chn2;
> -    int            port2;
> +    struct evtchn *chn1 = _evtchn_from_port(d1, port1), *chn2;
>       long           rc = 0;
>   
> -    if ( !port_is_valid(d1, port1) )
> +    if ( !chn1 )
>           return -EINVAL;
>   
> -    chn1 = evtchn_from_port(d1, port1);
> -
>    again:
>       write_lock(&d1->event_lock);
>   
> @@ -695,10 +697,8 @@ int evtchn_close(struct domain *d1, int
>               goto out;
>           }
>   
> -        port2 = chn1->u.interdomain.remote_port;
> -        BUG_ON(!port_is_valid(d2, port2));
> -
> -        chn2 = evtchn_from_port(d2, port2);
> +        chn2 = _evtchn_from_port(d2, chn1->u.interdomain.remote_port);
> +        BUG_ON(!chn2);
>           BUG_ON(chn2->state != ECS_INTERDOMAIN);
>           BUG_ON(chn2->u.interdomain.remote_dom != d1);
>   
> @@ -736,15 +736,13 @@ int evtchn_close(struct domain *d1, int
>   
>   int evtchn_send(struct domain *ld, unsigned int lport)
>   {
> -    struct evtchn *lchn, *rchn;
> +    struct evtchn *lchn = _evtchn_from_port(ld, lport), *rchn;
>       struct domain *rd;
>       int            rport, ret = 0;
>   
> -    if ( !port_is_valid(ld, lport) )
> +    if ( !lchn )
>           return -EINVAL;
>   
> -    lchn = evtchn_from_port(ld, lport);
> -
>       evtchn_read_lock(lchn);
>   
>       /* Guest cannot send via a Xen-attached event channel. */
> @@ -956,7 +954,6 @@ int evtchn_status(evtchn_status_t *statu
>   {
>       struct domain   *d;
>       domid_t          dom = status->dom;
> -    int              port = status->port;
>       struct evtchn   *chn;
>       long             rc = 0;
>   
> @@ -964,14 +961,13 @@ int evtchn_status(evtchn_status_t *statu
>       if ( d == NULL )
>           return -ESRCH;
>   
> -    if ( !port_is_valid(d, port) )
> +    chn = _evtchn_from_port(d, status->port);
> +    if ( !chn )
>       {
>           rcu_unlock_domain(d);
>           return -EINVAL;
>       }
>   
> -    chn = evtchn_from_port(d, port);
> -
>       evtchn_read_lock(chn);
>   
>       if ( consumer_is_xen(chn) )
> @@ -1036,11 +1032,10 @@ long evtchn_bind_vcpu(unsigned int port,
>       if ( (v = domain_vcpu(d, vcpu_id)) == NULL )
>           return -ENOENT;
>   
> -    if ( !port_is_valid(d, port) )
> +    chn = _evtchn_from_port(d, port);
> +    if ( !chn )
>           return -EINVAL;
>   
> -    chn = evtchn_from_port(d, port);
> -
>       write_lock(&d->event_lock);
>   
>       /* Guest cannot re-bind a Xen-attached event channel. */
> @@ -1086,13 +1081,11 @@ long evtchn_bind_vcpu(unsigned int port,
>   int evtchn_unmask(unsigned int port)
>   {
>       struct domain *d = current->domain;
> -    struct evtchn *evtchn;
> +    struct evtchn *evtchn = _evtchn_from_port(d, port);
>   
> -    if ( unlikely(!port_is_valid(d, port)) )
> +    if ( unlikely(!evtchn) )
>           return -EINVAL;
>   
> -    evtchn = evtchn_from_port(d, port);
> -
>       evtchn_read_lock(evtchn);
>   
>       evtchn_port_unmask(d, evtchn);
> @@ -1175,14 +1168,12 @@ static long evtchn_set_priority(const st
>   {
>       struct domain *d = current->domain;
>       unsigned int port = set_priority->port;
> -    struct evtchn *chn;
> +    struct evtchn *chn = _evtchn_from_port(d, port);
>       long ret;
>   
> -    if ( !port_is_valid(d, port) )
> +    if ( !chn )
>           return -EINVAL;
>   
> -    chn = evtchn_from_port(d, port);
> -
>       evtchn_read_lock(chn);
>   
>       ret = evtchn_port_set_priority(d, chn, set_priority->priority);
> @@ -1408,10 +1399,10 @@ void free_xen_event_channel(struct domai
>   
>   void notify_via_xen_event_channel(struct domain *ld, int lport)
>   {
> -    struct evtchn *lchn, *rchn;
> +    struct evtchn *lchn = _evtchn_from_port(ld, lport), *rchn;
>       struct domain *rd;
>   
> -    if ( !port_is_valid(ld, lport) )
> +    if ( !lchn )
>       {
>           /*
>            * Make sure ->is_dying is read /after/ ->valid_evtchns, pairing
> @@ -1422,8 +1413,6 @@ void notify_via_xen_event_channel(struct
>           return;
>       }
>   
> -    lchn = evtchn_from_port(ld, lport);
> -
>       if ( !evtchn_read_trylock(lchn) )
>           return;
>   
> @@ -1577,16 +1566,17 @@ static void domain_dump_evtchn_info(stru
>              "Polling vCPUs: {%*pbl}\n"
>              "    port [p/m/s]\n", d->domain_id, d->max_vcpus, d->poll_mask);
>   
> -    for ( port = 1; port_is_valid(d, port); ++port )
> +    for ( port = 1; ; ++port )
>       {
> -        struct evtchn *chn;
> +        struct evtchn *chn = _evtchn_from_port(d, port);
>           char *ssid;
>   
> +        if ( !chn )
> +            break;
> +
>           if ( !(port & 0x3f) )
>               process_pending_softirqs();
>   
> -        chn = evtchn_from_port(d, port);
> -
>           if ( !evtchn_read_trylock(chn) )
>           {
>               printk("    %4u in flux\n", port);
> --- a/xen/include/xen/event.h
> +++ b/xen/include/xen/event.h
> @@ -120,7 +120,7 @@ static inline void evtchn_read_unlock(st
>       read_unlock(&evtchn->lock);
>   }
>   
> -static inline bool_t port_is_valid(struct domain *d, unsigned int p)
> +static inline bool port_is_valid(const struct domain *d, evtchn_port_t p)
>   {
>       if ( p >= read_atomic(&d->valid_evtchns) )
>           return false;
> @@ -135,7 +135,8 @@ static inline bool_t port_is_valid(struc
>       return true;
>   }
>   
> -static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p)
> +static inline struct evtchn *evtchn_from_port(const struct domain *d,
> +                                              evtchn_port_t p)
>   {
>       if ( p < EVTCHNS_PER_BUCKET )
>           return &d->evtchn[array_index_nospec(p, EVTCHNS_PER_BUCKET)];
> 

-- 
Julien Grall

[PATCH v4 08/10] evtchn: closing of ports doesn't need to hold both domains' event locks

Posted by Jan Beulich 5 years, 1 month ago

The local domain's lock is needed for the port freeing, but for the
remote side the per-channel lock is sufficient. Other logic then needs
rearranging, though, including the early dropping of both locks (and the
remote domain ref) in the ECS_PIRQ and ECS_VIRQ cases.

Note in particular that there is no real race with evtchn_bind_vcpu():
ECS_INTERDOMAIN and ECS_UNBOUND get treated identically there, and
evtchn_close() doesn't care about the notification vCPU ID.

Note also that we can't use double_evtchn_unlock() or
evtchn_write_unlock() when releasing locks to cover for possible races.
See the respective code comment.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -605,15 +605,15 @@ static long evtchn_bind_pirq(evtchn_bind
 int evtchn_close(struct domain *d1, int port1, bool guest)
 {
     struct domain *d2 = NULL;
-    struct evtchn *chn1 = _evtchn_from_port(d1, port1), *chn2;
+    struct evtchn *chn1 = _evtchn_from_port(d1, port1), *chn2 = NULL;
     long           rc = 0;
 
     if ( !chn1 )
         return -EINVAL;
 
- again:
     write_lock(&d1->event_lock);
 
+ again:
     /* Guest cannot close a Xen-attached event channel. */
     if ( unlikely(consumer_is_xen(chn1)) && guest )
     {
@@ -634,6 +634,22 @@ int evtchn_close(struct domain *d1, int
     case ECS_PIRQ: {
         struct pirq *pirq = pirq_info(d1, chn1->u.pirq.irq);
 
+        /*
+         * We don't require the per-channel lock here, so in case a race
+         * happened on the interdomain path below better release both.
+         */
+        if ( unlikely(chn2) )
+        {
+            /*
+             * See evtchn_write_unlock() vs plain write_unlock() comment in
+             * ECS_INTERDOMAIN handling below.
+             */
+            write_unlock(&chn1->lock);
+            write_unlock(&chn2->lock);
+            put_domain(d2);
+            chn2 = NULL;
+        }
+
         if ( pirq )
         {
             if ( !is_hvm_domain(d1) )
@@ -653,6 +669,22 @@ int evtchn_close(struct domain *d1, int
         struct vcpu *v;
         unsigned long flags;
 
+        /*
+         * The per-channel locks nest inside the vIRQ ones, so we must release
+         * them if a race happened on the interdomain path below.
+         */
+        if ( unlikely(chn2) )
+        {
+            /*
+             * See evtchn_write_unlock() vs plain write_unlock() comment in
+             * ECS_INTERDOMAIN handling below.
+             */
+            write_unlock(&chn1->lock);
+            write_unlock(&chn2->lock);
+            put_domain(d2);
+            chn2 = NULL;
+        }
+
         v = d1->vcpu[virq_is_global(chn1->u.virq) ? 0 : chn1->notify_vcpu_id];
 
         write_lock_irqsave(&v->virq_lock, flags);
@@ -669,63 +701,87 @@ int evtchn_close(struct domain *d1, int
     case ECS_INTERDOMAIN:
         if ( d2 == NULL )
         {
-            d2 = chn1->u.interdomain.remote_dom;
+            evtchn_write_lock(chn1);
 
-            /* If we unlock d1 then we could lose d2. Must get a reference. */
-            if ( unlikely(!get_domain(d2)) )
-                BUG();
-
-            if ( d1 < d2 )
-                write_lock(&d2->event_lock);
-            else if ( d1 != d2 )
-            {
-                write_unlock(&d1->event_lock);
-                write_lock(&d2->event_lock);
-                goto again;
-            }
+            if ( chn1->state == ECS_INTERDOMAIN )
+                d2 = chn1->u.interdomain.remote_dom;
+            else
+                /* See comment further down. */
+                write_unlock(&chn1->lock);
         }
-        else if ( d2 != chn1->u.interdomain.remote_dom )
+
+        if ( d2 != chn1->u.interdomain.remote_dom )
         {
             /*
-             * We can only get here if the port was closed and re-bound after
-             * unlocking d1 but before locking d2 above. We could retry but
-             * it is easier to return the same error as if we had seen the
-             * port in ECS_FREE. It must have passed through that state for
-             * us to end up here, so it's a valid error to return.
+             * We can only get here if the port was closed and re-bound
+             * - before locking chn1 or
+             * - after unlocking chn1 but before locking both channels
+             * above.  We could retry but it is easier to return the same error
+             * as if we had seen the port in ECS_FREE.  It must have passed
+             * through that state for us to end up here, so it's a valid error
+             * to return.
              */
+            if ( d2 && !chn2 )
+                write_unlock(&chn1->lock);
             rc = -EINVAL;
             goto out;
         }
 
-        chn2 = _evtchn_from_port(d2, chn1->u.interdomain.remote_port);
-        BUG_ON(!chn2);
-        BUG_ON(chn2->state != ECS_INTERDOMAIN);
-        BUG_ON(chn2->u.interdomain.remote_dom != d1);
+        if ( !chn2 )
+        {
+            /* If we unlock chn1 then we could lose d2. Must get a reference. */
+            if ( unlikely(!get_domain(d2)) )
+                BUG();
 
-        double_evtchn_lock(chn1, chn2);
+            chn2 = _evtchn_from_port(d2, chn1->u.interdomain.remote_port);
+            BUG_ON(!chn2);
 
-        evtchn_free(d1, chn1);
+            if ( chn1 > chn2 )
+            {
+                /*
+                 * Cannot use evtchn_write_unlock() here, as its assertion
+                 * likely won't hold.  However, a race - as per the comment
+                 * below - indicates a transition through ECS_FREE must
+                 * have occurred, so the assumptions by users of
+                 * evtchn_read_trylock() still hold (i.e. they're similarly
+                 * fine to bail).
+                 */
+                write_unlock(&chn1->lock);
+                double_evtchn_lock(chn2, chn1);
+                goto again;
+            }
+
+            evtchn_write_lock(chn2);
+        }
+
+        BUG_ON(chn2->state != ECS_INTERDOMAIN);
+        BUG_ON(chn2->u.interdomain.remote_dom != d1);
 
         chn2->state = ECS_UNBOUND;
         chn2->u.unbound.remote_domid = d1->domain_id;
 
-        double_evtchn_unlock(chn1, chn2);
-
-        goto out;
+        break;
 
     default:
         BUG();
     }
 
-    evtchn_write_lock(chn1);
+    if ( !chn2 )
+        evtchn_write_lock(chn1);
     evtchn_free(d1, chn1);
-    evtchn_write_unlock(chn1);
+    if ( !chn2 )
+        evtchn_write_unlock(chn1);
 
  out:
-    if ( d2 != NULL )
+    if ( chn2 )
     {
-        if ( d1 != d2 )
-            write_unlock(&d2->event_lock);
+        /*
+         * See evtchn_write_unlock() vs plain write_unlock() comment in
+         * ECS_INTERDOMAIN handling above.  In principle we could use
+         * double_evtchn_unlock() on the ECS_INTERDOMAIN success path.
+         */
+        write_unlock(&chn1->lock);
+        write_unlock(&chn2->lock);
         put_domain(d2);
     }

[PATCH v4 09/10] evtchn: type adjustments

Posted by Jan Beulich 5 years, 1 month ago

First of all avoid "long" when "int" suffices, i.e. in particular when
merely conveying error codes. 32-bit values are slightly cheaper to
deal with on x86, and their processing is at least no more expensive on
Arm. Where possible use evtchn_port_t for port numbers and unsigned int
for other unsigned quantities in adjacent code. In evtchn_set_priority()
eliminate a local variable altogether instead of changing its type.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -287,13 +287,12 @@ void evtchn_free(struct domain *d, struc
     xsm_evtchn_close_post(chn);
 }
 
-static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
+static int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc)
 {
     struct evtchn *chn;
     struct domain *d;
-    int            port;
+    int            port, rc;
     domid_t        dom = alloc->dom;
-    long           rc;
 
     d = rcu_lock_domain_by_any_id(dom);
     if ( d == NULL )
@@ -346,13 +345,13 @@ static void double_evtchn_unlock(struct
     evtchn_write_unlock(rchn);
 }
 
-static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
+static int evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind)
 {
     struct evtchn *lchn, *rchn;
     struct domain *ld = current->domain, *rd;
-    int            lport, rport = bind->remote_port;
+    int            lport, rc;
+    evtchn_port_t  rport = bind->remote_port;
     domid_t        rdom = bind->remote_dom;
-    long           rc;
 
     if ( rdom == DOMID_SELF )
         rdom = current->domain->domain_id;
@@ -482,12 +481,12 @@ int evtchn_bind_virq(evtchn_bind_virq_t
 }
 
 
-static long evtchn_bind_ipi(evtchn_bind_ipi_t *bind)
+static int evtchn_bind_ipi(evtchn_bind_ipi_t *bind)
 {
     struct evtchn *chn;
     struct domain *d = current->domain;
-    int            port, vcpu = bind->vcpu;
-    long           rc = 0;
+    int            port, rc = 0;
+    unsigned int   vcpu = bind->vcpu;
 
     if ( domain_vcpu(d, vcpu) == NULL )
         return -ENOENT;
@@ -541,16 +540,16 @@ static void unlink_pirq_port(struct evtc
 }
 
 
-static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
+static int evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
 {
     struct evtchn *chn;
     struct domain *d = current->domain;
     struct vcpu   *v = d->vcpu[0];
     struct pirq   *info;
-    int            port = 0, pirq = bind->pirq;
-    long           rc;
+    int            port = 0, rc;
+    unsigned int   pirq = bind->pirq;
 
-    if ( (pirq < 0) || (pirq >= d->nr_pirqs) )
+    if ( pirq >= d->nr_pirqs )
         return -EINVAL;
 
     if ( !is_hvm_domain(d) && !pirq_access_permitted(d, pirq) )
@@ -606,7 +605,7 @@ int evtchn_close(struct domain *d1, int
 {
     struct domain *d2 = NULL;
     struct evtchn *chn1 = _evtchn_from_port(d1, port1), *chn2 = NULL;
-    long           rc = 0;
+    int            rc = 0;
 
     if ( !chn1 )
         return -EINVAL;
@@ -1011,7 +1010,7 @@ int evtchn_status(evtchn_status_t *statu
     struct domain   *d;
     domid_t          dom = status->dom;
     struct evtchn   *chn;
-    long             rc = 0;
+    int              rc = 0;
 
     d = rcu_lock_domain_by_any_id(dom);
     if ( d == NULL )
@@ -1077,11 +1076,11 @@ int evtchn_status(evtchn_status_t *statu
 }
 
 
-long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id)
+int evtchn_bind_vcpu(evtchn_port_t port, unsigned int vcpu_id)
 {
     struct domain *d = current->domain;
     struct evtchn *chn;
-    long           rc = 0;
+    int           rc = 0;
     struct vcpu   *v;
 
     /* Use the vcpu info to prevent speculative out-of-bound accesses */
@@ -1220,12 +1219,11 @@ int evtchn_reset(struct domain *d, bool
     return rc;
 }
 
-static long evtchn_set_priority(const struct evtchn_set_priority *set_priority)
+static int evtchn_set_priority(const struct evtchn_set_priority *set_priority)
 {
     struct domain *d = current->domain;
-    unsigned int port = set_priority->port;
-    struct evtchn *chn = _evtchn_from_port(d, port);
-    long ret;
+    struct evtchn *chn = _evtchn_from_port(d, set_priority->port);
+    int ret;
 
     if ( !chn )
         return -EINVAL;
@@ -1241,7 +1239,7 @@ static long evtchn_set_priority(const st
 
 long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
-    long rc;
+    int rc;
 
     switch ( cmd )
     {
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -54,7 +54,7 @@ void send_guest_pirq(struct domain *, co
 int evtchn_send(struct domain *d, unsigned int lport);
 
 /* Bind a local event-channel port to the specified VCPU. */
-long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id);
+int evtchn_bind_vcpu(evtchn_port_t port, unsigned int vcpu_id);
 
 /* Bind a VIRQ. */
 int evtchn_bind_virq(evtchn_bind_virq_t *bind, evtchn_port_t port);

[PATCH v4 10/10] evtchn: drop acquiring of per-channel lock from send_guest_{global,vcpu}_virq()

Posted by Jan Beulich 5 years, 1 month ago

The per-vCPU virq_lock, which is being held anyway, together with there
not being any call to evtchn_port_set_pending() when v->virq_to_evtchn[]
is zero, provide sufficient guarantees. Undo the lock addition done for
XSA-343 (commit e045199c7c9c "evtchn: address races with
evtchn_reset()"). Update the description next to struct evtchn_port_ops
accordingly.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Move to end of series, for being the most controversial change.
v3: Re-base.
v2: New.

--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -863,7 +863,6 @@ void send_guest_vcpu_virq(struct vcpu *v
     unsigned long flags;
     int port;
     struct domain *d;
-    struct evtchn *chn;
 
     ASSERT(!virq_is_global(virq));
 
@@ -874,12 +873,7 @@ void send_guest_vcpu_virq(struct vcpu *v
         goto out;
 
     d = v->domain;
-    chn = evtchn_from_port(d, port);
-    if ( evtchn_read_trylock(chn) )
-    {
-        evtchn_port_set_pending(d, v->vcpu_id, chn);
-        evtchn_read_unlock(chn);
-    }
+    evtchn_port_set_pending(d, v->vcpu_id, evtchn_from_port(d, port));
 
  out:
     read_unlock_irqrestore(&v->virq_lock, flags);
@@ -908,11 +902,7 @@ void send_guest_global_virq(struct domai
         goto out;
 
     chn = evtchn_from_port(d, port);
-    if ( evtchn_read_trylock(chn) )
-    {
-        evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
-        evtchn_read_unlock(chn);
-    }
+    evtchn_port_set_pending(d, chn->notify_vcpu_id, chn);
 
  out:
     read_unlock_irqrestore(&v->virq_lock, flags);
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -193,9 +193,16 @@ int evtchn_reset(struct domain *d, bool
  * Low-level event channel port ops.
  *
  * All hooks have to be called with a lock held which prevents the channel
- * from changing state. This may be the domain event lock, the per-channel
- * lock, or in the case of sending interdomain events also the other side's
- * per-channel lock. Exceptions apply in certain cases for the PV shim.
+ * from changing state. This may be
+ * - the domain event lock,
+ * - the per-channel lock,
+ * - in the case of sending interdomain events the other side's per-channel
+ *   lock,
+ * - in the case of sending non-global vIRQ-s the per-vCPU virq_lock (in
+ *   combination with the ordering enforced through how the vCPU's
+ *   virq_to_evtchn[] gets updated),
+ * - in the case of sending global vIRQ-s vCPU 0's virq_lock.
+ * Exceptions apply in certain cases for the PV shim.
  */
 struct evtchn_port_ops {
     void (*init)(struct domain *d, struct evtchn *evtchn);