Unbreak evaluate_nospec() and livepatching

[Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Andrew Cooper 6 years, 3 months ago

evaluate_nospec() is incredibly fragile, and this is one giant bodge.

To correctly protect jumps, the generated code needs to be of the form:

    cmp/test <cond>
    jcc 1f
    lfence
    ...
 1: lfence
    ...

Critically, the lfence must be at the head of both basic blocks, later in the
instruction stream than the conditional jump in need of protection.

When a static inline is involved, the optimiser decides to be clever and
rearranges the code as:

 pred:
    lfence
    <calculate cond>
    ret

    call pred
    cmp $0, %eax
    jcc 1f
    ...
 1: ...

which breaks the speculative safety.

Any use of evaluate_nospec() needs all static inline predicates which use it
to be declared always_inline to prevent the optimiser having the flexibility
to generate unsafe code.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: George Dunlap <George.Dunlap@eu.citrix.com>
CC: Ian Jackson <ian.jackson@citrix.com>
CC: Jan Beulich <JBeulich@suse.com>
CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Wei Liu <wl@xen.org>
CC: Julien Grall <julien@xen.org>
CC: Juergen Gross <jgross@suse.com>

This is the transitive set of predicates which I can spot which need
protecting.  There are probably ones I've missed.  Personally, I'm -1 for this
approach, but the only other option for 4.13 is to revert it all to unbreak
livepatching.

v3:
 * New
---
 xen/arch/x86/domain.c               |  2 +-
 xen/arch/x86/pv/mm.h                | 12 ++++++------
 xen/include/asm-x86/event.h         |  2 +-
 xen/include/asm-x86/guest_pt.h      | 28 ++++++++++++++++------------
 xen/include/asm-x86/hvm/nestedhvm.h |  2 +-
 xen/include/asm-x86/paging.h        |  2 +-
 xen/include/xen/sched.h             | 20 ++++++++++----------
 7 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index c8d7f491ea..1b88cc2d68 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1699,7 +1699,7 @@ static void _update_runstate_area(struct vcpu *v)
  * regular per-CPU GDT frame to appear with selectors at the appropriate
  * offset.
  */
-static inline bool need_full_gdt(const struct domain *d)
+static always_inline bool need_full_gdt(const struct domain *d)
 {
     return is_pv_domain(d) && !is_idle_domain(d);
 }
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 2d427b418d..a1bd473b29 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -88,8 +88,8 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
                   _t ## e_get_intpte(_o), _t ## e_get_intpte(_n),   \
                   (_m), (_v), (_ad))
 
-static inline l1_pgentry_t adjust_guest_l1e(l1_pgentry_t l1e,
-                                            const struct domain *d)
+static always_inline l1_pgentry_t adjust_guest_l1e(l1_pgentry_t l1e,
+                                                   const struct domain *d)
 {
     if ( likely(l1e_get_flags(l1e) & _PAGE_PRESENT) &&
          likely(!is_pv_32bit_domain(d)) )
@@ -120,8 +120,8 @@ static inline l2_pgentry_t adjust_guest_l2e(l2_pgentry_t l2e,
     return l2e;
 }
 
-static inline l3_pgentry_t adjust_guest_l3e(l3_pgentry_t l3e,
-                                            const struct domain *d)
+static always_inline l3_pgentry_t adjust_guest_l3e(l3_pgentry_t l3e,
+                                                   const struct domain *d)
 {
     if ( likely(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         l3e_add_flags(l3e, (likely(!is_pv_32bit_domain(d))
@@ -140,8 +140,8 @@ static inline l3_pgentry_t unadjust_guest_l3e(l3_pgentry_t l3e,
     return l3e;
 }
 
-static inline l4_pgentry_t adjust_guest_l4e(l4_pgentry_t l4e,
-                                            const struct domain *d)
+static always_inline l4_pgentry_t adjust_guest_l4e(l4_pgentry_t l4e,
+                                                   const struct domain *d)
 {
     /*
      * When shadowing an L4 behind the guests back (e.g. for per-pcpu
diff --git a/xen/include/asm-x86/event.h b/xen/include/asm-x86/event.h
index 2f6ea54bcb..98a85233cb 100644
--- a/xen/include/asm-x86/event.h
+++ b/xen/include/asm-x86/event.h
@@ -20,7 +20,7 @@ static inline int vcpu_event_delivery_is_enabled(struct vcpu *v)
 }
 
 int hvm_local_events_need_delivery(struct vcpu *v);
-static inline int local_events_need_delivery(void)
+static always_inline bool local_events_need_delivery(void)
 {
     struct vcpu *v = current;
 
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 8684b83fd6..6ab2041e48 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -202,7 +202,7 @@ static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags)
 
 /* Which pagetable features are supported on this vcpu? */
 
-static inline bool guest_can_use_l2_superpages(const struct vcpu *v)
+static always_inline bool guest_can_use_l2_superpages(const struct vcpu *v)
 {
     /*
      * PV guests use Xen's paging settings.  Being 4-level, 2M
@@ -218,7 +218,7 @@ static inline bool guest_can_use_l2_superpages(const struct vcpu *v)
             (v->arch.hvm.guest_cr[4] & X86_CR4_PSE));
 }
 
-static inline bool guest_can_use_l3_superpages(const struct domain *d)
+static always_inline bool guest_can_use_l3_superpages(const struct domain *d)
 {
     /*
      * There are no control register settings for the hardware pagewalk on the
@@ -252,7 +252,7 @@ static inline bool guest_can_use_pse36(const struct domain *d)
     return paging_mode_hap(d) && cpu_has_pse36;
 }
 
-static inline bool guest_nx_enabled(const struct vcpu *v)
+static always_inline bool guest_nx_enabled(const struct vcpu *v)
 {
     if ( GUEST_PAGING_LEVELS == 2 ) /* NX has no effect witout CR4.PAE. */
         return false;
@@ -261,23 +261,23 @@ static inline bool guest_nx_enabled(const struct vcpu *v)
     return is_pv_vcpu(v) ? cpu_has_nx : hvm_nx_enabled(v);
 }
 
-static inline bool guest_wp_enabled(const struct vcpu *v)
+static always_inline bool guest_wp_enabled(const struct vcpu *v)
 {
     /* PV guests can't control CR0.WP, and it is unconditionally set by Xen. */
     return is_pv_vcpu(v) || hvm_wp_enabled(v);
 }
 
-static inline bool guest_smep_enabled(const struct vcpu *v)
+static always_inline bool guest_smep_enabled(const struct vcpu *v)
 {
     return !is_pv_vcpu(v) && hvm_smep_enabled(v);
 }
 
-static inline bool guest_smap_enabled(const struct vcpu *v)
+static always_inline bool guest_smap_enabled(const struct vcpu *v)
 {
     return !is_pv_vcpu(v) && hvm_smap_enabled(v);
 }
 
-static inline bool guest_pku_enabled(const struct vcpu *v)
+static always_inline bool guest_pku_enabled(const struct vcpu *v)
 {
     return !is_pv_vcpu(v) && hvm_pku_enabled(v);
 }
@@ -285,19 +285,21 @@ static inline bool guest_pku_enabled(const struct vcpu *v)
 /* Helpers for identifying whether guest entries have reserved bits set. */
 
 /* Bits reserved because of maxphysaddr, and (lack of) EFER.NX */
-static inline uint64_t guest_rsvd_bits(const struct vcpu *v)
+static always_inline uint64_t guest_rsvd_bits(const struct vcpu *v)
 {
     return ((PADDR_MASK &
              ~((1ul << v->domain->arch.cpuid->extd.maxphysaddr) - 1)) |
             (guest_nx_enabled(v) ? 0 : put_pte_flags(_PAGE_NX_BIT)));
 }
 
-static inline bool guest_l1e_rsvd_bits(const struct vcpu *v, guest_l1e_t l1e)
+static always_inline bool guest_l1e_rsvd_bits(const struct vcpu *v,
+                                              guest_l1e_t l1e)
 {
     return l1e.l1 & (guest_rsvd_bits(v) | GUEST_L1_PAGETABLE_RSVD);
 }
 
-static inline bool guest_l2e_rsvd_bits(const struct vcpu *v, guest_l2e_t l2e)
+static always_inline bool guest_l2e_rsvd_bits(const struct vcpu *v,
+                                              guest_l2e_t l2e)
 {
     uint64_t rsvd_bits = guest_rsvd_bits(v);
 
@@ -311,7 +313,8 @@ static inline bool guest_l2e_rsvd_bits(const struct vcpu *v, guest_l2e_t l2e)
 }
 
 #if GUEST_PAGING_LEVELS >= 3
-static inline bool guest_l3e_rsvd_bits(const struct vcpu *v, guest_l3e_t l3e)
+static always_inline bool guest_l3e_rsvd_bits(const struct vcpu *v,
+                                              guest_l3e_t l3e)
 {
     return ((l3e.l3 & (guest_rsvd_bits(v) | GUEST_L3_PAGETABLE_RSVD |
                        (guest_can_use_l3_superpages(v->domain) ? 0 : _PAGE_PSE))) ||
@@ -320,7 +323,8 @@ static inline bool guest_l3e_rsvd_bits(const struct vcpu *v, guest_l3e_t l3e)
 }
 
 #if GUEST_PAGING_LEVELS >= 4
-static inline bool guest_l4e_rsvd_bits(const struct vcpu *v, guest_l4e_t l4e)
+static always_inline bool guest_l4e_rsvd_bits(const struct vcpu *v,
+                                              guest_l4e_t l4e)
 {
     return l4e.l4 & (guest_rsvd_bits(v) | GUEST_L4_PAGETABLE_RSVD |
                      ((v->domain->arch.cpuid->x86_vendor == X86_VENDOR_AMD)
diff --git a/xen/include/asm-x86/hvm/nestedhvm.h b/xen/include/asm-x86/hvm/nestedhvm.h
index e09fa9d47d..256fed733a 100644
--- a/xen/include/asm-x86/hvm/nestedhvm.h
+++ b/xen/include/asm-x86/hvm/nestedhvm.h
@@ -33,7 +33,7 @@ enum nestedhvm_vmexits {
 };
 
 /* Nested HVM on/off per domain */
-static inline bool nestedhvm_enabled(const struct domain *d)
+static always_inline bool nestedhvm_enabled(const struct domain *d)
 {
     return is_hvm_domain(d) && d->arch.hvm.params &&
         d->arch.hvm.params[HVM_PARAM_NESTEDHVM];
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 8c2027c791..7544f73121 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -383,7 +383,7 @@ static inline bool gfn_valid(const struct domain *d, gfn_t gfn)
 }
 
 /* Maxphysaddr supportable by the paging infrastructure. */
-static inline unsigned int paging_max_paddr_bits(const struct domain *d)
+static always_inline unsigned int paging_max_paddr_bits(const struct domain *d)
 {
     unsigned int bits = paging_mode_hap(d) ? hap_paddr_bits : paddr_bits;
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 629a4c52e0..9f7bc69293 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -968,50 +968,50 @@ void watchdog_domain_destroy(struct domain *d);
 
 #define VM_ASSIST(d, t) (test_bit(VMASST_TYPE_ ## t, &(d)->vm_assist))
 
-static inline bool is_pv_domain(const struct domain *d)
+static always_inline bool is_pv_domain(const struct domain *d)
 {
     return IS_ENABLED(CONFIG_PV) &&
         evaluate_nospec(!(d->options & XEN_DOMCTL_CDF_hvm));
 }
 
-static inline bool is_pv_vcpu(const struct vcpu *v)
+static always_inline bool is_pv_vcpu(const struct vcpu *v)
 {
     return is_pv_domain(v->domain);
 }
 
 #ifdef CONFIG_COMPAT
-static inline bool is_pv_32bit_domain(const struct domain *d)
+static always_inline bool is_pv_32bit_domain(const struct domain *d)
 {
     return is_pv_domain(d) && d->arch.is_32bit_pv;
 }
 
-static inline bool is_pv_32bit_vcpu(const struct vcpu *v)
+static always_inline bool is_pv_32bit_vcpu(const struct vcpu *v)
 {
     return is_pv_32bit_domain(v->domain);
 }
 
-static inline bool is_pv_64bit_domain(const struct domain *d)
+static always_inline bool is_pv_64bit_domain(const struct domain *d)
 {
     return is_pv_domain(d) && !d->arch.is_32bit_pv;
 }
 
-static inline bool is_pv_64bit_vcpu(const struct vcpu *v)
+static always_inline bool is_pv_64bit_vcpu(const struct vcpu *v)
 {
     return is_pv_64bit_domain(v->domain);
 }
 #endif
-static inline bool is_hvm_domain(const struct domain *d)
+static always_inline bool is_hvm_domain(const struct domain *d)
 {
     return IS_ENABLED(CONFIG_HVM) &&
         evaluate_nospec(d->options & XEN_DOMCTL_CDF_hvm);
 }
 
-static inline bool is_hvm_vcpu(const struct vcpu *v)
+static always_inline bool is_hvm_vcpu(const struct vcpu *v)
 {
     return is_hvm_domain(v->domain);
 }
 
-static inline bool hap_enabled(const struct domain *d)
+static always_inline bool hap_enabled(const struct domain *d)
 {
     /* sanitise_domain_config() rejects HAP && !HVM */
     return IS_ENABLED(CONFIG_HVM) &&
@@ -1034,7 +1034,7 @@ static inline bool is_xenstore_domain(const struct domain *d)
     return d->options & XEN_DOMCTL_CDF_xs_domain;
 }
 
-static inline bool is_iommu_enabled(const struct domain *d)
+static always_inline bool is_iommu_enabled(const struct domain *d)
 {
     return evaluate_nospec(d->options & XEN_DOMCTL_CDF_iommu);
 }
-- 
2.11.0


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Jürgen Groß 6 years, 3 months ago

On 23.10.19 15:58, Andrew Cooper wrote:
> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
> 
> To correctly protect jumps, the generated code needs to be of the form:
> 
>      cmp/test <cond>
>      jcc 1f
>      lfence
>      ...
>   1: lfence
>      ...
> 
> Critically, the lfence must be at the head of both basic blocks, later in the
> instruction stream than the conditional jump in need of protection.
> 
> When a static inline is involved, the optimiser decides to be clever and
> rearranges the code as:
> 
>   pred:
>      lfence
>      <calculate cond>
>      ret
> 
>      call pred
>      cmp $0, %eax
>      jcc 1f
>      ...
>   1: ...
> 
> which breaks the speculative safety.
> 
> Any use of evaluate_nospec() needs all static inline predicates which use it
> to be declared always_inline to prevent the optimiser having the flexibility
> to generate unsafe code.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Release-acked-by: Juergen Gross <jgross@suse.com>


Juergen

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Jan Beulich 6 years, 3 months ago

On 23.10.2019 15:58, Andrew Cooper wrote:
> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
> 
> To correctly protect jumps, the generated code needs to be of the form:
> 
>     cmp/test <cond>
>     jcc 1f
>     lfence
>     ...
>  1: lfence
>     ...
> 
> Critically, the lfence must be at the head of both basic blocks, later in the
> instruction stream than the conditional jump in need of protection.
> 
> When a static inline is involved, the optimiser decides to be clever and
> rearranges the code as:
> 
>  pred:
>     lfence
>     <calculate cond>
>     ret
> 
>     call pred
>     cmp $0, %eax
>     jcc 1f
>     ...
>  1: ...
> 
> which breaks the speculative safety.

Aiui "pred" is a non-inlined static inline here. There's no "optimiser decides
to be clever" in this case imo - it all is a result of not inlining, when the
construct in evaluate_nospec() is specifically assuming this wouldn't happen.
Therefore I continue to think that the description is misleading.

> Any use of evaluate_nospec() needs all static inline predicates which use it
> to be declared always_inline to prevent the optimiser having the flexibility
> to generate unsafe code.

I agree with this part.

> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: George Dunlap <George.Dunlap@eu.citrix.com>
> CC: Ian Jackson <ian.jackson@citrix.com>
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> CC: Stefano Stabellini <sstabellini@kernel.org>
> CC: Wei Liu <wl@xen.org>
> CC: Julien Grall <julien@xen.org>
> CC: Juergen Gross <jgross@suse.com>
> 
> This is the transitive set of predicates which I can spot which need
> protecting.  There are probably ones I've missed.  Personally, I'm -1 for this
> approach, but the only other option for 4.13 is to revert it all to unbreak
> livepatching.

To unbreak livepatching, aiui what you need is symbol disambiguation,
a patch for which has been sent. With this I think we should focus on
code generation aspects here. I'm fine ack-ing the code changes with
a modified description. But since you're -1 for this, I'm not sure in
the first place that we want to go this route.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Andrew Cooper 6 years, 3 months ago

On 25/10/2019 13:03, Jan Beulich wrote:
> On 23.10.2019 15:58, Andrew Cooper wrote:
>> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
>>
>> To correctly protect jumps, the generated code needs to be of the form:
>>
>>     cmp/test <cond>
>>     jcc 1f
>>     lfence
>>     ...
>>  1: lfence
>>     ...
>>
>> Critically, the lfence must be at the head of both basic blocks, later in the
>> instruction stream than the conditional jump in need of protection.
>>
>> When a static inline is involved, the optimiser decides to be clever and
>> rearranges the code as:
>>
>>  pred:
>>     lfence
>>     <calculate cond>
>>     ret
>>
>>     call pred
>>     cmp $0, %eax
>>     jcc 1f
>>     ...
>>  1: ...
>>
>> which breaks the speculative safety.
> Aiui "pred" is a non-inlined static inline here.

Correct, although it actually applies to anything which the compiler
chose to out of line, perhaps even as a side effect of CSE pass.

>  There's no "optimiser decides
> to be clever" in this case imo - it all is a result of not inlining, when the
> construct in evaluate_nospec() is specifically assuming this wouldn't happen.
> Therefore I continue to think that the description is misleading.
>
>> Any use of evaluate_nospec() needs all static inline predicates which use it
>> to be declared always_inline to prevent the optimiser having the flexibility
>> to generate unsafe code.
> I agree with this part.
>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> ---
>> CC: George Dunlap <George.Dunlap@eu.citrix.com>
>> CC: Ian Jackson <ian.jackson@citrix.com>
>> CC: Jan Beulich <JBeulich@suse.com>
>> CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>> CC: Stefano Stabellini <sstabellini@kernel.org>
>> CC: Wei Liu <wl@xen.org>
>> CC: Julien Grall <julien@xen.org>
>> CC: Juergen Gross <jgross@suse.com>
>>
>> This is the transitive set of predicates which I can spot which need
>> protecting.  There are probably ones I've missed.  Personally, I'm -1 for this
>> approach, but the only other option for 4.13 is to revert it all to unbreak
>> livepatching.
> To unbreak livepatching, aiui what you need is symbol disambiguation,
> a patch for which has been sent.

Correct, but..

> With this I think we should focus on
> code generation aspects here. I'm fine ack-ing the code changes with
> a modified description. But since you're -1 for this, I'm not sure in
> the first place that we want to go this route.

... without this change, l1tf-barrier/branch-hardening is still broken,
and a performance overhead.

The two choices to unblock 4.13 are this patch, or the previous version
which made CONFIG_HARDEN_BRANCH depend on BROKEN, which was even more
disliked.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Jan Beulich 6 years, 3 months ago

On 25.10.2019 14:10, Andrew Cooper wrote:
> On 25/10/2019 13:03, Jan Beulich wrote:
>> On 23.10.2019 15:58, Andrew Cooper wrote:
>>> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
>>>
>>> To correctly protect jumps, the generated code needs to be of the form:
>>>
>>>     cmp/test <cond>
>>>     jcc 1f
>>>     lfence
>>>     ...
>>>  1: lfence
>>>     ...
>>>
>>> Critically, the lfence must be at the head of both basic blocks, later in the
>>> instruction stream than the conditional jump in need of protection.
>>>
>>> When a static inline is involved, the optimiser decides to be clever and
>>> rearranges the code as:
>>>
>>>  pred:
>>>     lfence
>>>     <calculate cond>
>>>     ret
>>>
>>>     call pred
>>>     cmp $0, %eax
>>>     jcc 1f
>>>     ...
>>>  1: ...
>>>
>>> which breaks the speculative safety.
>> Aiui "pred" is a non-inlined static inline here.
> 
> Correct, although it actually applies to anything which the compiler
> chose to out of line, perhaps even as a side effect of CSE pass.

Not sure if you're alluding to such, but I've never seen the compiler
out-of-line something that wasn't a function (or perhaps a specialization
of one) at the source level.

>>> This is the transitive set of predicates which I can spot which need
>>> protecting.  There are probably ones I've missed.  Personally, I'm -1 for this
>>> approach, but the only other option for 4.13 is to revert it all to unbreak
>>> livepatching.
>> To unbreak livepatching, aiui what you need is symbol disambiguation,
>> a patch for which has been sent.
> 
> Correct, but..
> 
>> With this I think we should focus on
>> code generation aspects here. I'm fine ack-ing the code changes with
>> a modified description. But since you're -1 for this, I'm not sure in
>> the first place that we want to go this route.
> 
> ... without this change, l1tf-barrier/branch-hardening is still broken,
> and a performance overhead.

Well, it has less of an effect, but it's still better than without any
of this altogether. In some cases code generation is correct, and in
some other cases code generation is at least such that the window size
gets shrunk.

> The two choices to unblock 4.13 are this patch, or the previous version
> which made CONFIG_HARDEN_BRANCH depend on BROKEN, which was even more
> disliked.

Option 3 is to have just the config option, for people to turn this
off if they feel like doing so.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Andrew Cooper 6 years, 3 months ago

On 25/10/2019 13:34, Jan Beulich wrote:
> On 25.10.2019 14:10, Andrew Cooper wrote:
>> On 25/10/2019 13:03, Jan Beulich wrote:
>>> On 23.10.2019 15:58, Andrew Cooper wrote:
>>>> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
>>>>
>>>> To correctly protect jumps, the generated code needs to be of the form:
>>>>
>>>>     cmp/test <cond>
>>>>     jcc 1f
>>>>     lfence
>>>>     ...
>>>>  1: lfence
>>>>     ...
>>>>
>>>> Critically, the lfence must be at the head of both basic blocks, later in the
>>>> instruction stream than the conditional jump in need of protection.
>>>>
>>>> When a static inline is involved, the optimiser decides to be clever and
>>>> rearranges the code as:
>>>>
>>>>  pred:
>>>>     lfence
>>>>     <calculate cond>
>>>>     ret
>>>>
>>>>     call pred
>>>>     cmp $0, %eax
>>>>     jcc 1f
>>>>     ...
>>>>  1: ...
>>>>
>>>> which breaks the speculative safety.
>>> Aiui "pred" is a non-inlined static inline here.
>> Correct, although it actually applies to anything which the compiler
>> chose to out of line, perhaps even as a side effect of CSE pass.
> Not sure if you're alluding to such, but I've never seen the compiler
> out-of-line something that wasn't a function (or perhaps a specialization
> of one) at the source level.

I've seen it with LTO in both Clang and GCC, where the compilers can
safely reason about the lack of side effects in function calls.

>
>>>> This is the transitive set of predicates which I can spot which need
>>>> protecting.  There are probably ones I've missed.  Personally, I'm -1 for this
>>>> approach, but the only other option for 4.13 is to revert it all to unbreak
>>>> livepatching.
>>> To unbreak livepatching, aiui what you need is symbol disambiguation,
>>> a patch for which has been sent.
>> Correct, but..
>>
>>> With this I think we should focus on
>>> code generation aspects here. I'm fine ack-ing the code changes with
>>> a modified description. But since you're -1 for this, I'm not sure in
>>> the first place that we want to go this route.
>> ... without this change, l1tf-barrier/branch-hardening is still broken,
>> and a performance overhead.
> Well, it has less of an effect, but it's still better than without any
> of this altogether.

I certainly don't agree with this conclusion.

> In some cases code generation is correct,

I agree with this, but ...

> and in some other cases code generation is at least such that the window size
> gets shrunk.

... this isn't accurate.  In the case that out-of-lining happens, you
get an lfence earlier in the instruction stream, which serialises an
unrelated bit of logic (hence the perf hit), and does nothing for the
speculation window which the evaluate_nospec() was intending to protect.

>
>> The two choices to unblock 4.13 are this patch, or the previous version
>> which made CONFIG_HARDEN_BRANCH depend on BROKEN, which was even more
>> disliked.
> Option 3 is to have just the config option, for people to turn this
> off if they feel like doing so.

Yes, but no.  A facade of security is worse than no security, and I
don't consider doing that an acceptable solution in this case.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Jan Beulich 6 years, 3 months ago

On 25.10.2019 17:27, Andrew Cooper wrote:
> On 25/10/2019 13:34, Jan Beulich wrote:
>> On 25.10.2019 14:10, Andrew Cooper wrote:
>>> The two choices to unblock 4.13 are this patch, or the previous version
>>> which made CONFIG_HARDEN_BRANCH depend on BROKEN, which was even more
>>> disliked.
>> Option 3 is to have just the config option, for people to turn this
>> off if they feel like doing so.
> 
> Yes, but no.  A facade of security is worse than no security, and I
> don't consider doing that an acceptable solution in this case.

But I thought we all agree that this is something that's presumably
going to remain incomplete (as in not provably complete) altogether
anyway. It's just that without the change here it's far more
incomplete then with it.

In any event I think we should (also) have an opinion from the people
who had originally contributed this logic. You didn't Cc anyone of
them; I've added at least Norbert now.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Andrew Cooper 6 years, 3 months ago

On 25/10/2019 13:03, Jan Beulich wrote:
> On 23.10.2019 15:58, Andrew Cooper wrote:
>> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
>>
>> To correctly protect jumps, the generated code needs to be of the form:
>>
>>     cmp/test <cond>
>>     jcc 1f
>>     lfence
>>     ...
>>  1: lfence
>>     ...
>>
>> Critically, the lfence must be at the head of both basic blocks, later in the
>> instruction stream than the conditional jump in need of protection.
>>
>> When a static inline is involved, the optimiser decides to be clever and
>> rearranges the code as:
>>
>>  pred:
>>     lfence
>>     <calculate cond>
>>     ret
>>
>>     call pred
>>     cmp $0, %eax
>>     jcc 1f
>>     ...
>>  1: ...
>>
>> which breaks the speculative safety.
> Aiui "pred" is a non-inlined static inline here. There's no "optimiser decides
> to be clever" in this case imo - it all is a result of not inlining, when the
> construct in evaluate_nospec() is specifically assuming this wouldn't happen.
> Therefore I continue to think that the description is misleading.
>
>> Any use of evaluate_nospec() needs all static inline predicates which use it
>> to be declared always_inline to prevent the optimiser having the flexibility
>> to generate unsafe code.
> I agree with this part.

How about:

When the compiler chooses to out-of-line the condition calculation (e.g. by
not inlining a predicate), the code layout can end up as:
   
 pred:
    lfence
    <calculate cond>
    ret
   
    call pred
    cmp $0, %eax
    jcc 1f
    ...
 1: ...
   
which breaks the speculative safety, as the lfences are earlier in the
instruction stream than the jump in need of protection.

?

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec

Posted by Jan Beulich 6 years, 3 months ago

On 29.10.2019 17:53, Andrew Cooper wrote:
> On 25/10/2019 13:03, Jan Beulich wrote:
>> On 23.10.2019 15:58, Andrew Cooper wrote:
>>> evaluate_nospec() is incredibly fragile, and this is one giant bodge.
>>>
>>> To correctly protect jumps, the generated code needs to be of the form:
>>>
>>>     cmp/test <cond>
>>>     jcc 1f
>>>     lfence
>>>     ...
>>>  1: lfence
>>>     ...
>>>
>>> Critically, the lfence must be at the head of both basic blocks, later in the
>>> instruction stream than the conditional jump in need of protection.
>>>
>>> When a static inline is involved, the optimiser decides to be clever and
>>> rearranges the code as:
>>>
>>>  pred:
>>>     lfence
>>>     <calculate cond>
>>>     ret
>>>
>>>     call pred
>>>     cmp $0, %eax
>>>     jcc 1f
>>>     ...
>>>  1: ...
>>>
>>> which breaks the speculative safety.
>> Aiui "pred" is a non-inlined static inline here. There's no "optimiser decides
>> to be clever" in this case imo - it all is a result of not inlining, when the
>> construct in evaluate_nospec() is specifically assuming this wouldn't happen.
>> Therefore I continue to think that the description is misleading.
>>
>>> Any use of evaluate_nospec() needs all static inline predicates which use it
>>> to be declared always_inline to prevent the optimiser having the flexibility
>>> to generate unsafe code.
>> I agree with this part.
> 
> How about:
> 
> When the compiler chooses to out-of-line the condition calculation (e.g. by
> not inlining a predicate), the code layout can end up as:
>    
>  pred:
>     lfence
>     <calculate cond>
>     ret
>    
>     call pred
>     cmp $0, %eax
>     jcc 1f
>     ...
>  1: ...
>    
> which breaks the speculative safety, as the lfences are earlier in the
> instruction stream than the jump in need of protection.
> 
> ?

Sounds good, thanks. With this
Acked-by: Jan Beulich <jbeulich@suse.com>

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH v3 1/7] x86/nospec: Two trivial fixes
[Xen-devel] [PATCH v3 2/7] xen/nospec: Use always_inline to fix code gen for evaluate_nospec
[Xen-devel] [PATCH v3 3/7] xen/nospec: Introduce CONFIG_SPECULATIVE_HARDEN_BRANCH
[Xen-devel] [PATCH v3 4/7] x86/nospec: Rename and rework l1tf-barrier as branch-harden
[Xen-devel] [PATCH v3 5/7] x86/livepatch: Fail the build if duplicate symbols exist
[Xen-devel] [PATCH for-next v3 6/7] x86/nospec: Move array_index_mask_nospec() into nospec.h
[Xen-devel] [PATCH v3 7/7] x86/nospec: Optimise array_index_mask_nospec() for power-of-2 arrays