[Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction

Posted by Emilio G. Cota 7 years, 7 months ago
Performance results (single and double precision) for
fp-bench run under aarch64-linux-user on an Intel(R)
Core(TM) i7-4790K CPU @ 4.00GHz host:

- before:
add-single: 86.74 MFlops
add-double: 86.46 MFlops
sub-single: 83.33 MFlops
sub-double: 84.57 MFlops

- after:
add-single: 188.26 MFlops
add-double: 186.60 MFlops
sub-single: 186.19 MFlops
sub-double: 187.77 MFlops

Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 include/fpu/hostfloat.h |  6 ++++++
 include/fpu/softfloat.h |  8 ++++----
 fpu/hostfloat.c         | 50 +++++++++++++++++++++++++++++++++++++++++++++++++
 fpu/softfloat.c         | 16 ++++++++--------
 4 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/include/fpu/hostfloat.h b/include/fpu/hostfloat.h
index b01291b..db49efa 100644
--- a/include/fpu/hostfloat.h
+++ b/include/fpu/hostfloat.h
@@ -11,4 +11,10 @@
 #error fpu/hostfloat.h must only be included from softfloat.h
 #endif
 
+float32 float32_add(float32 a, float32 b, float_status *status);
+float32 float32_sub(float32 a, float32 b, float_status *status);
+
+float64 float64_add(float64 a, float64 b, float_status *status);
+float64 float64_sub(float64 a, float64 b, float_status *status);
+
 #endif /* HOSTFLOAT_H */
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 8963b68..eb7e9bc 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -342,8 +342,8 @@ float128 float32_to_float128(float32, float_status *status);
 | Software IEC/IEEE single-precision operations.
 *----------------------------------------------------------------------------*/
 float32 float32_round_to_int(float32, float_status *status);
-float32 float32_add(float32, float32, float_status *status);
-float32 float32_sub(float32, float32, float_status *status);
+float32 soft_float32_add(float32, float32, float_status *status);
+float32 soft_float32_sub(float32, float32, float_status *status);
 float32 float32_mul(float32, float32, float_status *status);
 float32 float32_div(float32, float32, float_status *status);
 float32 float32_rem(float32, float32, float_status *status);
@@ -482,8 +482,8 @@ float128 float64_to_float128(float64, float_status *status);
 *----------------------------------------------------------------------------*/
 float64 float64_round_to_int(float64, float_status *status);
 float64 float64_trunc_to_int(float64, float_status *status);
-float64 float64_add(float64, float64, float_status *status);
-float64 float64_sub(float64, float64, float_status *status);
+float64 soft_float64_add(float64, float64, float_status *status);
+float64 soft_float64_sub(float64, float64, float_status *status);
 float64 float64_mul(float64, float64, float_status *status);
 float64 float64_div(float64, float64, float_status *status);
 float64 float64_rem(float64, float64, float_status *status);
diff --git a/fpu/hostfloat.c b/fpu/hostfloat.c
index cab0341..502552b 100644
--- a/fpu/hostfloat.c
+++ b/fpu/hostfloat.c
@@ -94,3 +94,53 @@ GEN_TYPE_CONV(double_to_float64, float64, double)
 GEN_INPUT_FLUSH(float32)
 GEN_INPUT_FLUSH(float64)
 #undef GEN_INPUT_FLUSH
+
+#define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t,              \
+                       host_abs_func, min_normal)                       \
+    static inline __attribute__((always_inline)) soft_t                 \
+    fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract,        \
+                              float_status *s)                          \
+    {                                                                   \
+        soft_t ## _input_flush2(&a, &b, s);                             \
+        if (likely((soft_t ## _is_normal(a) || soft_t ## _is_zero(a)) && \
+                   (soft_t ## _is_normal(b) || soft_t ## _is_zero(b)) && \
+                   s->float_exception_flags & float_flag_inexact &&     \
+                   s->float_rounding_mode == float_round_nearest_even)) { \
+            host_t ha = soft_t ## _to_ ## host_t(a);                    \
+            host_t hb = soft_t ## _to_ ## host_t(b);                    \
+            host_t hr;                                                  \
+            soft_t r;                                                   \
+                                                                        \
+            if (subtract) {                                             \
+                hb = -hb;                                               \
+            }                                                           \
+            hr = ha + hb;                                               \
+            r = host_t ## _to_ ## soft_t(hr);                           \
+            if (unlikely(soft_t ## _is_infinity(r))) {                  \
+                s->float_exception_flags |= float_flag_overflow;        \
+            } else if (unlikely(host_abs_func(hr) <= min_normal)) {     \
+                goto soft;                                              \
+            }                                                           \
+            return r;                                                   \
+        }                                                               \
+    soft:                                                               \
+        if (subtract) {                                                 \
+            return soft_ ## soft_t ## _sub(a, b, s);                    \
+        } else {                                                        \
+            return soft_ ## soft_t ## _add(a, b, s);                    \
+        }                                                               \
+    }                                                                   \
+                                                                        \
+    soft_t add_name(soft_t a, soft_t b, float_status *status)           \
+    {                                                                   \
+        return fpu_ ## soft_t ## _addsub(a, b, false, status);          \
+    }                                                                   \
+                                                                        \
+    soft_t sub_name(soft_t a, soft_t b, float_status *status)           \
+    {                                                                   \
+        return fpu_ ## soft_t ## _addsub(a, b, true, status);           \
+    }                                                                   \
+
+GEN_FPU_ADDSUB(float32_add, float32_sub, float32, float, fabsf, FLT_MIN)
+GEN_FPU_ADDSUB(float64_add, float64_sub, float64, double, fabs, DBL_MIN)
+#undef GEN_FPU_ADDSUB
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index ee615a9..bd82adf 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -738,8 +738,8 @@ float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
-                                             float_status *status)
+float32 __attribute__((flatten)) soft_float32_add(float32 a, float32 b,
+                                                  float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pb = float32_unpack_canonical(b, status);
@@ -748,8 +748,8 @@ float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
-                                             float_status *status)
+float64 __attribute__((flatten)) soft_float64_add(float64 a, float64 b,
+                                                  float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pb = float64_unpack_canonical(b, status);
@@ -768,8 +768,8 @@ float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
-                                             float_status *status)
+float32 __attribute__((flatten)) soft_float32_sub(float32 a, float32 b,
+                                                  float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pb = float32_unpack_canonical(b, status);
@@ -778,8 +778,8 @@ float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
-                                             float_status *status)
+float64 __attribute__((flatten)) soft_float64_sub(float64 a, float64 b,
+                                                  float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pb = float64_unpack_canonical(b, status);
-- 
2.7.4
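
[Note for readers following the macro: the soft_t ## _to_ ## host_t and
host_t ## _to_ ## soft_t calls expand to the reinterpreting conversions
generated by GEN_TYPE_CONV earlier in fpu/hostfloat.c (visible in the
hunk context above). A minimal sketch of what such a pair might look
like, assuming float32 and the host float share the binary32 bit layout;
this is an illustration, not the actual generated code:

#include <string.h>

/* Illustration only: the real helpers come from GEN_TYPE_CONV in
 * fpu/hostfloat.c.  memcpy is the aliasing-safe way to reinterpret
 * the bits, and compilers lower it to a plain register move. */
static inline float float32_to_float(float32 a)
{
    float r;
    memcpy(&r, &a, sizeof(r));
    return r;
}

static inline float32 float_to_float32(float a)
{
    float32 r;
    memcpy(&r, &a, sizeof(r));
    return r;
}
]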


Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Richard Henderson 7 years, 7 months ago
On 03/22/2018 04:11 AM, Emilio G. Cota wrote:
> +#define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t,              \
> +                       host_abs_func, min_normal)                       \
> +    static inline __attribute__((always_inline)) soft_t                 \
> +    fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract,        \
> +                              float_status *s)                          \
> +    {                                                                   \
> +        soft_t ## _input_flush2(&a, &b, s);                             \
> +        if (likely((soft_t ## _is_normal(a) || soft_t ## _is_zero(a)) && \
> +                   (soft_t ## _is_normal(b) || soft_t ## _is_zero(b)) && \
> +                   s->float_exception_flags & float_flag_inexact &&     \
> +                   s->float_rounding_mode == float_round_nearest_even)) { \
> +            host_t ha = soft_t ## _to_ ## host_t(a);                    \
> +            host_t hb = soft_t ## _to_ ## host_t(b);                    \
> +            host_t hr;                                                  \
> +            soft_t r;                                                   \
> +                                                                        \
> +            if (subtract) {                                             \
> +                hb = -hb;                                               \
> +            }                                                           \
> +            hr = ha + hb;                                               \
> +            r = host_t ## _to_ ## soft_t(hr);                           \
> +            if (unlikely(soft_t ## _is_infinity(r))) {                  \
> +                s->float_exception_flags |= float_flag_overflow;        \
> +            } else if (unlikely(host_abs_func(hr) <= min_normal)) {     \
> +                goto soft;                                              \
> +            }                                                           \
> +            return r;                                                   \
> +        }                                                               \
> +    soft:                                                               \

Is there any especially good reason you want to not put this code into the
normal softfloat function?  Does it really make any measurable difference at
all to force this code to be inlined into a helper?


r~

Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Emilio G. Cota 7 years, 7 months ago
On Thu, Mar 22, 2018 at 13:05:00 +0800, Richard Henderson wrote:
> On 03/22/2018 04:11 AM, Emilio G. Cota wrote:
> > +#define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t,              \
> > +                       host_abs_func, min_normal)                       \
> > +    static inline __attribute__((always_inline)) soft_t                 \
> > +    fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract,        \
> > +                              float_status *s)                          \
> > +    {                                                                   \
> > +        soft_t ## _input_flush2(&a, &b, s);                             \
> > +        if (likely((soft_t ## _is_normal(a) || soft_t ## _is_zero(a)) && \
> > +                   (soft_t ## _is_normal(b) || soft_t ## _is_zero(b)) && \
> > +                   s->float_exception_flags & float_flag_inexact &&     \
> > +                   s->float_rounding_mode == float_round_nearest_even)) { \
> > +            host_t ha = soft_t ## _to_ ## host_t(a);                    \
> > +            host_t hb = soft_t ## _to_ ## host_t(b);                    \
> > +            host_t hr;                                                  \
> > +            soft_t r;                                                   \
> > +                                                                        \
> > +            if (subtract) {                                             \
> > +                hb = -hb;                                               \
> > +            }                                                           \
> > +            hr = ha + hb;                                               \
> > +            r = host_t ## _to_ ## soft_t(hr);                           \
> > +            if (unlikely(soft_t ## _is_infinity(r))) {                  \
> > +                s->float_exception_flags |= float_flag_overflow;        \
> > +            } else if (unlikely(host_abs_func(hr) <= min_normal)) {     \
> > +                goto soft;                                              \
> > +            }                                                           \
> > +            return r;                                                   \
> > +        }                                                               \
> > +    soft:                                                               \
> 
> Is there any especially good reason you want to not put this code into the
> normal softfloat function?  Does it really make any measurable difference at
> all to force this code to be inlined into a helper?

You mean to do this? (... or see below)

--- a/fpu/hostfloat.c
+++ b/fpu/hostfloat.c
@@ -97,7 +97,7 @@ GEN_INPUT_FLUSH(float64)

 #define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t,              \
                        host_abs_func, min_normal)                       \
-    static inline __attribute__((always_inline)) soft_t                 \
+    static soft_t                                                       \
     fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract,        \
                               float_status *s)                          \
     {                                                                   \

That slows add/sub dramatically, because addsub is not inlined into
float32_add and float32_sub (that's an extra function call plus an
extra branch per emulated op).

For x86_64-linux-user/qemu-x86_64 tests/fp-bench -o add, the above gives
- before: 188.06 MFlops
- after:  117.56 MFlops



... or did you miss that fpu_##soft_t##_addsub is only called by
soft_t ## _add / sub, which are standalone? That is:

+#define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t,              \
+                       host_abs_func, min_normal)                       \
(...)
+    soft_t add_name(soft_t a, soft_t b, float_status *status)           \
+    {                                                                   \
+        return fpu_ ## soft_t ## _addsub(a, b, false, status);          \
+    }                                                                   \
+                                                                        \
+    soft_t sub_name(soft_t a, soft_t b, float_status *status)           \
+    {                                                                   \
+        return fpu_ ## soft_t ## _addsub(a, b, true, status);           \
+    } 
+
+GEN_FPU_ADDSUB(float32_add, float32_sub, float32, float, fabsf, FLT_MIN)
+GEN_FPU_ADDSUB(float64_add, float64_sub, float64, double, fabs, DBL_MIN)
+#undef GEN_FPU_ADDSUB

Note that add/sub_name expand to float32/64_add/sub; I like having
the full names in the macro to ease grepping.
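
Concretely, after preprocessing, the tail of
GEN_FPU_ADDSUB(float32_add, float32_sub, float32, float, fabsf, FLT_MIN)
becomes:

    float32 float32_add(float32 a, float32 b, float_status *status)
    {
        return fpu_float32_addsub(a, b, false, status);
    }

    float32 float32_sub(float32 a, float32 b, float_status *status)
    {
        return fpu_float32_addsub(a, b, true, status);
    }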

Thanks,

		E.

Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Richard Henderson 7 years, 7 months ago
On 03/22/2018 01:57 PM, Emilio G. Cota wrote:
>> Is there any especially good reason you want to not put this code into the
>> normal softfloat function?  Does it really make any measurable difference at
>> all to force this code to be inlined into a helper?
> 
> You mean to do this? (... or see below)
> 
> --- a/fpu/hostfloat.c
> +++ b/fpu/hostfloat.c
> @@ -97,7 +97,7 @@ GEN_INPUT_FLUSH(float64)
> 
>  #define GEN_FPU_ADDSUB(add_name, sub_name, soft_t, host_t,              \
>                         host_abs_func, min_normal)                       \
> -    static inline __attribute__((always_inline)) soft_t                 \
> +    static soft_t                                                       \
>      fpu_ ## soft_t ## _addsub(soft_t a, soft_t b, bool subtract,        \
>                                float_status *s)                          \
>      {                                                                   \
> 
> That slows add/sub dramatically, because addsub is not inlined into
> float32_add and float32_sub (that's an extra function call plus an
> extra branch per emulated op).
> 
> For x86_64-linux-user/qemu-x86_64 tests/fp-bench -o add, the above gives
> - before: 188.06 MFlops
> - after:  117.56 MFlops

Well, not quite.  I meant putting all of the new code into softfloat.c, and not
attempting to inline any of it in target/cpu/foo_helper.c.

The best-written target helpers currently simply tail-call to softfloat.c.
There are others that do so after minor argument adjustment.  For these, I
have been planning to rearrange things such that e.g. float32_add is called
from TCG directly, with no other function calls at all.

For targets that cannot do that, I simply cannot bring myself to care about the
final percentage points enough to want to introduce extra macros.

Another thought re all of the soft_is_normal || soft_is_zero checks that you're
performing.  I think it would be nice if we could work with
float*_unpack_canonical so that we don't have to duplicate work.  E.g.

/* Return true for float_class_normal || float_class_zero.  */
static inline bool is_finite(FloatClass c) { return c <= float_class_zero; }

float32 float32_add(float32 a, float32 b, float_status *s)
{
  FloatClass a_cls = float32_classify(a);
  FloatClass b_cls = float32_classify(b);

  if (is_finite(a_cls) && is_finite(b_cls) && ...) {
      /* do hardfp thing */
  }

  pa = float32_unpack(a, ca, s);
  pb = float32_unpack(b, cb, s);
  pr = addsub_floats(pa, pb, s, false);
  return float32_round_pack(pr, s);
}

Where float32_classify produces Normal/Zero/Inf/NaN and might avoid duplicate
work within float32_unpack.
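
A rough sketch of such a classifier, to make the idea concrete.
float32_val() is the existing bit accessor from softfloat.h;
float_class_normal and float_class_zero are from the sketch above, while
float_class_inf and float_class_qnan are assumed names here:

static inline FloatClass float32_classify(float32 a)
{
    /* Standard binary32 layout: 1 sign, 8 exponent, 23 fraction bits. */
    uint32_t exp = (float32_val(a) >> 23) & 0xff;
    uint32_t frac = float32_val(a) & 0x7fffff;

    if (exp == 0xff) {
        /* NaN or infinity; the qNaN/sNaN distinction is elided. */
        return frac ? float_class_qnan : float_class_inf;
    }
    if (exp == 0) {
        /* frac != 0 here is a denormal; real code must give it its
         * own class so the fast path can reject it (see the denormal
         * discussion below).  Only +/-0 is treated as zero. */
        return frac ? float_class_normal : float_class_zero;
    }
    return float_class_normal;
}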


r~

Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Emilio G. Cota 7 years, 7 months ago
On Thu, Mar 22, 2018 at 14:41:05 +0800, Richard Henderson wrote:
> On 03/22/2018 01:57 PM, Emilio G. Cota wrote:
> >> Is there any especially good reason you want to not put this code into the
> >> normal softfloat function?  Does it really make any measurable difference at
> >> all to force this code to be inlined into a helper?
(snip)
> Well, not quite.  I meant putting all of the new code into softfloat.c,

I put all this in a separate file because I didn't want to spend time
trying to understand the licensing of softfloat. It isn't clear to
me whether what goes into softfloat.[ch] is only GPL'ed or also BSD'ed.

> and not
> attempting to inline any of it in target/cpu/foo_helper.c.

Unless I've made a mistake, I have not tried to inline anything in
target/*helper*.c

> The best-written target helpers currently simply tail-call to softfloat.c.
> There are others that do so after minor argument adjustment.  For these, I
> have been planning to rearrange things such that e.g. float32_add is called
> from TCG directly, with no other function calls at all.

Yes, this is definitely worth trying.

> For targets that cannot do that, I simply cannot bring myself to care about the
> final percentage points enough to want to introduce extra macros.

Agreed.

> Another thought re all of the soft_is_normal || soft_is_zero checks that you're
> performing.  I think it would be nice if we could work with
> float*_unpack_canonical so that we don't have to duplicate work.  E.g.
> 
> /* Return true for float_class_normal || float_class_zero.  */
> static inline bool is_finite(FloatClass c) { return c <= float_class_zero; }
> 
> float32 float32_add(float32 a, float32 b, float_status *s)
> {
>   FloatClass a_cls = float32_classify(a);
>   FloatClass b_cls = float32_classify(b);
> 
>   if (is_finite(a_cls) && is_finite(b_cls) && ...) {
>       /* do hardfp thing */
>   }
> 
>   pa = float32_unpack(a, ca, s);
>   pb = float32_unpack(b, cb, s);
>   pr = addsub_floats(pa, pb, s, false);
>   return float32_round_pack(pr, s);
> }
> 
> Where float32_classify produces Normal/Zero/Inf/NaN and might avoid duplicate
> work within float32_unpack.

I'll look into this.

Thanks,

		Emilio

Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Laurent Vivier 7 years, 7 months ago
On 22/03/2018 at 16:08, Emilio G. Cota wrote:
> On Thu, Mar 22, 2018 at 14:41:05 +0800, Richard Henderson wrote:
>> On 03/22/2018 01:57 PM, Emilio G. Cota wrote:
>>>> Is there any especially good reason you want to not put this code into the
>>>> normal softfloat function?  Does it really make any measurable difference at
>>>> all to force this code to be inlined into a helper?
> (snip)
>> Well, not quite.  I meant putting all of the new code into softfloat.c,
> 
> I put all this in a separate file because I didn't want to spend time
> trying to understand the licensing of softfloat. It isn't clear to
> me whether what goes into softfloat.[ch] is only GPL'ed or also BSD'ed.

It's in the header of softfloat.c:

...
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

and

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

So you can choose your license; just say which one you want.

Thanks,
Laurent



Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Emilio G. Cota 7 years, 7 months ago
On Thu, Mar 22, 2018 at 14:41:05 +0800, Richard Henderson wrote:
(snip)
> Another thought re all of the soft_is_normal || soft_is_zero checks that you're
> performing.  I think it would be nice if we could work with
> float*_unpack_canonical so that we don't have to duplicate work.  E.g.
> 
> /* Return true for float_class_normal || float_class_zero.  */
> static inline bool is_finite(FloatClass c) { return c <= float_class_zero; }
> 
> float32 float32_add(float32 a, float32 b, float_status *s)
> {
>   FloatClass a_cls = float32_classify(a);
>   FloatClass b_cls = float32_classify(b);

Just looked at this. It can be done, although it comes at the
price of some performance for fp-bench -o add:
180 MFlops vs. 196 MFlops, i.e. an 8% slowdown. That is with
adequate inlining etc.; otherwise perf is worse.

I'm not convinced that we can gain much in simplicity to
justify the perf impact. Yes, we'd simplify canonicalize(),
but we'd probably need a float_class_denormal[*], which
would complicate everything else.

I think it makes sense to keep some inlines that work on
the float32/64's directly.

>   if (is_finite(a_cls) && is_finite(b_cls) && ...) {
>       /* do hardfp thing */
>   }

[*] Taking 0, denormals and normals would be OK correctness-wise,
but we really don't want to compute ops with denormal inputs on
the host; it is very likely that the output will also be denormal,
and we'll end up deferring to soft-fp anyway to avoid
computing whether the underflow exception has occurred,
which is expensive.

>   pa = float32_unpack(a, ca, s);
>   pb = float32_unpack(b, cb, s);
>   pr = addsub_floats(pa, pb, s, false);
>   return float32_round_pack(pr, s);
> }

It pays off to have two separate functions (add & sub) for the
slow path. With soft_f32_add/sub factored out:

$ taskset -c 0 x86_64-linux-user/qemu-x86_64 tests/fp-bench -o add
197.53 MFlops

With the above four lines (pa...return) as an else branch:
169.16 MFlops

BTW flattening makes things worse (150.63 MFlops).

Note that fp-bench only tests normal numbers. But I think it's fair
to assume that this is the path we want to speed up.
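
For reference, the factored-out slow path in that experiment has
roughly this shape (a sketch: soft_f32_add/sub are file-local names,
and the exact argument order of the internal addsub_floats helper may
differ):

static float32 soft_f32_add(float32 a, float32 b, float_status *s)
{
    FloatParts pa = float32_unpack_canonical(a, s);
    FloatParts pb = float32_unpack_canonical(b, s);
    FloatParts pr = addsub_floats(pa, pb, false, s);

    return float32_round_pack_canonical(pr, s);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *s)
{
    FloatParts pa = float32_unpack_canonical(a, s);
    FloatParts pb = float32_unpack_canonical(b, s);
    FloatParts pr = addsub_floats(pa, pb, true, s);

    return float32_round_pack_canonical(pr, s);
}

/* float32_add/sub try the hard-float fast path first and then
 * tail-call the matching slow-path function, which avoids an extra
 * subtract argument and branch on the common path. */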

Thanks,

		E.

Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Alex Bennée 7 years, 7 months ago
Emilio G. Cota <cota@braap.org> writes:

> On Thu, Mar 22, 2018 at 14:41:05 +0800, Richard Henderson wrote:
> (snip)
>> Another thought re all of the soft_is_normal || soft_is_zero checks that you're
>> performing.  I think it would be nice if we could work with
>> float*_unpack_canonical so that we don't have to duplicate work.  E.g.
>>
>> /* Return true for float_class_normal || float_class_zero.  */
>> static inline bool is_finite(FloatClass c) { return c <= float_class_zero; }
>>
>> float32 float32_add(float32 a, float32 b, float_status *s)
>> {
>>   FloatClass a_cls = float32_classify(a);
>>   FloatClass b_cls = float32_classify(b);
>
> Just looked at this. It can be done, although it comes at the
> price of some performance for fp-bench -o add:
> 180 MFlops vs. 196 MFlops, i.e. an 8% slowdown. That is with
> adequate inlining etc.; otherwise perf is worse.
>
> I'm not convinced that we can gain much in simplicity to
> justify the perf impact. Yes, we'd simplify canonicalize(),
> but we'd probably need a float_class_denormal[*], which
> would complicate everything else.
>
> I think it makes sense to keep some inlines that work on
> the float32/64's directly.
>
>>   if (is_finite(a_cls) && is_finite(b_cls) && ...) {
>>       /* do hardfp thing */
>>   }
>
> [*] Taking 0, denormals and normals would be OK correctness-wise,
> but we really don't want to compute ops with denormal inputs on
> the host; it is very likely that the output will also be denormal,
> and we'll end up deferring to soft-fp anyway to avoid
> computing whether the underflow exception has occurred,
> which is expensive.
>
>>   pa = float32_unpack(a, ca, s);
>>   pb = float32_unpack(b, cb, s);
>>   pr = addsub_floats(pa, pb, s, false);
>>   return float32_round_pack(pr, s);
>> }
>
> It pays off to have two separate functions (add & sub) for the
> slow path. With soft_f32_add/sub factored out:
>
> $ taskset -c 0 x86_64-linux-user/qemu-x86_64 tests/fp-bench -o add
> 197.53 MFlops
>
> With the above four lines (pa...return) as an else branch:
> 169.16 MFlops
>
> BTW flattening makes things worse (150.63 MFlops).

That's disappointing. Did you look at the generated code? Given the
way we are abusing __flatten__ to effectively make a compile-time
template, you would hope the compiler could pull the relevant classify
bits to before the hard-float branch and do the rest later if needed.

Everything was inline or in softfloat.c for this test, right?

>
> Note that fp-bench only tests normal numbers. But I think it's fair
> to assume that this is the path we want to speed up.
>
> Thanks,
>
> 		E.


--
Alex Bennée

Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction
Posted by Emilio G. Cota 7 years, 7 months ago
On Tue, Mar 27, 2018 at 12:41:18 +0100, Alex Bennée wrote:
> 
> Emilio G. Cota <cota@braap.org> writes:
> 
> > On Thu, Mar 22, 2018 at 14:41:05 +0800, Richard Henderson wrote:
> > (snip)
> >> Another thought re all of the soft_is_normal || soft_is_zero checks that you're
> >> performing.  I think it would be nice if we could work with
> >> float*_unpack_canonical so that we don't have to duplicate work.  E.g.
> >>
> >> /* Return true for float_class_normal || float_class_zero.  */
> >> static inline bool is_finite(FloatClass c) { return c <= float_class_zero; }
> >>
> >> float32 float32_add(float32 a, float32 b, float_status *s)
> >> {
> >>   FloatClass a_cls = float32_classify(a);
> >>   FloatClass b_cls = float32_classify(b);
> >
> > Just looked at this. It can be done, although it comes at the
> > price of some performance for fp-bench -o add:
> > 180 MFlops vs. 196 MFlops, i.e. an 8% slowdown. That is with
> > adequate inlining etc.; otherwise perf is worse.
> >
> > I'm not convinced that we can gain much in simplicity to
> > justify the perf impact. Yes, we'd simplify canonicalize(),
> > but we'd probably need a float_class_denormal[*], which
> > would complicate everything else.
> >
> > I think it makes sense to keep some inlines that work on
> > the float32/64's directly.
> >
> >>   if (is_finite(a_cls) && is_finite(b_cls) && ...) {
> >>       /* do hardfp thing */
> >>   }
> >
> > [*] Taking 0, denormals and normals would be OK correctness-wise,
> > but we really don't want to compute ops with denormal inputs on
> > the host; it is very likely that the output will also be denormal,
> > and we'll end up deferring to soft-fp anyway to avoid
> > computing whether the underflow exception has occurred,
> > which is expensive.
> >
> >>   pa = float32_unpack(a, ca, s);
> >>   pb = float32_unpack(b, cb, s);
> >>   pr = addsub_floats(pa, pb, s, false);
> >>   return float32_round_pack(pr, s);
> >> }
> >
> > It pays off to have two separate functions (add & sub) for the
> > slow path. With soft_f32_add/sub factored out:
> >
> > $ taskset -c 0 x86_64-linux-user/qemu-x86_64 tests/fp-bench -o add
> > 197.53 MFlops
> >
> > With the above four lines (pa...return) as an else branch:
> > 169.16 MFlops
> >
> > BTW flattening makes things worse (150.63 MFlops).
> 
> That's disappointing. Did you look at the generated code? Given the
> way we are abusing __flatten__ to effectively make a compile-time
> template, you would hope the compiler could pull the relevant classify
> bits to before the hard-float branch and do the rest later if needed.
> 
> Everything was inline or in softfloat.c for this test, right?

Yes. It's just that the classify bits are more expensive than
the alternative, which is fair enough once you look at what
classify() has to do.

		E.