Skip to content

Commit 79920db

Browse files
stev47musm
andauthoredApr 25, 2021
implement faster floating-point isless (#39090)
* implement faster floating-point `isless` Previously `isless` relied on the C intrinsic `fpislt` in `src/runtime_intrinsics.c`, while the new implementation in Julia arguably generates better code, namely: 1. The NaN-check compiles to a single instruction + branch amenable for branch prediction in arguably most usecases (i.e. comparing non-NaN floats), thus speeding up execution. 2. The compiler now often manages to remove NaN-computation if the embedding code has already proven the arguments to be non-NaN. 3. The actual operation compares both arguments as sign-magnitude integers instead of case analysis based on the sign of one argument. This symmetric treatment may generate vectorized instructions for the sign-magnitude conversion depending on how the arguments are layed out. The actual behaviour of `isless` did not change and apart from the Julia-specific NaN-handling (which may be up for debate) the resulting total order corresponds to the IEEE-754 specified `totalOrder`. While the new implementation no longer generates fully branchless code I did not manage to construct a usecase where this was detrimental: the saved work seems to outweight the potential cost of a branch misprediction in all of my tests with various NaN-polluted data. Also auto-vectorization was not effective on the previous `fpislt` either. Quick benchmarks (AMD A10-7860K) on `sort`, avoiding the specialized algorithm: ```julia a = rand(1000); @Btime sort($a, lt=(a,b)->isless(a,b)); # before: 56.030 μs (1 allocation: 7.94 KiB) # after: 40.853 μs (1 allocation: 7.94 KiB) a = rand(1000000); @Btime sort($a, lt=(a,b)->isless(a,b)); # before: 159.499 ms (2 allocations: 7.63 MiB) # after: 120.536 ms (2 allocations: 7.63 MiB) a = [rand((rand(), NaN)) for _ in 1:1000000]; @Btime sort($a, lt=(a,b)->isless(a,b)); # before: 111.925 ms (2 allocations: 7.63 MiB) # after: 77.669 ms (2 allocations: 7.63 MiB) ``` * Remove old intrinsic fpslt code Co-authored-by: Mustafa Mohamad <mus-m@outlook.com>
1 parent 248c02f commit 79920db

7 files changed

+13
-50
lines changed
 

‎.clang-format

-1
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ StatementMacros:
109109
- checked_intrinsic_ctype
110110
- cvt_iintrinsic
111111
- fpiseq_n
112-
- fpislt_n
113112
- ter_fintrinsic
114113
- ter_intrinsic_ctype
115114
- un_fintrinsic

‎base/compiler/tfuncs.jl

-1
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ add_tfunc(ne_float, 2, 2, cmp_tfunc, 2)
193193
add_tfunc(lt_float, 2, 2, cmp_tfunc, 2)
194194
add_tfunc(le_float, 2, 2, cmp_tfunc, 2)
195195
add_tfunc(fpiseq, 2, 2, cmp_tfunc, 1)
196-
add_tfunc(fpislt, 2, 2, cmp_tfunc, 1)
197196
add_tfunc(eq_float_fast, 2, 2, cmp_tfunc, 1)
198197
add_tfunc(ne_float_fast, 2, 2, cmp_tfunc, 1)
199198
add_tfunc(lt_float_fast, 2, 2, cmp_tfunc, 1)

‎base/float.jl

+13-3
Original file line numberDiff line numberDiff line change
@@ -439,9 +439,19 @@ end
439439
isequal(x::Float16, y::Float16) = fpiseq(x, y)
440440
isequal(x::Float32, y::Float32) = fpiseq(x, y)
441441
isequal(x::Float64, y::Float64) = fpiseq(x, y)
442-
isless( x::Float16, y::Float16) = fpislt(x, y)
443-
isless( x::Float32, y::Float32) = fpislt(x, y)
444-
isless( x::Float64, y::Float64) = fpislt(x, y)
442+
443+
# interpret as sign-magnitude integer
444+
@inline function _fpint(x)
445+
IntT = inttype(typeof(x))
446+
ix = reinterpret(IntT, x)
447+
return ifelse(ix < zero(IntT), ix typemax(IntT), ix)
448+
end
449+
450+
@inline function isless(a::T, b::T) where T<:IEEEFloat
451+
(isnan(a) || isnan(b)) && return !isnan(a)
452+
453+
return _fpint(a) < _fpint(b)
454+
end
445455

446456
# Exact Float (Tf) vs Integer (Ti) comparisons
447457
# Assumes:

‎src/intrinsics.cpp

-21
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ static void jl_init_intrinsic_functions_codegen(void)
5858
float_func[lt_float_fast] = true;
5959
float_func[le_float_fast] = true;
6060
float_func[fpiseq] = true;
61-
float_func[fpislt] = true;
6261
float_func[abs_float] = true;
6362
float_func[copysign_float] = true;
6463
float_func[ceil_llvm] = true;
@@ -1165,26 +1164,6 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg
11651164
ctx.builder.CreateICmpEQ(xi, yi));
11661165
}
11671166

1168-
case fpislt: {
1169-
*newtyp = jl_bool_type;
1170-
Type *it = INTT(t);
1171-
Value *xi = ctx.builder.CreateBitCast(x, it);
1172-
Value *yi = ctx.builder.CreateBitCast(y, it);
1173-
return ctx.builder.CreateOr(
1174-
ctx.builder.CreateAnd(
1175-
ctx.builder.CreateFCmpORD(x, x),
1176-
ctx.builder.CreateFCmpUNO(y, y)),
1177-
ctx.builder.CreateAnd(
1178-
ctx.builder.CreateFCmpORD(x, y),
1179-
ctx.builder.CreateOr(
1180-
ctx.builder.CreateAnd(
1181-
ctx.builder.CreateICmpSGE(xi, ConstantInt::get(it, 0)),
1182-
ctx.builder.CreateICmpSLT(xi, yi)),
1183-
ctx.builder.CreateAnd(
1184-
ctx.builder.CreateICmpSLT(xi, ConstantInt::get(it, 0)),
1185-
ctx.builder.CreateICmpUGT(xi, yi)))));
1186-
}
1187-
11881167
case and_int: return ctx.builder.CreateAnd(x, y);
11891168
case or_int: return ctx.builder.CreateOr(x, y);
11901169
case xor_int: return ctx.builder.CreateXor(x, y);

‎src/intrinsics.h

-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
ALIAS(lt_float_fast, lt_float) \
4646
ALIAS(le_float_fast, le_float) \
4747
ADD_I(fpiseq, 2) \
48-
ADD_I(fpislt, 2) \
4948
/* bitwise operators */ \
5049
ADD_I(and_int, 2) \
5150
ADD_I(or_int, 2) \

‎src/julia_internal.h

-1
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,6 @@ JL_DLLEXPORT jl_value_t *jl_ne_float(jl_value_t *a, jl_value_t *b);
11141114
JL_DLLEXPORT jl_value_t *jl_lt_float(jl_value_t *a, jl_value_t *b);
11151115
JL_DLLEXPORT jl_value_t *jl_le_float(jl_value_t *a, jl_value_t *b);
11161116
JL_DLLEXPORT jl_value_t *jl_fpiseq(jl_value_t *a, jl_value_t *b);
1117-
JL_DLLEXPORT jl_value_t *jl_fpislt(jl_value_t *a, jl_value_t *b);
11181117

11191118
JL_DLLEXPORT jl_value_t *jl_not_int(jl_value_t *a);
11201119
JL_DLLEXPORT jl_value_t *jl_and_int(jl_value_t *a, jl_value_t *b);

‎src/runtime_intrinsics.c

-22
Original file line numberDiff line numberDiff line change
@@ -834,33 +834,11 @@ fpiseq_n(double, 64)
834834
#define fpiseq(a,b) \
835835
sizeof(a) == sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
836836

837-
#define fpislt_n(c_type, nbits) \
838-
static inline int fpislt##nbits(c_type a, c_type b) JL_NOTSAFEPOINT \
839-
{ \
840-
bits##nbits ua, ub; \
841-
ua.f = a; \
842-
ub.f = b; \
843-
if (!isnan(a) && isnan(b)) \
844-
return 1; \
845-
if (isnan(a) || isnan(b)) \
846-
return 0; \
847-
if (ua.d >= 0 && ua.d < ub.d) \
848-
return 1; \
849-
if (ua.d < 0 && ua.ud > ub.ud) \
850-
return 1; \
851-
return 0; \
852-
}
853-
fpislt_n(float, 32)
854-
fpislt_n(double, 64)
855-
#define fpislt(a, b) \
856-
sizeof(a) == sizeof(float) ? fpislt32(a, b) : fpislt64(a, b)
857-
858837
bool_fintrinsic(eq,eq_float)
859838
bool_fintrinsic(ne,ne_float)
860839
bool_fintrinsic(lt,lt_float)
861840
bool_fintrinsic(le,le_float)
862841
bool_fintrinsic(fpiseq,fpiseq)
863-
bool_fintrinsic(fpislt,fpislt)
864842

865843
// bitwise operators
866844
#define and_op(a,b) a & b

0 commit comments

Comments
 (0)
Please sign in to comment.