JuliaMath · NHDaly · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024 · Jun 12, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FixedPointDecimals"
 uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0"
 authors = ["Fengyang Wang <[email protected]>", "Curtis Vogt <[email protected]>"]
-version = "0.5.3"
+version = "0.5.4"
 
 [deps]
 BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
@@ -36,9 +36,28 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
 
 using Base: decompose, BitInteger
 
-import BitIntegers  # For 128-bit _widemul / _widen
+using BitIntegers: BitIntegers, Int256, Int512, UInt256, UInt512
 import Parsers
 
+# The effects system in newer versions of Julia is necessary for the fldmod_by_const
+# optimization to take affect without adverse performance impacts.
+# For older versions of julia, we will fall back to the flmod implementation, which works
+# very fast for all FixedDecimals of size <= 64 bits.
+if isdefined(Base, Symbol("@assume_effects"))
+    include("fldmod-by-const.jl")
+else
+    # We force-inline this, to allow the fldmod operation to be specialized for a constant
+    # second argument (converting divide-by-power-of-ten instructions with the cheaper
+    # multiply-by-inverse-coefficient.) Sadly this doesn't work for Int128.
+    if VERSION >= v"1.8.0-"
+        # Since julia 1.8+, the built-in fldmod optimizes for constant divisors, which is
+        # even better than computing the division twice.
+        @inline fldmod_by_const(x,y) = fldmod(x,y)
+    else
+        @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y))
+    end
+end
+
 # floats that support fma and are roughly IEEE-like
 const FMAFloat = Union{Float16, Float32, Float64, BigFloat}
 
@@ -129,8 +148,10 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)
 
 # Custom widen implementation to avoid the cost of widening to BigInt.
 # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
-_widen(::Type{Int128}) = BitIntegers.Int256
-_widen(::Type{UInt128}) = BitIntegers.UInt256
+_widen(::Type{Int128}) = Int256
+_widen(::Type{UInt128}) = UInt256
+_widen(::Type{Int256}) = Int512
+_widen(::Type{UInt256}) = UInt512
 _widen(t::Type) = widen(t)
 _widen(x::T) where {T} = (_widen(T))(x)
 
@@ -176,15 +197,16 @@ function _round_to_nearest(quotient::T,
     halfdivisor = divisor >> 1
     # PERF Note: Only need the last bit to check iseven, and default iseven(Int256)
     # allocates, so we truncate first.
-    if iseven((divisor % Int8)) && remainder == halfdivisor
+    _iseven(x) = (x & 0x1) == 0
+    if _iseven(divisor) && remainder == halfdivisor
         # `:NearestTiesAway` will tie away from zero, e.g. -8.5 ->
         # -9. `:NearestTiesUp` will always ties towards positive
         # infinity. `:Nearest` will tie towards the nearest even
         # integer.
         if M == :NearestTiesAway
             ifelse(quotient < zero(quotient), quotient, quotient + one(quotient))
         elseif M == :Nearest
-            ifelse(iseven(quotient), quotient, quotient + one(quotient))
+            ifelse(_iseven(quotient), quotient, quotient + one(quotient))
         else
             quotient + one(quotient)
         end
@@ -196,18 +218,12 @@ function _round_to_nearest(quotient::T,
 end
 _round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m)
 
-# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since
-# `fldmod` is sometimes not being inlined, that constant information is not available to the
-# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive
-# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient.
-@inline fldmodinline(x,y) = (fld(x,y), mod(x,y))
-
 # multiplication rounds to nearest even representation
 # TODO: can we use floating point to speed this up? after we build a
 # correctness test suite.
 function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
+    quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt)
     reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt))
 end
 
@@ -234,12 +250,12 @@ function Base.round(x::FD{T, f},
                              RoundingMode{:NearestTiesUp},
                              RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(x.i, powt)
+    quotient, remainder = fldmod_by_const(x.i, powt)
     FD{T, f}(_round_to_nearest(quotient, remainder, powt, m))
 end
 function Base.ceil(x::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(x.i, powt)
+    quotient, remainder = fldmod_by_const(x.i, powt)
     if remainder > 0
         FD{T, f}(quotient + one(quotient))
     else
@@ -435,7 +451,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD}
 end
 function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
+    quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt)
     v = _round_to_nearest(quotient, remainder, powt)
     typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y)
     return reinterpret(FD{T, f}, T(v))

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
@@ -0,0 +1,131 @@
+# This file includes a manual implementation of the divide-by-constant optimization for
+# non-power-of-2 divisors. LLVM automatically includes this optimization, but only for
+# smaller integer sizes (<=64-bits on my 64-bit machine).
+#
+# NOTE: We use LLVM's built-in implementation for Int64 and smaller, to keep the code
+# simpler (though the code we produce is identical). We apply this optimization to (U)Int128
+# and (U)Int256, which result from multiplying FD{Int64}s and FD{Int128}s.
+# Before:
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
+#     54.125 μs (0 allocations: 0 bytes)   # (Unchanged)
+#   FixedDecimal{Int32,3}(1700943.280)
+#
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
+#     174.625 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int64,3}(4230510070790917.029)
+#
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
+#     2.119 ms (79986 allocations: 1.60 MiB)
+#   FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
+#
+# After:
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
+#     56.958 μs (0 allocations: 0 bytes)   # (Unchanged)
+#   FixedDecimal{Int32,3}(1700943.280)
+#
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
+#     90.708 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int64,3}(4230510070790917.029)
+#
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
+#     180.167 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
+
+"""
+    ShouldUseCustomFldmodByConst(::Type{<:MyCustomIntType})) = true
+A trait to control opt-in for the custom `fldmod_by_const` implementation. To use this for a
+given integer type, you can define this overload for your integer type.
+You will also need to implement some parts of the interface below, including _widen().
+"""
+ShouldUseCustomFldmodByConst(::Type{<:Union{Int128,UInt128}}) = true  # For FD{Int64}
+ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true  # For FD{Int128}
+ShouldUseCustomFldmodByConst(::Type) = false
+
+@inline function fldmod_by_const(x, y)
+    if ShouldUseCustomFldmodByConst(typeof(x))
+        # For large Int types, LLVM doesn't optimize well, so we use a custom implementation
+        # of fldmod, which extends that optimization to those larger integer types.
+        d = fld_by_const(x, Val(y))
+        return d, (x - d * y)
+    else
+        # For other integers, LLVM might be able to correctly optimize away the division, if
+        # it knows it's dividing by a const.
+        # Since julia 1.8+, fldmod(x,y) automatically optimizes for constant divisors.
+        return fldmod(x, y)
+    end
+end
+
+# Calculate fld(x,y) when y is a Val constant.
+# The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that
+# it uses `div_by_const` instead of `div`.
+fld_by_const(x::T, y::Val{C}) where {T<:Unsigned, C} = div_by_const(x, y)
+function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C}
+    d = div_by_const(x, y)
+    return d - (signbit(x ⊻ C) & (d * C != x))
+end
+
+function div_by_const(x::T, ::Val{C}) where {T, C}
+    # These checks will be compiled away during specialization.
+    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
+    # checks allow this function to work for any `C > 0`, in case that's useful in the
+    # future.
+    if C == 1
+        return x
+    elseif ispow2(C)
+        return div(x, C) # Will already do the right thing
+    elseif C <= 0
+        throw(DomainError("C must be > 0"))
+    end
+    # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10.
+    magic_number, shift = magicg(typemax(T), C)
+
+    out = _widemul(promote(x, magic_number)...)
+    out >>= shift
+    # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4.
+    return (out % T) + (x < zero(T))
+end
+
+# Unsigned magic number computation + shift by constant
+# See Hacker's delight, equations (26) and (27) from Chapter 10-9.
+# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/)
+# requires nmax >= divisor > 2. divisor must not be a power of 2.
+Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
+    T = typeof(nmax)
+    W = _widen(T)
+    d = W(divisor)
+
+    nc = div(W(nmax) + W(1), d) * d - W(1)      # largest multiple of d <= nmax, minus 1
+    nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit
+    # shift must be larger than int size because we want the high bits of the wide multiplication
+    for p in nbits-1:2nbits-1
+        if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d))       # (27)
+            m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26)
+            return (m, p)
+        end
+    end
+    @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax.
+        Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl
+        """
+end
+
+# See Hacker's delight, equations (5) and (6) from Chapter 10-4.
+# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/)
+# requires nmax >= divisor > 2. divisor must not be a power of 2.
+Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
+    T = typeof(nmax)
+    W = _widen(T)
+    d = W(divisor)
+
+    nc = div(W(nmax) + W(1), d) * d - W(1)      # largest multiple of d <= nmax, minus 1
+    nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit
+    # shift must be larger than int size because we want the high bits of the wide multiplication
+    for p in nbits-1:2nbits-1
+        if W(2)^p > nc * (d - rem(W(2)^p, d))       # (6)
+            m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5)
+            return (m, p)
+        end
+    end
+    @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax.
+        Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl
+        """
+end
diff --git a/test/FixedDecimal.jl b/test/FixedDecimal.jl
@@ -1408,3 +1408,25 @@ end
     @test hash(FD2(1//10)) ≠ hash(0.1)
 end
 
+@testset "_widemul" begin
+    _widemul = FixedPointDecimals._widemul
+    Int256, UInt256 = FixedPointDecimals.Int256, FixedPointDecimals.UInt256
+    Int512, UInt512 = FixedPointDecimals.Int512, FixedPointDecimals.UInt512
+
+    @test _widemul(UInt8(3), UInt8(2)) === widemul(UInt8(3), UInt8(2)) === UInt16(6)
+    @test _widemul(Int8(3), UInt8(2)) === widemul(Int8(3), UInt8(2)) === Int16(6)
+    @test _widemul(UInt8(3), Int8(2)) === widemul(UInt8(3), Int8(2)) === Int16(6)
+
+    @test _widemul(UInt16(3), UInt8(2)) === widemul(UInt16(3), UInt8(2)) === UInt32(6)
+    @test _widemul(UInt16(3), Int8(2)) === widemul(UInt16(3), Int8(2)) === Int32(6)
+    @test _widemul(Int16(3), UInt8(2)) === widemul(Int16(3), UInt8(2)) === Int32(6)
+
+    # Custom widenings
+    @test _widemul(UInt128(3), UInt128(2)) === UInt256(6)
+    @test _widemul(Int128(3), UInt128(2)) === Int256(6)
+    @test _widemul(UInt128(3), Int128(2)) === Int256(6)
+
+    @test _widemul(UInt256(3), UInt256(2)) === UInt512(6)
+    @test _widemul(Int256(3), UInt256(2)) === Int512(6)
+    @test _widemul(UInt256(3), Int256(2)) === Int512(6)
+end
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
@@ -0,0 +1,115 @@
+using Test
+using FixedPointDecimals
+
+@testset "div_by_const" begin
+    vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
+    for a_base in vals
+        # Only test negative numbers on `a`, since div_by_const requires b > 0.
+        @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
+            a, b = promote(f(a), f(b))
+            @test FixedPointDecimals.div_by_const(a, Val(b)) == a ÷ b
+        end
+    end
+end
+
+@testset "fldmod_by_const" begin
+    vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
+    for a_base in vals
+        # Only test negative numbers on `a`, since fldmod_by_const requires b > 0.
+        @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
+            a, b = promote(f(a), f(b))
+            @test FixedPointDecimals.fldmod_by_const(a, b) == fldmod(a, b)
+        end
+    end
+end
+
+# We don't actually use fldmod_by_const with 8-bit ints, but they're useful because
+# we can exhaustively test every possible combination, to increase our confidence in
+# the implementation.
+@testset "flmdod_by_const - exhaustive 8-bit" begin
+    @testset for T in (Int8, UInt8)
+        @testset for x in typemin(T) : typemax(T)
+            @testset for y in typemin(T) : typemax(T)
+                y == 0 && continue
+                y == -1 && continue
+                @testset "fldmod($x, $y)" begin
+                    @test fldmod(x, y) == FixedPointDecimals.fldmod_by_const(x, y)
+                end
+            end
+        end
+    end
+end
+
+# 64-bit FD multiplication uses the custom fldmod_by_const implementation.
+# Test a few various different cases to try to ensure it works correctly.
+@testset "fixed decimal multiplication - 64-bit" begin
+    @testset for P in (0,1,2,3,4)
+        @testset for T in (Int64, UInt64)
+            FD = FixedDecimal{T,P}
+
+            function test_multiplies_correctly(fd, x)
+                big = FixedDecimal{BigInt, P}(fd)
+                big_mul = big * x
+                # This might overflow: ...
+                mul = fd * x
+                @testset "$fd * $x" begin
+                    # ... so we truncate big to the same size
+                    @test big_mul.i % T == mul.i % T
+                end
+            end
+            num_tests = 2<<11
+            # Add one to avoid powers-of-2 to get hopefully better tests
+            epsilon = (typemax(FD) ÷ num_tests) + eps(FD)
+            @testset for v in typemin(FD) : epsilon : typemax(FD)
+                test_multiplies_correctly(v, typemin(T))
+                test_multiplies_correctly(v, -1)
+                test_multiplies_correctly(v, -eps(FD))
+                test_multiplies_correctly(v, 0)
+                test_multiplies_correctly(v, eps(FD))
+                test_multiplies_correctly(v, 1)
+                test_multiplies_correctly(v, 2)
+                test_multiplies_correctly(v, 3)
+                test_multiplies_correctly(v, typemax(T))
+            end
+        end
+    end
+end
+
+@testset "fixed decimal multiplication - 128-bit" begin
+    @testset for P in 0:37
+        @testset for T in (Int128, UInt128)
+            FD = FixedDecimal{T,P}
+
+            function test_multiplies_correctly(fd, x)
+                big = FixedDecimal{BigInt, P}(fd)
+                big_mul = big * x
+                # This might overflow: ...
+                mul = fd * x
+                @testset "$fd * $x" begin
+                    # ... so we truncate big to the same size
+                    @test big_mul.i % T == mul.i % T
+                end
+            end
+            vals = FD[
+                typemin(FD), typemax(FD),
+                typemin(FD) + eps(FD), typemax(FD) - eps(FD),
+                0.0, eps(FD), 0.1, 1.0, 2.0,
+                typemax(FD) ÷ 2,
+            ]
+            if T <: Signed
+                append!(vals, vals.*-1)
+            end
+            @testset for v in vals
+                test_multiplies_correctly(v, typemin(T))
+                test_multiplies_correctly(v, -1)
+                test_multiplies_correctly(v, -eps(FD))
+                test_multiplies_correctly(v, 0)
+                test_multiplies_correctly(v, eps(FD))
+                test_multiplies_correctly(v, 1)
+                test_multiplies_correctly(v, 2)
+                test_multiplies_correctly(v, 3)
+                test_multiplies_correctly(v, typemax(T))
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -10,4 +10,6 @@ include(joinpath(pkg_path, "test", "utils.jl"))
 
 @testset "FixedPointDecimals" begin
     include("FixedDecimal.jl")
-end  # global testset
+    isdefined(Base, Symbol("@assume_effects")) && include("fldmod-by-const_tests.jl")
+end
+