Skip to content

Commit c35ec36

Browse files
Zentrik and maleadt
authored and committed
Add test.
1 parent c92379f commit c35ec36

File tree

2 files changed

+24
-0
lines changed

2 files changed

+24
-0
lines changed

src/compiler/execution.jl

+1
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,7 @@ The following keyword arguments are supported:
306306
supported on LLVM 4.0+)
307307
- `name`: override the name that the kernel will have in the generated code
308308
- `always_inline`: inline all function calls in the kernel
309+
- `fastmath`: use less precise square roots and flush denormals
309310
310311
The output of this function is automatically cached, i.e. you can simply call `cufunction`
311312
in a hot path without degrading performance. New code will be generated automatically, when

test/core/codegen.jl

+23
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,29 @@ end
157157
@test !occursin(".local", asm)
158158
end
159159

160+
# Check the PTX generated for square roots and divisions, with and without the
# `fastmath` keyword argument passed to `CUDA.code_ptx`.
@testset "fastmath" begin
    # Overwrite the element selected by `threadIdx().x` with its square root.
    function sqrt_kernel(x)
        tid = threadIdx().x
        @inbounds x[tid] = sqrt(x[tid])
        return
    end

    # Overwrite the element selected by `threadIdx().x` with its reciprocal; the
    # division is wrapped in `@fastmath`.
    function div_kernel(x)
        tid = threadIdx().x
        @fastmath @inbounds x[tid] = 1 / x[tid]
        return
    end

    # Every compilation below uses the same one-dimensional Float32 argument type.
    argtypes = Tuple{CuDeviceArray{Float32,1,AS.Global}}

    # Default compilation: the PTX is expected to contain `sqrt.r`.
    default_sqrt_ptx = sprint() do io
        CUDA.code_ptx(io, sqrt_kernel, argtypes)
    end
    @test occursin("sqrt.r", default_sqrt_ptx)

    # With `fastmath=true` the PTX is expected to contain `sqrt.approx.ftz`.
    fast_sqrt_ptx = sprint() do io
        CUDA.code_ptx(io, sqrt_kernel, argtypes; fastmath=true)
    end
    @test occursin("sqrt.approx.ftz", fast_sqrt_ptx)

    # With `fastmath=true` the division is expected to show up as `div.approx.ftz`.
    fast_div_ptx = sprint() do io
        CUDA.code_ptx(io, div_kernel, argtypes; fastmath=true)
    end
    @test occursin("div.approx.ftz", fast_div_ptx)
end
182+
160183
end
161184

162185
############################################################################################

0 commit comments

Comments
 (0)