@@ -158,26 +158,29 @@ end
158
158
end
159
159
160
160
@testset " fastmath" begin
161
- function sqrt_kernel (x)
162
- i = threadIdx (). x
163
- @inbounds x[i] = sqrt (x[i])
164
- return
165
- end
166
-
167
161
function div_kernel (x)
168
162
i = threadIdx (). x
169
163
@fastmath @inbounds x[i] = 1 / x[i]
170
164
return
171
165
end
172
166
173
- asm = sprint (io-> CUDA. code_ptx (io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1 ,AS. Global}}))
174
- @test occursin (" sqrt.r" , asm)
175
-
176
- asm = sprint (io-> CUDA. code_ptx (io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1 ,AS. Global}}; fastmath= true ))
177
- @test occursin (" sqrt.approx.ftz" , asm)
178
-
179
167
asm = sprint (io-> CUDA. code_ptx (io, div_kernel, Tuple{CuDeviceArray{Float32,1 ,AS. Global}}; fastmath= true ))
180
168
@test occursin (" div.approx.ftz" , asm)
169
+
170
+ # libdevice only contains fast math versions of sqrt for CUDA 11.1+
171
+ if CUDA. runtime_version () >= v " 11.1"
172
+ function sqrt_kernel (x)
173
+ i = threadIdx (). x
174
+ @inbounds x[i] = sqrt (x[i])
175
+ return
176
+ end
177
+
178
+ asm = sprint (io-> CUDA. code_ptx (io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1 ,AS. Global}}))
179
+ @test occursin (" sqrt.r" , asm)
180
+
181
+ asm = sprint (io-> CUDA. code_ptx (io, sqrt_kernel, Tuple{CuDeviceArray{Float32,1 ,AS. Global}}; fastmath= true ))
182
+ @test occursin (" sqrt.approx.ftz" , asm)
183
+ end
181
184
end
182
185
183
186
end
0 commit comments