|
16 | 16 |
|
17 | 17 | //================================================================================
|
18 | 18 | // this file has been auto-generated, do not modify its contents!
|
19 |
| -// date: 2024-05-17 12:31:58.621011 |
20 |
| -// git hash: cc083808180f25c4addd969a3387e3c9fd14fa35 |
| 19 | +// date: 2024-05-31 13:40:42.460927 |
| 20 | +// git hash: 2e7077de71fb2bede6d4e666126ed1d7c4a88da4 |
21 | 21 | //================================================================================
|
22 | 22 |
|
23 | 23 | #ifndef KERNEL_FLOAT_MACROS_H
|
@@ -1345,26 +1345,26 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan)
|
1345 | 1345 | KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf)
|
1346 | 1346 | KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf)
|
1347 | 1347 |
|
1348 |
| -#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \ |
| 1348 | +#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \ |
1349 | 1349 | namespace detail { \
|
1350 | 1350 | template<size_t N> \
|
1351 | 1351 | struct apply_fastmath_impl<ops::F<T>, N, T, T> { \
|
1352 | 1352 | KERNEL_FLOAT_INLINE static void call(ops::F<T> fun, T* result, const T* inputs) { \
|
1353 | 1353 | for (size_t i = 0; i < N; i++) { \
|
1354 |
| - asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \ |
| 1354 | + asm(INSTR : "=" REG(result[i]) : REG(inputs[i])); \ |
1355 | 1355 | } \
|
1356 | 1356 | } \
|
1357 | 1357 | }; \
|
1358 | 1358 | }
|
1359 | 1359 |
|
1360 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1") |
1361 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1") |
| 1360 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d") |
| 1361 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d") |
1362 | 1362 |
|
1363 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1") |
1364 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1") |
1365 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1") |
1366 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1") |
1367 |
| -KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1") |
| 1363 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f") |
| 1364 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f") |
| 1365 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f") |
| 1366 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f") |
| 1367 | +KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f") |
1368 | 1368 |
|
1369 | 1369 | #endif
|
1370 | 1370 |
|
|
0 commit comments