Skip to content

Commit b59ee5f

Browse files
committed
Fix invalid inline PTX for fast_rcp
1 parent 2e7077d commit b59ee5f

File tree

2 files changed: 20 additions and 20 deletions.

include/kernel_float/unops.h

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -227,26 +227,26 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan)
227227
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf)
228228
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf)
229229

230-
#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \
230+
#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \
231231
namespace detail { \
232232
template<size_t N> \
233233
struct apply_fastmath_impl<ops::F<T>, N, T, T> { \
234234
KERNEL_FLOAT_INLINE static void call(ops::F<T> fun, T* result, const T* inputs) { \
235235
for (size_t i = 0; i < N; i++) { \
236-
asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \
236+
asm(INSTR : "=" REG(result[i]) : REG(inputs[i])); \
237237
} \
238238
} \
239239
}; \
240240
}
241241

242-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1")
243-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1")
242+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d")
243+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d")
244244

245-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1")
246-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1")
247-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1")
248-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1")
249-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1")
245+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f")
246+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f")
247+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f")
248+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f")
249+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f")
250250

251251
#endif
252252

single_include/kernel_float.h

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,8 @@
1616

1717
//================================================================================
1818
// this file has been auto-generated, do not modify its contents!
19-
// date: 2024-05-17 12:31:58.621011
20-
// git hash: cc083808180f25c4addd969a3387e3c9fd14fa35
19+
// date: 2024-05-31 13:40:42.460927
20+
// git hash: 2e7077de71fb2bede6d4e666126ed1d7c4a88da4
2121
//================================================================================
2222

2323
#ifndef KERNEL_FLOAT_MACROS_H
@@ -1345,26 +1345,26 @@ KERNEL_FLOAT_DEFINE_UNARY_FUN_FAST(tan)
13451345
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, exp, __expf)
13461346
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_FUN(float, log, __logf)
13471347

1348-
#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR) \
1348+
#define KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(T, F, INSTR, REG) \
13491349
namespace detail { \
13501350
template<size_t N> \
13511351
struct apply_fastmath_impl<ops::F<T>, N, T, T> { \
13521352
KERNEL_FLOAT_INLINE static void call(ops::F<T> fun, T* result, const T* inputs) { \
13531353
for (size_t i = 0; i < N; i++) { \
1354-
asm(INSTR, : "=f"(result[i]) : "f"(inputs[i])); \
1354+
asm(INSTR : "=" REG(result[i]) : REG(inputs[i])); \
13551355
} \
13561356
} \
13571357
}; \
13581358
}
13591359

1360-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1")
1361-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1")
1360+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rcp, "rcp.approx.ftz.f64 %0, %1;", "d")
1361+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(double, rsqrt, "rsqrt.approx.f64 %0, %1;", "d")
13621362

1363-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1")
1364-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1")
1365-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1")
1366-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1")
1367-
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1")
1363+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sqrt, "sqrt.approx.f32 %0, %1;", "f")
1364+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rcp, "rcp.approx.f32 %0, %1;", "f")
1365+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, rsqrt, "rsqrt.approx.f32 %0, %1;", "f")
1366+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, sin, "sin.approx.f32 %0, %1;", "f")
1367+
KERNEL_FLOAT_DEFINE_UNARY_FAST_IMPL_PTX(float, cos, "cos.approx.f32 %0, %1;", "f")
13681368

13691369
#endif
13701370

0 commit comments

Comments (0)