@@ -204,30 +204,14 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
204
204
Lmax = std::max (Lmax, L);
205
205
}
206
206
207
- #ifdef __HIPCC__
208
- cuda_status = hipMemcpyAsync (&(repeats_[start_idx]), repeats,
209
- (end_idx - start_idx) * sizeof (int ),
210
- hipMemcpyHostToDevice, stream_);
211
- #else
212
- cuda_status = cudaMemcpyAsync (&(repeats_[start_idx]), repeats,
213
- (end_idx - start_idx) * sizeof (int ),
214
- cudaMemcpyHostToDevice, stream_);
215
- #endif
216
-
207
+ cuda_status = warpctc::memcpy_h2d_async (
208
+ &(repeats_[start_idx]), repeats, (end_idx - start_idx) * sizeof (int ), stream_);
217
209
if (cuda_status != gpuSuccess)
218
210
return CTC_STATUS_MEMOPS_FAILED;
219
211
220
212
221
- #ifdef __HIPCC__
222
- cuda_status = hipMemcpyAsync (&(label_offsets_[start_idx]), label_offsets,
223
- (end_idx - start_idx) * sizeof (int ),
224
- hipMemcpyHostToDevice, stream_);
225
- #else
226
- cuda_status = cudaMemcpyAsync (&(label_offsets_[start_idx]), label_offsets,
227
- (end_idx - start_idx) * sizeof (int ),
228
- cudaMemcpyHostToDevice, stream_);
229
- #endif
230
-
213
+ cuda_status = warpctc::memcpy_h2d_async (
214
+ &(label_offsets_[start_idx]), label_offsets, (end_idx - start_idx) * sizeof (int ), stream_);
231
215
if (cuda_status != gpuSuccess)
232
216
return CTC_STATUS_MEMOPS_FAILED;
233
217
}
@@ -243,16 +227,8 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
243
227
gpu_bytes_used);
244
228
gpu_bytes_used += minibatch_ * sizeof (int );
245
229
246
- #ifdef __HIPCC__
247
- cuda_status = hipMemcpyAsync (utt_length_, input_lengths,
248
- minibatch_ * sizeof (int ),
249
- hipMemcpyHostToDevice, stream_);
250
- #else
251
- cuda_status = cudaMemcpyAsync (utt_length_, input_lengths,
252
- minibatch_ * sizeof (int ),
253
- cudaMemcpyHostToDevice, stream_);
254
- #endif
255
-
230
+ cuda_status = warpctc::memcpy_h2d_async (
231
+ utt_length_, input_lengths, minibatch_ * sizeof (int ), stream_);
256
232
if (cuda_status != gpuSuccess)
257
233
return CTC_STATUS_MEMOPS_FAILED;
258
234
@@ -261,16 +237,8 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
261
237
gpu_bytes_used);
262
238
gpu_bytes_used += minibatch_ * sizeof (int );
263
239
264
- #ifdef __HIPCC__
265
- cuda_status = hipMemcpyAsync (label_sizes_, label_lengths,
266
- minibatch_ * sizeof (int ),
267
- hipMemcpyHostToDevice, stream_);
268
- #else
269
- cuda_status = cudaMemcpyAsync (label_sizes_, label_lengths,
270
- minibatch_ * sizeof (int ),
271
- cudaMemcpyHostToDevice, stream_);
272
- #endif
273
-
240
+ cuda_status = warpctc::memcpy_h2d_async (
241
+ label_sizes_, label_lengths, minibatch_ * sizeof (int ), stream_);
274
242
if (cuda_status != gpuSuccess)
275
243
return CTC_STATUS_MEMOPS_FAILED;
276
244
@@ -279,16 +247,8 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
279
247
gpu_bytes_used);
280
248
gpu_bytes_used += Lmax * minibatch_ * sizeof (int );
281
249
282
- #ifdef __HIPCC__
283
- cuda_status = hipMemcpyAsync (labels_without_blanks_, flat_labels,
284
- total_label_length * sizeof (int ),
285
- hipMemcpyHostToDevice, stream_);
286
- #else
287
- cuda_status = cudaMemcpyAsync (labels_without_blanks_, flat_labels,
288
- total_label_length * sizeof (int ),
289
- cudaMemcpyHostToDevice, stream_);
290
- #endif
291
-
250
+ cuda_status = warpctc::memcpy_h2d_async (
251
+ labels_without_blanks_, flat_labels, total_label_length * sizeof (int ), stream_);
292
252
if (cuda_status != gpuSuccess)
293
253
return CTC_STATUS_MEMOPS_FAILED;
294
254
@@ -302,7 +262,6 @@ GpuCTC<ProbT>::setup_gpu_metadata(const int* const flat_labels,
302
262
gpu_bytes_used);
303
263
gpu_bytes_used += (S_ * T_) * minibatch_ * sizeof (ProbT);
304
264
305
-
306
265
denoms_ =
307
266
reinterpret_cast <ProbT *>(static_cast <char *>(gpu_workspace_) +
308
267
gpu_bytes_used);
@@ -330,25 +289,19 @@ ctcStatus_t GpuCTC<ProbT>::launch_alpha_beta_kernels(const ProbT* const probs,
330
289
// away
331
290
const int stride = minibatch_;
332
291
333
- if (compute_alpha)
292
+ if (compute_alpha) {
334
293
compute_alpha_kernel<ProbT, NT, VT><<<grid_size, NT, 0 , stream_>>>
335
294
(probs, label_sizes_, utt_length_,
336
295
repeats_, labels_without_blanks_, label_offsets_,
337
296
labels_with_blanks_, alphas_, nll_forward_,
338
297
stride, out_dim_, S_, T_, blank_label_);
339
-
298
+ }
340
299
341
300
if (compute_beta) {
342
301
compute_betas_and_grad_kernel<ProbT, NT, VT><<<grid_size, NT, 0 , stream_>>>
343
302
(probs, label_sizes_, utt_length_, repeats_,
344
303
labels_with_blanks_, alphas_, nll_forward_, nll_backward_,
345
304
grads, stride, out_dim_, S_, T_, blank_label_);
346
-
347
- #ifdef __HIPCC__
348
- hipStreamSynchronize (stream_);
349
- #else
350
- cudaStreamSynchronize (stream_);
351
- #endif
352
305
}
353
306
354
307
#ifdef __HIPCC__
0 commit comments