Skip to content

Commit 05e4860

Browse files
authored
Merge pull request opencv#25412 from ZelboK:update-cudnn-to-9
Refactor DNN module to build with cudnn 9 opencv#25412 A lot of APIs that are currently being used in the dnn module have been removed in cudnn 9. They were deprecated in 8. This PR updates said code to conform to the newer API. Some key notes: 1) This is my first PR. I am new to OpenCV. 2) `opencv_test_core` tests pass 3) On a 3080, CUDA 12.4 (should be irrelevant since I didn't build the `opencv_modules`), gcc 11.4, WSL 2. 4) For brevity I will avoid including macro code that will allow for older versions of cudnn to build. I was unable to get the tests working for `opencv_test_dnn` and `opencv_perf_dnn`. The errors I get are of the following: ``` OpenCV tests: Can't find required data file: dnn/onnx/conformance/node/test_reduce_prod_default_axes_keepdims_example/model.onnx in function 'findData' " thrown in the test body. ``` So before I spend more time investigating I was hoping to get a maintainer to point me in the right direction here. I would like to run these tests and confirm things are working as intended. I may have missed some details. ### Pull Request Readiness Checklist Relevant issue: opencv#24983 - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [ ] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
1 parent b659351 commit 05e4860

File tree

3 files changed

+134
-25
lines changed

3 files changed

+134
-25
lines changed

modules/dnn/src/cuda4dnn/csl/cudnn/recurrent.hpp

+56-14
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class RNNDescriptor
9797

9898
/**
9999
*/
100-
RNNDescriptor(const Handle &handle, RNNMode mode, int hidden_size, int num_layers,
100+
RNNDescriptor(const Handle &handle, RNNMode mode, int input_size, int hidden_size, int num_layers,
101101
bool bidirectional, const DropoutDescriptor &dropoutDesc)
102102
{
103103
CUDA4DNN_CHECK_CUDNN(cudnnCreateRNNDescriptor(&descriptor));
@@ -119,12 +119,35 @@ class RNNDescriptor
119119

120120
try
121121
{
122+
#if CUDNN_MAJOR >= 9
123+
CUDA4DNN_CHECK_CUDNN(cudnnSetRNNDescriptor_v8(
124+
descriptor,
125+
algo,
126+
rnn_mode,
127+
CUDNN_RNN_DOUBLE_BIAS,
128+
bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
129+
CUDNN_LINEAR_INPUT, detail::get_data_type<T>(),
130+
detail::get_data_type<T>(),
131+
detail::get_data_type<T>() == CUDNN_DATA_HALF ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH,
132+
input_size,
133+
hidden_size,
134+
hidden_size,
135+
num_layers,
136+
dropoutDesc.get(),
137+
0)); // What other flags do we might want here?
138+
#else
122139
CUDA4DNN_CHECK_CUDNN(cudnnSetRNNDescriptor_v6(
123-
handle.get(), descriptor, hidden_size, num_layers, dropoutDesc.get(),
124-
CUDNN_LINEAR_INPUT, bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
125-
rnn_mode,
126-
algo, //CUDNN_RNN_ALGO_STANDARD,
127-
detail::get_data_type<T>()));
140+
handle.get(),
141+
descriptor,
142+
hidden_size,
143+
num_layers,
144+
dropoutDesc.get(),
145+
CUDNN_LINEAR_INPUT,
146+
bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL,
147+
rnn_mode,
148+
algo,
149+
detail::get_data_type<T>()));
150+
#endif
128151
}
129152
catch (...)
130153
{
@@ -158,16 +181,34 @@ class RNNDescriptor
158181
cudnnRNNAlgo_t algo{CUDNN_RNN_ALGO_STANDARD};
159182
};
160183

161-
template<class T>
162-
size_t getRNNWorkspaceSize(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
163-
const int seqLength, const TensorDescriptorsArray<T> &inputDesc)
184+
#if CUDNN_MAJOR >= 9
185+
template <class T>
186+
void LSTMForward(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
187+
cudnnRNNDataDescriptor_t xDesc, DevicePtr<const T> x,
188+
cudnnRNNDataDescriptor_t yDesc, DevicePtr<T> y,
189+
cudnnTensorDescriptor_t hDesc, DevicePtr<const T> hx, DevicePtr<T> hy,
190+
cudnnTensorDescriptor_t cDesc, DevicePtr<const T> cx, DevicePtr<T> cy,
191+
size_t weightSpaceSize, DevicePtr<const T> weightSpace,
192+
size_t cudnn_WorkspaceSize, DevicePtr<T> cudnn_Workspace,
193+
size_t reserveSpaceSize, DevicePtr<T> reserveSpace)
164194
{
165-
size_t workSize;
166-
CUDA4DNN_CHECK_CUDNN(cudnnGetRNNWorkspaceSize(handle.get(), rnnDesc.get(), seqLength,
167-
inputDesc.get().data(), &workSize));
168-
return workSize;
195+
CV_Assert(handle);
196+
197+
std::cout << "cudnn_WorkspaceSize: " << cudnn_WorkspaceSize << std::endl;
198+
std::cout << "reserveSpaceSize: " << reserveSpaceSize << std::endl;
199+
200+
CUDA4DNN_CHECK_CUDNN(cudnnRNNForward(
201+
handle.get(), rnnDesc.get(), CUDNN_FWD_MODE_INFERENCE,
202+
nullptr, // docs say use this as null on >= 8.9.1
203+
xDesc, x.get(), yDesc, y.get(),
204+
hDesc, hx.get(), hy.get(),
205+
cDesc, cx.get(), cy.get(),
206+
weightSpaceSize, weightSpace.get(),
207+
cudnn_WorkspaceSize, cudnn_Workspace.get(),
208+
reserveSpaceSize, reserveSpace.get()));
169209
}
170210

211+
#else
171212
template<class T>
172213
void LSTMForward(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
173214
const FilterDescriptor<T> &filterDesc, DevicePtr<const T> filterPtr,
@@ -189,7 +230,8 @@ void LSTMForward(const Handle &handle, const RNNDescriptor<T> &rnnDesc,
189230
initialCDesc.get(), ycOutputPtr.get(),
190231
static_cast<void*>(workspace.get()), workspace.size_in_bytes()));
191232
}
233+
#endif
192234

193235
}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
194236

195-
#endif //OPENCV_DNN_CUDA4DNN_CSL_CUDNN_RECURRENT_HPP
237+
#endif //OPENCV_DNN_CUDA4DNN_CSL_CUDNN_RECURRENT_HPP

modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp

+76-5
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,46 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
528528
LSTM() = default;
529529
LSTM(const LSTM&) = delete;
530530
LSTM(LSTM&&) = default;
531+
532+
#if CUDNN_MAJOR >= 9
533+
LSTM(cudnn::Handle handle, const params_type &params)
534+
: cudnnHandle(std::move(handle)), seqLength(params.seqLength)
535+
{
536+
std::vector<int> seqLenArr(params.miniBatch, seqLength);
537+
cudnnCreateRNNDataDescriptor(&xDesc);
538+
cudnnSetRNNDataDescriptor(xDesc, cudnn::detail::get_data_type<T>(),
539+
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, seqLength,
540+
params.miniBatch, params.inputSize, seqLenArr.data(),
541+
nullptr);
542+
cudnnCreateRNNDataDescriptor(&cyDesc);
543+
cudnnSetRNNDataDescriptor(
544+
cyDesc, cudnn::detail::get_data_type<T>(),
545+
CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED,
546+
seqLength, params.miniBatch,
547+
params.bidirectional ? params.hiddenSize * 2 : params.hiddenSize,
548+
seqLenArr.data(),
549+
nullptr);
550+
551+
dropoutDesc = DropoutDescriptor(cudnnHandle, params.dropout);
552+
rnnDesc = RNNDescriptor(cudnnHandle, params.type, params.inputSize, params.hiddenSize,
553+
params.numLayers, params.bidirectional, dropoutDesc);
554+
555+
int num_direction = params.bidirectional ? 2 : 1;
556+
h0TensorDesc = TensorDescriptor(num_direction, params.miniBatch, params.hiddenSize);
557+
c0TensorDesc = TensorDescriptor(num_direction, params.miniBatch, params.hiddenSize);
558+
559+
// Get amount of work space required to execute the RNN described by rnnDesc
560+
// with input dimensions defined by inputDesc
561+
CUDA4DNN_CHECK_CUDNN(cudnnGetRNNTempSpaceSizes(
562+
cudnnHandle.get(), rnnDesc.get(), CUDNN_FWD_MODE_INFERENCE,
563+
xDesc, &workSpaceSize, &reserveSpaceSize));
564+
565+
csl::WorkspaceBuilder builder;
566+
builder.require<T>(workSpaceSize);
567+
builder.require<T>(reserveSpaceSize);
568+
scratch_mem_in_bytes = builder.required_workspace_size();
569+
}
570+
#else
531571
LSTM(cudnn::Handle handle, const params_type& params)
532572
: cudnnHandle(std::move(handle)), seqLength{params.seqLength},
533573
inputDesc(seqLength, {params.miniBatch, params.inputSize, 1}),
@@ -538,7 +578,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
538578
{
539579
dropoutDesc = DropoutDescriptor(cudnnHandle, params.dropout);
540580
filterDesc = FilterDescriptor(params.weights_shape);
541-
rnnDesc = RNNDescriptor(cudnnHandle, params.type, params.hiddenSize,
581+
rnnDesc = RNNDescriptor(cudnnHandle, params.type, params.inputSize, params.hiddenSize,
542582
params.numLayers, params.bidirectional, dropoutDesc);
543583

544584
int num_direction = params.bidirectional ? 2 : 1;
@@ -550,19 +590,44 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
550590
// Get amount of work space required to execute the RNN described by rnnDesc
551591
// with input dimensions defined by inputDesc
552592
csl::WorkspaceBuilder builder;
553-
builder.require(cudnn::getRNNWorkspaceSize<T>(cudnnHandle, rnnDesc, seqLength, inputDesc));
593+
size_t workSize;
594+
CUDA4DNN_CHECK_CUDNN(cudnnGetRNNWorkspaceSize(cudnnHandle.get(), rnnDesc.get(), seqLength,
595+
inputDesc.get().data(), &workSize));
596+
builder.require(workSize);
554597
scratch_mem_in_bytes = builder.required_workspace_size();
555598
}
599+
#endif
556600

557601
LSTM& operator=(const LSTM&) = delete;
558602
LSTM& operator=(LSTM&&) = default;
559603

560604
void inference(TensorView<T> input, TensorSpan<T> y_output, TensorSpan<T> yc_output, TensorView<T> filters,
561-
TensorView<T> h0, TensorView<T> c0, WorkspaceInstance workspace)
605+
TensorView<T> h0, TensorView<T> c0, csl::Workspace& workspace)
562606
{
607+
auto ws_allocator = csl::WorkspaceAllocator(workspace);
608+
609+
#if CUDNN_MAJOR >= 9
610+
size_t weightSpaceSize = sizeof(typename TensorView<T>::value_type) * filters.size();
611+
auto workspaceData = ws_allocator.get_span<T>(workSpaceSize);
612+
auto reserveSpaceData = ws_allocator.get_span<T>(reserveSpaceSize);
613+
cudnn::LSTMForward<T>(cudnnHandle, rnnDesc, xDesc, input.get(), cyDesc,
614+
y_output.get(), h0TensorDesc.get(), h0.get(),
615+
DevicePtr<T>(nullptr), // hy, final state
616+
c0TensorDesc.get(), // maps to cxDesc
617+
c0.get(), // maps to cx
618+
yc_output.get(), // maps to cy
619+
weightSpaceSize,
620+
filters.get(), // maps to weightSpace
621+
workSpaceSize,
622+
workspaceData.data(), // workSpaceSize and workSpace
623+
reserveSpaceSize, // reserveSpaceSize
624+
reserveSpaceData.data()
625+
);
626+
#else
563627
cudnn::LSTMForward<T>(cudnnHandle, rnnDesc, filterDesc, filters.get(), inputDesc,
564628
input.get(), h0TensorDesc, h0.get(), c0TensorDesc, c0.get(),
565-
seqLength, outputDesc, y_output.get(), yc_output.get(), workspace);
629+
seqLength, outputDesc, y_output.get(), yc_output.get(), ws_allocator.get_instance());
630+
#endif
566631
}
567632

568633
std::size_t get_workspace_memory_in_bytes() const noexcept { return scratch_mem_in_bytes; }
@@ -575,11 +640,17 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
575640
RNNDescriptor rnnDesc;
576641
DropoutDescriptor dropoutDesc;
577642

578-
FilterDescriptor filterDesc;
579643
TensorDescriptor h0TensorDesc, c0TensorDesc;
580644

645+
#if CUDNN_MAJOR >= 9
646+
size_t weightSpaceSize, workSpaceSize, reserveSpaceSize;
647+
cudnnRNNDataDescriptor_t xDesc;
648+
cudnnRNNDataDescriptor_t cyDesc; // represents cyDesc or cDesc(now reps both final and beginning)
649+
#else
650+
FilterDescriptor filterDesc;
581651
TensorDescriptorsArray inputDesc;
582652
TensorDescriptorsArray outputDesc;
653+
#endif
583654
};
584655

585656
}}}} /* namespace cv::dnn::cuda4dnn::csl */

modules/dnn/src/cuda4dnn/primitives/recurrent_cells.hpp

+2-6
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,6 @@ class LSTMOp final : public CUDABackendNode
5555

5656
c0Tensor = csl::makeTensorHeader<T>(c0);
5757
csl::copyMatToTensor<T>(c0, c0Tensor, stream);
58-
59-
csl::WorkspaceBuilder builder;
60-
builder.require<T>(lstm.get_workspace_memory_in_bytes());
6158
}
6259

6360
void forward(const std::vector<cv::Ptr<BackendWrapper>>& inputs,
@@ -75,8 +72,7 @@ class LSTMOp final : public CUDABackendNode
7572
Ptr<wrapper_type> yc_output_wrapper = outputs.size() == 2 ? outputs[1].dynamicCast<wrapper_type>() : Ptr<wrapper_type>();
7673
csl::TensorSpan<T> yc_output = yc_output_wrapper.empty() ? csl::TensorSpan<T>() : yc_output_wrapper->getSpan();
7774

78-
csl::WorkspaceAllocator allocator(workspace);
79-
lstm.inference(input, y_output, yc_output, filtersTensor, h0Tensor, c0Tensor, allocator.get_instance());
75+
lstm.inference(input, y_output, yc_output, filtersTensor, h0Tensor, c0Tensor, workspace);
8076
}
8177

8278
std::size_t get_workspace_memory_in_bytes() const noexcept override
@@ -94,4 +90,4 @@ class LSTMOp final : public CUDABackendNode
9490

9591
}}} /* namespace cv::dnn::cuda4dnn */
9692

97-
#endif //OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RECURRENT_CELLS_HPP
93+
#endif //OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RECURRENT_CELLS_HPP

0 commit comments

Comments
 (0)