int main(int argc, char **argv)
{
cudnnHandle_t cudnnHandle;
cudnnTensorDescriptor_t inTensorDesc, outTensorDesc, biasTensorDesc;
cudnnFilterDescriptor_t filterDesc;
cudnnConvolutionDescriptor_t convDesc;
cudnnActivationDescriptor_t actDesc;
cudaEvent_t start, stop;
clock_t cpu_start, cpu_end;
int in_channel = 8;
int in_height = 1024;
int in_width = 2048;
int batch_count = 1;
int filter_width = 3;
int filter_height = 3;
int out_channel = 16;
int padding_w = 1;
int padding_h = 1;
int stride_horizontal = 2;
int stride_vertical = 2;
float alpha = 1.0f;
float beta = 0.0f;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCUDNN(cudnnCreate(&cudnnHandle));
checkCUDNN(cudnnCreateTensorDescriptor(&inTensorDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&outTensorDesc));
checkCUDNN(cudnnCreateTensorDescriptor(&biasTensorDesc));
checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc));
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
checkCUDNN(cudnnCreateActivationDescriptor(&actDesc));
checkCUDNN(cudnnSetTensor4dDescriptor(inTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batch_count, in_channel, in_height, in_width));
checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NHWC, out_channel, in_channel, filter_height, filter_width));
//checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
// FOR PSEUDO FP16
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, 1, out_channel, 1, 1));
checkCUDNN(cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0));
void *inData_d;
void *outData_d;
void *filterData_d;
void* workSpace;
checkCudaErrors(cudaMalloc((void**)&inData_d, sizeof(float)*in_channel*in_width*in_height*out_channel));
checkCudaErrors(cudaMalloc((void**)&filterData_d, sizeof(float)*in_channel*filter_width*filter_height*out_channel));
cudaMemset(inData_d,0x00,sizeof(float)*in_channel*in_width*in_height*out_channel );
cudaMemset(filterData_d,0x00,sizeof(float)*in_channel*filter_width*filter_height*out_channel);
int out_n, out_c, out_h, out_w;
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, inTensorDesc, filterDesc, &out_n, &out_c, &out_h, &out_w));
printf("conv out shape (n x c x h x w) = (%d x %d x %d x %d)\n", out_n, out_c, out_h, out_w);
checkCUDNN(cudnnSetTensor4dDescriptor(outTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, out_n, out_c, out_h, out_w));
checkCudaErrors(cudaMalloc((void**)&outData_d, sizeof(float)*out_c*out_w*out_h));
checkCUDNN(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH));
int RetCnt;
cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[8];
cudnnFindConvolutionForwardAlgorithm(cudnnHandle,
inTensorDesc,
filterDesc,
convDesc,
outTensorDesc,
8,
&RetCnt,
fwd_algo_pref_);
/*checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle,
inTensorDesc,
filterDesc,
convDesc,
outTensorDesc,
4,
&RetCnt,
fwd_algo_pref_)); */
cout << "Fastest algorithm for conv = " << fwd_algo_pref_[0].algo << endl;
size_t sizeInBytes = 0;
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
inTensorDesc,
filterDesc,
convDesc,
outTensorDesc,
fwd_algo_pref_[0].algo,
&sizeInBytes));
cout << "sizeInBytes " << sizeInBytes << endl;
if (sizeInBytes != 0)
checkCudaErrors(cudaMalloc(&workSpace, sizeInBytes));
for(int i=0;i<300;i++) // gpu warm-up
{
checkCUDNN(cudnnConvolutionForward(cudnnHandle,
&alpha,
inTensorDesc,
inData_d,
filterDesc,
filterData_d,
convDesc,
fwd_algo_pref_[0].algo,
workSpace,
sizeInBytes,
&beta,
outTensorDesc,
outData_d));
}
cudaEventRecord(start);
for(int i=0;i<1000;i++)
{
checkCUDNN(cudnnConvolutionForward(cudnnHandle,
&alpha,
inTensorDesc,
inData_d,
filterDesc,
filterData_d,
convDesc,
fwd_algo_pref_[0].algo,
workSpace,
sizeInBytes,
&beta,
outTensorDesc,
outData_d));
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("gpu - %f \n", milliseconds / 1000);
checkCudaErrors(cudaFree(inData_d));
checkCudaErrors(cudaFree(outData_d));
if(sizeInBytes)
checkCudaErrors(cudaFree(workSpace));
cudaEventDestroy(start);
cudaEventDestroy(stop);
checkCUDNN(cudnnDestroyTensorDescriptor(inTensorDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(outTensorDesc));
checkCUDNN(cudnnDestroyTensorDescriptor(biasTensorDesc));
checkCUDNN(cudnnDestroyConvolutionDescriptor(convDesc));
checkCUDNN(cudnnDestroyFilterDescriptor(filterDesc));
checkCUDNN(cudnnDestroyActivationDescriptor(actDesc));
checkCUDNN(cudnnDestroy(cudnnHandle));
return EXIT_SUCCESS;
}
Also, I am using JetPack 5.0