Please provide the following info (check/uncheck the boxes after creating this topic):
Software Version
[V] DRIVE OS Linux 5.2.6
DRIVE OS Linux 5.2.6 and DriveWorks 4.0
DRIVE OS Linux 5.2.0
DRIVE OS Linux 5.2.0 and DriveWorks 3.5
NVIDIA DRIVE™ Software 10.0 (Linux)
NVIDIA DRIVE™ Software 9.0 (Linux)
other DRIVE OS version
other
Target Operating System
[V] Linux
QNX
other
Hardware Platform
[V] NVIDIA DRIVE™ AGX Xavier DevKit (E3550)
NVIDIA DRIVE™ AGX Pegasus DevKit (E3550)
other
SDK Manager Version
[V] 1.8.0.10363
other
Host Machine Version
[V] native Ubuntu 18.04
other
When I run the same convolution with cuDNN, FP32 is the fastest and INT8 is the slowest (timings below).
Could you please check whether my descriptor settings are wrong?
FP32 - 2.621643
FP16 - 2.938600 (with tensorcore)
INT8 - 7.822851
// Benchmarks one cuDNN forward convolution in FP32 / NCHW layout.
//
// Steps: create descriptors, allocate zero-filled device buffers, ask cuDNN
// for its heuristically preferred forward algorithm, run 300 warm-up
// iterations, then time 1000 iterations with both clock() (host side) and
// CUDA events (device side).
//
// Returns EXIT_SUCCESS on completion, EXIT_FAILURE when cuDNN reports no
// usable forward algorithm. CUDA/cuDNN API failures are routed through
// checkCudaErrors / checkCUDNN.
int main(int argc, char **argv)
{
    cudnnHandle_t cudnnHandle;
    cudnnTensorDescriptor_t inTensorDesc, outTensorDesc, biasTensorDesc;
    cudnnFilterDescriptor_t filterDesc;
    cudnnConvolutionDescriptor_t convDesc;
    cudnnActivationDescriptor_t actDesc;
    cudaEvent_t start, stop;
    clock_t cpu_start, cpu_end;

    // Problem definition.
    int in_channel = 8;
    int in_height = 1024;
    int in_width = 2048;
    int batch_count = 1;
    int filter_width = 3;
    int filter_height = 3;
    int out_channel = 16;
    int padding_w = 1;
    int padding_h = 1;
    int stride_horizontal = 2;
    int stride_vertical = 2;

    // Output scaling: y = alpha * conv(x, w) + beta * y.
    float alpha = 1.0f;
    float beta = 0.0f;

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCUDNN(cudnnCreate(&cudnnHandle));
    checkCUDNN(cudnnCreateTensorDescriptor(&inTensorDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&outTensorDesc));
    checkCUDNN(cudnnCreateTensorDescriptor(&biasTensorDesc));
    checkCUDNN(cudnnCreateFilterDescriptor(&filterDesc));
    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
    checkCUDNN(cudnnCreateActivationDescriptor(&actDesc));

    // Data-type / layout configuration (FP32, NCHW). The bias and activation
    // descriptors are configured but not consumed by cudnnConvolutionForward.
    checkCUDNN(cudnnSetTensor4dDescriptor(inTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch_count, in_channel, in_height, in_width));
    checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_channel, in_channel, filter_height, filter_width));
    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
    checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, out_channel, 1, 1));
    checkCUDNN(cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0));

    float *inData_d = nullptr;
    float *outData_d = nullptr;
    float *filterData_d = nullptr;
    // Must start as nullptr: when cuDNN needs no workspace nothing is
    // allocated, yet the pointer is still handed to cuDNN and to cudaFree.
    void *workSpace = nullptr;

    // Buffer sizes follow the tensor descriptors: N*C*H*W for the input and
    // K*C*R*S for the filter. Computed in size_t to avoid int overflow.
    const size_t inBytes = sizeof(float) * (size_t)batch_count * in_channel * in_height * in_width;
    const size_t filterBytes = sizeof(float) * (size_t)out_channel * in_channel * filter_height * filter_width;

    checkCudaErrors(cudaMalloc((void**)&inData_d, inBytes));
    checkCudaErrors(cudaMalloc((void**)&filterData_d, filterBytes));
    checkCudaErrors(cudaMemset(inData_d, 0x00, inBytes));
    checkCudaErrors(cudaMemset(filterData_d, 0x00, filterBytes));

    int out_n, out_c, out_h, out_w;
    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, inTensorDesc, filterDesc, &out_n, &out_c, &out_h, &out_w));
    printf("conv out shape (n x c x h x w) = (%d x %d x %d x %d)\n", out_n, out_c, out_h, out_w);

    checkCUDNN(cudnnSetTensor4dDescriptor(outTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, out_n, out_c, out_h, out_w));
    // Include the batch dimension so the buffer matches outTensorDesc.
    const size_t outBytes = sizeof(float) * (size_t)out_n * out_c * out_h * out_w;
    checkCudaErrors(cudaMalloc((void**)&outData_d, outBytes));

    // Permit cuDNN to use Tensor Core operations where it supports them.
    checkCUDNN(cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH));

    int RetCnt = 0;
    cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[8];
    /* cudnnFindConvolutionForwardAlgorithm(cudnnHandle,
    inTensorDesc,
    filterDesc,
    convDesc,
    outTensorDesc,
    8,
    &RetCnt,
    fwd_algo_pref_);*/
    checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle,
                                                      inTensorDesc,
                                                      filterDesc,
                                                      convDesc,
                                                      outTensorDesc,
                                                      4,
                                                      &RetCnt,
                                                      fwd_algo_pref_));
    // Entry 0 is only meaningful if cuDNN returned at least one result and
    // that result is flagged as usable.
    if (RetCnt < 1 || fwd_algo_pref_[0].status != CUDNN_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuDNN returned no usable forward convolution algorithm\n");
        return EXIT_FAILURE;
    }
    cout << "Fastest algorithm for conv = " << fwd_algo_pref_[0].algo << endl;

    size_t sizeInBytes = 0;
    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
                                                       inTensorDesc,
                                                       filterDesc,
                                                       convDesc,
                                                       outTensorDesc,
                                                       fwd_algo_pref_[0].algo,
                                                       &sizeInBytes));
    cout << "sizeInBytes " << sizeInBytes << endl;
    if (sizeInBytes != 0)
        checkCudaErrors(cudaMalloc(&workSpace, sizeInBytes));

    // Warm-up: keeps one-time initialisation cost out of the measurement.
    for (int i = 0; i < 300; i++)
    {
        checkCUDNN(cudnnConvolutionForward(cudnnHandle,
                                           &alpha,
                                           inTensorDesc,
                                           inData_d,
                                           filterDesc,
                                           filterData_d,
                                           convDesc,
                                           fwd_algo_pref_[0].algo,
                                           workSpace,
                                           sizeInBytes,
                                           &beta,
                                           outTensorDesc,
                                           outData_d));
    }

    // Timed section: 1000 iterations bracketed by CUDA events.
    checkCudaErrors(cudaEventRecord(start));
    cpu_start = clock();
    for (int i = 0; i < 1000; i++)
    {
        checkCUDNN(cudnnConvolutionForward(cudnnHandle,
                                           &alpha,
                                           inTensorDesc,
                                           inData_d,
                                           filterDesc,
                                           filterData_d,
                                           convDesc,
                                           fwd_algo_pref_[0].algo,
                                           workSpace,
                                           sizeInBytes,
                                           &beta,
                                           outTensorDesc,
                                           outData_d));
    }
    cpu_end = clock();
    // Record the stop event right after the last launch so host-side
    // printing cannot delay it.
    checkCudaErrors(cudaEventRecord(stop));

    // clock() reports processor time used by the host process; there is no
    // device synchronisation before cpu_end, so this is not GPU run time.
    double result = (double)(cpu_end - cpu_start) / CLOCKS_PER_SEC;
    printf("cpu - %f \n", result);

    checkCudaErrors(cudaEventSynchronize(stop));
    float milliseconds = 0;
    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
    // Elapsed time is in milliseconds; printed in seconds for all 1000 runs.
    printf("gpu - %f \n", milliseconds / 1000);

    checkCudaErrors(cudaFree(inData_d));
    checkCudaErrors(cudaFree(outData_d));
    checkCudaErrors(cudaFree(filterData_d));
    checkCudaErrors(cudaFree(workSpace));   // no-op when workSpace is nullptr
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    checkCUDNN(cudnnDestroyTensorDescriptor(inTensorDesc));
    checkCUDNN(cudnnDestroyTensorDescriptor(outTensorDesc));
    checkCUDNN(cudnnDestroyTensorDescriptor(biasTensorDesc));
    checkCUDNN(cudnnDestroyConvolutionDescriptor(convDesc));
    checkCUDNN(cudnnDestroyFilterDescriptor(filterDesc));
    checkCUDNN(cudnnDestroyActivationDescriptor(actDesc));
    checkCUDNN(cudnnDestroy(cudnnHandle));
    return EXIT_SUCCESS;
}
-Full source code for FP32-
checkCUDNN(cudnnSetTensor4dDescriptor(inTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, batch_count, in_channel, in_height, in_width));
checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_HALF, CUDNN_TENSOR_NHWC, out_channel, in_channel, filter_height, filter_width));
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_HALF));
checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, 1, out_channel, 1, 1));
checkCUDNN(cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0));
-Setting FP16-
// INT8 variant of the descriptor setup: NHWC layout, 8-bit integer data,
// 32-bit integer accumulation.
// NOTE(review): cuDNN documents extra requirements for INT8 convolutions
// (channel-count multiples, supported algorithms, and a vectorised
// CUDNN_DATA_INT8x32 / CUDNN_TENSOR_NCHW_VECT_C configuration for Tensor Core
// use) - confirm this setup against the cudnnConvolutionForward
// supported-configuration tables before trusting the timing.
checkCUDNN(cudnnSetTensor4dDescriptor(inTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_INT8, batch_count, in_channel, in_height, in_width));
checkCUDNN(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_INT8, CUDNN_TENSOR_NHWC, out_channel, in_channel, filter_height, filter_width));
// Use CUDNN_CROSS_CORRELATION, the same mode as the FP32 and FP16 setups, so
// that all three precisions benchmark the same operation.
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, padding_h, padding_w, stride_vertical, stride_horizontal, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_INT32));
// One bias value per output channel.
// NOTE(review): an INT8 bias type may not be accepted by the fused
// bias/activation entry point - verify if this descriptor is ever used.
checkCUDNN(cudnnSetTensor4dDescriptor(biasTensorDesc, CUDNN_TENSOR_NHWC, CUDNN_DATA_INT8, 1, out_channel, 1, 1));
checkCUDNN(cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0));
-Setting INT8-