cuDNN: cudnnConvolutionForward very bad performance (very long execution time) on Xavier

Hi,
we tried to use the convolution function from the cuDNN library and measured the running time of the cudnnConvolutionForward function; the function takes a very long time to run.
The parameters of our input image are: width: 4096, height: 128, batch size: 1.
The kernel mask is 7x7 and all inputs/outputs are floating point (32-bit).
We measured that the function takes about 2.6 ms to run.
We tried the NPP library function nppiFilter_32f_C1R and got 0.5 ms.
Our own implementation took about 0.18 ms.
We expected cuDNN to give the best results because it uses Tensor Cores.

Please help us understand what is wrong with our code, which is included in full below.
We used the following command to compile: nvcc -O0 CUDNNConvolution.cpp -lcudnn -o CUDNNConvolution

// NOTE: the original post's angle-bracketed header names were stripped by the
// forum formatter and its quotes were converted to smart quotes; the include
// list below is reconstructed from what the code uses (printf, std::cout,
// clock_gettime) — confirm against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include "time.h"
#include "cuda_fp16.h"
#include <cudnn.h>

// Filter geometry: 7x7 kernel with symmetric zero padding of FILTER_RADIUS (3)
// on each side, so the convolution output keeps the input's spatial size.
#define FILTER_WIDTH_MAX 7
#define FILTER_PADDING (FILTER_WIDTH_MAX - 1)
#define FILTER_RADIUS (FILTER_PADDING/2)

// Unit-conversion factors for clock_gettime() deltas (seconds -> ns, ns -> us/ms).
#define SEC2NANO 1000000000.0f
#define NANOTOMICRO 1000.0f
#define NANOTOMILI 1000000.0f

// Benchmark image: 4096x128, single channel, FP32.
#define IMAGE_WIDTH (4096)
#define IMAGE_HEIGHT (128*1)

// Batch of 1, and the convolution is timed over a single iteration.
// NOTE(review): with NUM_ITERATIONS == 1 and no warm-up run, the measurement
// includes one-time CUDA/cuDNN initialization cost — likely the reason the
// measured time far exceeds NPP or a hand-written kernel.
#define BATCH 1
#define NUM_ITERATIONS 1

int main(void)
{
    // Benchmark one 2D convolution through cuDNN:
    //   input : BATCH x 1 x IMAGE_HEIGHT x IMAGE_WIDTH, FP32, NCHW
    //   filter: 1 x 1 x 7 x 7 box filter (all taps 1/49)
    //   output: same spatial size (padding FILTER_RADIUS, stride 1)
    // The input is the constant 49, so every interior output pixel should be
    // 49.0 — a quick correctness check via the (commented-out) dump at the end.
    int priority_high, priority_low;
    cudnnHandle_t cudnn;
    cudaStream_t stream;
    cudnnStatus_t sts;
    cudnnTensorDescriptor_t input_descriptor;
    cudnnTensorDescriptor_t output_descriptor;
    cudnnFilterDescriptor_t kernel_descriptor;
    cudnnConvolutionDescriptor_t convolution_descriptor;
    cudnnConvolutionFwdAlgo_t convolution_algorithm;
    size_t workspace_bytes = 0;
    void* d_workspace = NULL;        // fix: was uninitialized, yet passed to cudnnConvolutionForward when workspace_bytes == 0
    const float alpha = 1, beta = 0; // y = alpha * conv(x) + beta * y
    cudaError_t cuError;
    // Output dimensions reported by cuDNN from the input/filter/padding setup.
    int out_n;
    int out_c;
    int out_h;
    int out_w;

    float *input_mat;
    float *output_mat;
    float *Mask;
    unsigned int nBytes = IMAGE_WIDTH * IMAGE_HEIGHT * BATCH * sizeof(float);

    ////////////////////////////////////////
    timespec cpu_start;
    timespec cpu_end;
    long long diffNano;
    double TimeArr[NUM_ITERATIONS];
    double TotalTime = 0.0;          // fix: was read uninitialized by "+=" below
    double AverageTime;

    ////////////////////////////////////////
    // Managed memory is visible to both the CPU (init loops below) and the GPU.
    // NOTE(review): the first GPU access of managed memory triggers page
    // migration; the warm-up convolution below keeps that cost out of the
    // timed region (cudaMemPrefetchAsync would also work).
    cudaMallocManaged((void **)&input_mat, nBytes);
    cudaMallocManaged((void **)&output_mat, nBytes);
    // fix: the '*' operators were lost in the forum paste
    cudaMallocManaged((void **)&Mask, FILTER_WIDTH_MAX * FILTER_WIDTH_MAX * sizeof(float));

    // Box filter: every tap is 1/49.
    for (int i = 0; i < FILTER_WIDTH_MAX * FILTER_WIDTH_MAX; i++)
    {
        Mask[i] = (1.0f / 49);
    }

    // Constant input so the expected output is trivially known.
    for (int j = 0; j < IMAGE_WIDTH * IMAGE_HEIGHT; j++)
    {
        input_mat[j] = (49);
    }

    // Run everything on a dedicated high-priority, non-blocking stream.
    cudaDeviceGetStreamPriorityRange(&priority_low, &priority_high);
    cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, priority_high);

    sts = cudnnCreate(&cudnn);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnCreate!!!!\n");
        return -1;
    }

    sts = cudnnSetStream(cudnn, stream);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnSetStream!!!!\n");
        return -1;
    }

    // Input tensor descriptor: NCHW, FP32, one channel.
    sts = cudnnCreateTensorDescriptor(&input_descriptor);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnCreateTensorDescriptor for the input_descriptor!!!!\n");
        return -1;
    }
    sts = cudnnSetTensor4dDescriptor(input_descriptor,
                                     /*format=*/CUDNN_TENSOR_NCHW,
                                     /*dataType=*/CUDNN_DATA_FLOAT,
                                     /*batch_size=*/BATCH,
                                     /*channels=*/1,
                                     /*image_height=*/IMAGE_HEIGHT,
                                     /*image_width=*/IMAGE_WIDTH);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnSetTensor4dDescriptor for the input_descriptor!!!!\n");
        return -1;
    }

    // Filter descriptor: 1 output channel, 1 input channel, 7x7 kernel.
    sts = cudnnCreateFilterDescriptor(&kernel_descriptor);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnCreateFilterDescriptor for the kernel_descriptor!!!!\n");
        return -1;
    }
    sts = cudnnSetFilter4dDescriptor(kernel_descriptor,
                                     /*dataType=*/CUDNN_DATA_FLOAT,
                                     /*format=*/CUDNN_TENSOR_NCHW,
                                     /*out_channels=*/1,
                                     /*in_channels=*/1,
                                     /*kernel_height=*/FILTER_WIDTH_MAX,
                                     /*kernel_width=*/FILTER_WIDTH_MAX);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnSetFilter4dDescriptor for the kernel_descriptor!!!!\n");
        return -1;
    }

    // Convolution descriptor: "same" padding, stride 1, no dilation.
    // NOTE(review): CUDNN_CONVOLUTION flips the kernel; NPP and most
    // hand-written filters compute cross-correlation. For this symmetric box
    // filter the results are identical, but for a general kernel use
    // CUDNN_CROSS_CORRELATION to match those implementations.
    sts = cudnnCreateConvolutionDescriptor(&convolution_descriptor);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnCreateConvolutionDescriptor!!!!\n");
        return -1;
    }
    sts = cudnnSetConvolution2dDescriptor(convolution_descriptor,
                                          /*pad_height=*/FILTER_RADIUS,
                                          /*pad_width=*/FILTER_RADIUS,
                                          /*vertical_stride=*/1,
                                          /*horizontal_stride=*/1,
                                          /*dilation_height=*/1,
                                          /*dilation_width=*/1,
                                          /*mode=*/CUDNN_CONVOLUTION,
                                          /*computeType=*/CUDNN_DATA_FLOAT);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnSetConvolution2dDescriptor!!!!\n");
        return -1;
    }

    // Allow cuDNN to down-convert to FP16 internally so Tensor Cores can be
    // used. NOTE(review): with a single input/output channel there is very
    // little Tensor Core work to exploit — confirm with a profiler.
    sts = cudnnSetConvolutionMathType(convolution_descriptor, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnSetConvolutionMathType!!!!\n");
        return -1;
    }

    // Let cuDNN compute the output dimensions (should equal the input's here).
    sts = cudnnGetConvolution2dForwardOutputDim(convolution_descriptor, input_descriptor, kernel_descriptor,
                                                &out_n, &out_c, &out_h, &out_w);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnGetConvolution2dForwardOutputDim!!!!\n");
        return -1;
    }

    std::cout << "out_n: " << out_n << std::endl;
    std::cout << "out_c: " << out_c << std::endl;
    std::cout << "out_h: " << out_h << std::endl;
    std::cout << "out_w: " << out_w << std::endl;
    std::cout << std::endl;

    // Output tensor descriptor, sized from the dimensions cuDNN reported.
    sts = cudnnCreateTensorDescriptor(&output_descriptor);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnCreateTensorDescriptor for the output_descriptor!!!!\n");
        return -1;
    }
    sts = cudnnSetTensor4dDescriptor(output_descriptor,
                                     /*format=*/CUDNN_TENSOR_NCHW,
                                     /*dataType=*/CUDNN_DATA_FLOAT,
                                     /*batch_size=*/BATCH,
                                     /*channels=*/1,
                                     /*image_height=*/out_h,
                                     /*image_width=*/out_w);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnSetTensor4dDescriptor for the output_descriptor!!!!\n");
        return -1;
    }

    // Ask cuDNN for the fastest algorithm with no workspace limit.
    // NOTE(review): this heuristic query is deprecated/removed in cuDNN 8;
    // prefer cudnnFindConvolutionForwardAlgorithm(), which actually benchmarks
    // the candidates and often picks a faster algorithm.
    sts = cudnnGetConvolutionForwardAlgorithm(cudnn,
                                              input_descriptor,
                                              kernel_descriptor,
                                              convolution_descriptor,
                                              output_descriptor,
                                              CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
                                              /*memoryLimitInBytes=*/0,
                                              &convolution_algorithm);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnGetConvolutionForwardAlgorithm!!!!\n");
        return -1;
    }

    // Query and allocate the scratch workspace the chosen algorithm needs.
    sts = cudnnGetConvolutionForwardWorkspaceSize(cudnn,
                                                  input_descriptor,
                                                  kernel_descriptor,
                                                  convolution_descriptor,
                                                  output_descriptor,
                                                  convolution_algorithm,
                                                  &workspace_bytes);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnGetConvolutionForwardWorkspaceSize!!!!\n");
        return -1;
    }
    if (workspace_bytes > 0)
    {
        cuError = cudaMalloc(&d_workspace, workspace_bytes);
        if (cuError != cudaSuccess)  // fix: original compared cudaSuccess with itself (always false)
        {
            printf("Failed to call to cudaMalloc for d_workspace!!!!\n");
            return -1;
        }
    }

    // Warm-up run (NOT timed). The first cudnnConvolutionForward call pays
    // one-time costs: CUDA context/cuDNN kernel-module loading and
    // managed-memory page migration to the GPU. Including it in the timed
    // region is the most likely explanation for the reported 2.6 ms.
    sts = cudnnConvolutionForward(cudnn,
                                  &alpha,
                                  input_descriptor,
                                  input_mat,
                                  kernel_descriptor,
                                  Mask,
                                  convolution_descriptor,
                                  convolution_algorithm,
                                  d_workspace,
                                  workspace_bytes,
                                  &beta,
                                  output_descriptor,
                                  output_mat);
    if (sts != CUDNN_STATUS_SUCCESS)
    {
        printf("Failed to call to cudnnConvolutionForward!!!\n");
        return -1;
    }
    cudaStreamSynchronize(stream);

    // Timed region: launch + synchronize per iteration, measured with the
    // raw monotonic CPU clock.
    for (int i = 0; i < NUM_ITERATIONS; i++)
    {
        clock_gettime(CLOCK_MONOTONIC_RAW, &cpu_start);
        sts = cudnnConvolutionForward(cudnn,
                                      &alpha,
                                      input_descriptor,
                                      input_mat,
                                      kernel_descriptor,
                                      Mask,
                                      convolution_descriptor,
                                      convolution_algorithm,
                                      d_workspace,
                                      workspace_bytes,
                                      &beta,
                                      output_descriptor,
                                      output_mat);
        if (sts != CUDNN_STATUS_SUCCESS)
        {
            printf("Failed to call to cudnnConvolutionForward!!!\n");
            return -1;
        }
        // The launch is asynchronous — wait for completion before stopping the clock.
        cudaStreamSynchronize(stream);
        clock_gettime(CLOCK_MONOTONIC_RAW, &cpu_end);
        diffNano = (cpu_end.tv_sec - cpu_start.tv_sec) * SEC2NANO + cpu_end.tv_nsec - cpu_start.tv_nsec;
        TimeArr[i] = (double)diffNano / NANOTOMILI;  // nanoseconds -> milliseconds
        TotalTime += TimeArr[i];
    }
    AverageTime = TotalTime / NUM_ITERATIONS;
    printf("The AverageTime time measured by CPU is: %f\n", AverageTime);

    /*
    // Optional dump for verification: every interior pixel should be 49.0.
    for (int i = 0; i < out_h; i++)
    {
        for (int j = 0; j < out_w; j++)
        {
            printf("%3.1f ", output_mat[i * out_w + j]);
        }
        printf("\n");
    }
    */

    // Release resources in reverse order of creation (the original leaked all of these).
    if (d_workspace != NULL)
    {
        cudaFree(d_workspace);
    }
    cudnnDestroyTensorDescriptor(output_descriptor);
    cudnnDestroyConvolutionDescriptor(convolution_descriptor);
    cudnnDestroyFilterDescriptor(kernel_descriptor);
    cudnnDestroyTensorDescriptor(input_descriptor);
    cudnnDestroy(cudnn);
    cudaStreamDestroy(stream);
    cudaFree(Mask);
    cudaFree(output_mat);
    cudaFree(input_mat);
    return 0;
}

Hi,

Have you enabled performance mode on the Xavier first?

sudo nvpmodel -m 0
sudo jetson_clocks.sh

cudnnConvolutionForward with the CUDNN_TENSOR_NCHW_VECT_C format can reach 22 TOPS.
Thanks.

Hi,
Thanks for your response.
Of course we are using the Xavier's full performance mode.
When you wrote
"cudnnConvolutionForward with type CUDNN_TENSOR_NCHW_VECT_C can reach 22 TOPS,"
did you mean that this figure is for the FP32 type or for another type?
As we mentioned before, we got poor performance.
Have you tried this function?
Does this function's implementation take advantage of the DLA (Deep Learning Accelerator) automatically?

Please advise,
Thanks

Hi,

That figure is for INT8 mode. Our QA team and some forum users have tested it before.

For DLA:
TensorRT is currently the only API that can use the DLA.
You can always enable DLA mode, and TensorRT will automatically fall back to the GPU if a layer is not supported.
But we do not yet have a dynamic scheduler for deciding whether a model runs on the DLA or the GPU.

Thanks.