I am testing Tesla V100 using CUDA 9 and cuDNN 7 (on Windows 10).
I measured good performance for cuBLAS ~90 Tflops on matrix multiplication. However, in cuDNN I measured only low performance and no advantage of tensor cores on V100.
For example, the following code shows only ~14 Tflops. It is unacceptable taking into account NVIDIA’s marketing promises and the price of V100.
Are there parameter settings with which a Tesla V100 benefits from Tensor Cores in cuDNN when computing convolutions?
//--------------------------------------------------------
// Abort the program on any cuDNN error, printing cuDNN's description of it.
// Fixes: the original string literal used typographic quotes (“…”), which is
// not valid C++; and exit(0) signalled success to the shell on failure.
void Check(cudnnStatus_t status)
{
    if (status != CUDNN_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!! GPU program execution error: %s\n", cudnnGetErrorString(status));
        exit(1); // non-zero exit code so scripts/CI can detect the failure
    }
}
// Abort the program on any CUDA runtime error.
// Fixes: typographic quotes in the literal (did not compile); now also prints
// cudaGetErrorString for parity with the cuDNN overload, and exits non-zero.
void Check(cudaError_t status)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "!!! CUDA error: %s\n", cudaGetErrorString(status));
        exit(1);
    }
}
// Return the number of bytes required to store the tensor described by
// tensor_desc, aborting (via Check) if the query fails.
size_t Get_size(cudnnTensorDescriptor_t tensor_desc)
{
    size_t bytes = 0;
    Check(cudnnGetTensorSizeInBytes(tensor_desc, &bytes));
    return bytes;
}
void* Alloc(size_t size)
{
if (size == 0) return 0;
void* x = 0;
cudaMalloc(&x, size);
return x;
}
void main()
{
cudnnHandle_t cudnn;
cudnnCreate(&cudnn);
const int c_in = 128;
const int c_out = 128;
const int xh = 128;
const int xw = 128;
const int n = 128;
const int fh = 4;
const int fw = 4;
float alpha = 0.1f, beta = 0.1f;
cudnnTensorFormat_t data_format = CUDNN_TENSOR_NCHW;
cudnnDataType_t data_type = CUDNN_DATA_HALF;
cudnnTensorDescriptor_t x_desc;
Check(cudnnCreateTensorDescriptor(&x_desc));
Check(cudnnSetTensor4dDescriptor(x_desc, data_format, data_type, n, c_in, xh, xw));
cudnnFilterDescriptor_t f_desc;
Check(cudnnCreateFilterDescriptor(&f_desc));
Check(cudnnSetFilter4dDescriptor(f_desc, data_type, data_format, c_out, c_in, fh, fw));
cudnnConvolutionDescriptor_t conv_desc;
Check(cudnnCreateConvolutionDescriptor(&conv_desc));
Check(cudnnSetConvolution2dDescriptor(
conv_desc,
0, //0, //int pad_h,
0, //0, //int pad_w,
1, //int u,
1, //int v,
1, //0, // dilation_h,
1, //0, // dilation_w,
CUDNN_CONVOLUTION,
CUDNN_DATA_FLOAT));
int n_, c_, h_, w_;
Check(cudnnGetConvolution2dForwardOutputDim(conv_desc, x_desc, f_desc, &n_, &c_, &h_, &w_));
float Gflop = (2.0f * (float)n_ * (float)h_ * (float)w_ * (float)c_out * (float)c_in * (float)fh * (float)fw) / (1000.0f * 1000.0f * 1000.0f);
printf("n_=%d c_=%d h_=%d w_=%d, Gflop=%f\n", n_, c_, h_, w_, Gflop);
cudnnTensorDescriptor_t y_desc;
Check(cudnnCreateTensorDescriptor(&y_desc));
Check(cudnnSetTensor4dDescriptor(y_desc, data_format, data_type, n_, c_, h_, w_));
cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
size_t tmp_size = 0;
Check(cudnnGetConvolutionForwardWorkspaceSize(cudnn, x_desc, f_desc, conv_desc, y_desc, algo, &tmp_size));
size_t f_size = (size_t)c_out * (size_t)c_in * (size_t)fh * (size_t)fw;
printf("tmp_size: %f MB\n", tmp_size * 1.0f / (1024.0*1024.0));
printf("x_size: %f MB\n", Get_size(x_desc) * 1.0f / (1024.0*1024.0));
printf("y_size: %f MB\n", Get_size(y_desc) * 1.0f / (1024.0*1024.0));
printf("f_size: %f MB\n", f_size * 1.0f / (1024.0*1024.0));
void* x = Alloc(Get_size(x_desc));
void* f = Alloc(f_size);
void* y = Alloc(Get_size(y_desc));
void* t = Alloc(tmp_size);
void* y2 = Alloc(Get_size(y_desc));
printf("x = %p, f = %p, y = %p, t = %p\n", x, f, y, t);
const int Q = 4;
cudaStream_t stream[Q];
for (int q = 0; q < Q; q++) Check(cudaStreamCreate(&(stream[q])));
double ti = second();
int U = 10;
for (int u = 0; u < U; u++)
{
for (int q = 0; q < Q; q++)
{
Check(cudnnSetStream(cudnn, stream[q]));
Check(cudnnConvolutionForward(cudnn, &alpha, x_desc, x, f_desc, f,
conv_desc, algo, t, tmp_size, &beta, y_desc, y));
}
}
for (int q = 0; q < Q; q++) Check(cudaStreamSynchronize(stream[q]));
float secs = (float)(second() - ti);
float gflops = (Gflop * U * Q) / secs;
printf("secs: %f, Tflops: %f\n", secs, gflops / 1000.0f);
}