Convolution not running in parallel

Hi guys,

Could you please take a look at this code? Why are these two convolutions not running concurrently? I used nvvp to view the timeline and they are not running concurrently. I also looked at the achieved_occupancy, and it is not high either.

I appreciate it.
Here is my code:

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <vector>
#include <unistd.h>

#include <cuda.h>
#include <cudnn.h>

#define CUDA_CALL(f) { \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
std::cout \
<< " Error occurred: " << err << std::endl; \
std::exit(1); \
} \
}

#define CUDNN_CALL(f) { \
cudnnStatus_t err = (f); \
if (err != CUDNN_STATUS_SUCCESS) { \
std::cout \
<< " Error occurred: " << err << std::endl; \
std::exit(1); \
} \
}

__global__ void dev_const(float *px, float k) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
px[tid] = k;
}

__global__ void dev_iota(float *px) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
px[tid] = tid % 10;
}

void print(const float *data, int n, int c, int h, int w) {
std::vector<float> buffer(1 << 20);
CUDA_CALL(cudaMemcpy(
buffer.data(), data,
n * c * h * w * sizeof(float),
cudaMemcpyDeviceToHost));
int a = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < c; ++j) {
std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
for (int k = 0; k < h; ++k) {
for (int l = 0; l < w; ++l) {
std::cout << std::setw(4) << std::right << buffer[a];
++a;
}
std::cout << std::endl;
}
}
}
std::cout << std::endl;
}

int main() {
cudaStream_t stream1, stream2;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);

cudnnHandle_t cudnn1, cudnn2;

CUDNN_CALL(cudnnCreate(&cudnn1));
CUDNN_CALL(cudnnCreate(&cudnn2));

CUDNN_CALL(cudnnSetStream(
cudnn1,
stream1));
CUDNN_CALL(cudnnSetStream(
cudnn2,
stream2));

// input
const int in_n1 = 128;
const int in_c1 = 112;
const int in_h1 = 14;
const int in_w1 = 14;

const int in_n2 = 128;
const int in_c2 = 24;
const int in_h2 = 14;
const int in_w2 = 14;
std::cout << "in_n: " << in_n1 << std::endl;
std::cout << "in_c: " << in_c1 << std::endl;
std::cout << "in_h: " << in_h1 << std::endl;
std::cout << "in_w: " << in_w1 << std::endl;
std::cout << std::endl;

cudnnTensorDescriptor_t in_desc1;
CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc1));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
in_desc1, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
in_n1, in_c1, in_h1, in_w1));

cudnnTensorDescriptor_t in_desc2;
CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc2));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
in_desc2, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
in_n2, in_c2, in_h2, in_w2));

float *in_data1;
CUDA_CALL(cudaMallocHost(
&in_data1, in_n1 * in_c1 * in_h1 * in_w1 * sizeof(float)));
float *in_data2;
CUDA_CALL(cudaMallocHost(
&in_data2, in_n2 * in_c2 * in_h2 * in_w2 * sizeof(float)));
// filter
const int filt_k1 = 112;
const int filt_c1 = in_c1;
const int filt_h1 = 3;
const int filt_w1 = 3;

const int filt_k2 = 64;
const int filt_c2 = in_c2;
const int filt_h2 = 5;
const int filt_w2 = 5;
std::cout << "filt_k: " << filt_k1 << std::endl;
std::cout << "filt_c: " << filt_c1 << std::endl;
std::cout << "filt_h: " << filt_h1 << std::endl;
std::cout << "filt_w: " << filt_w1 << std::endl;
std::cout << std::endl;

cudnnFilterDescriptor_t filt_desc1;
CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc1));
CUDNN_CALL(cudnnSetFilter4dDescriptor(
filt_desc1, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
filt_k1, filt_c1, filt_h1, filt_w1));

cudnnFilterDescriptor_t filt_desc2;
CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc2));
CUDNN_CALL(cudnnSetFilter4dDescriptor(
filt_desc2, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
filt_k2, filt_c2, filt_h2, filt_w2));

float *filt_data1;
CUDA_CALL(cudaMallocHost(
&filt_data1, filt_k1 * filt_c1 * filt_h1 * filt_w1 * sizeof(float)));
float *filt_data2;
CUDA_CALL(cudaMallocHost(
&filt_data2, filt_k2 * filt_c2 * filt_h2 * filt_w2 * sizeof(float)));
// convolution

const int pad_h1 = (filt_w1 - 1)/2;
const int pad_w1 = (filt_w1 - 1)/2;
const int str_h1 = 1;
const int str_w1 = 1;
const int dil_h1 = 1;
const int dil_w1 = 1;
std::cout << "pad_h: " << pad_h1 << std::endl;
std::cout << "pad_w: " << pad_w1 << std::endl;
std::cout << "str_h: " << str_h1 << std::endl;
std::cout << "str_w: " << str_w1 << std::endl;
std::cout << "dil_h: " << dil_h1 << std::endl;
std::cout << "dil_w: " << dil_w1 << std::endl;
std::cout << std::endl;

cudnnConvolutionDescriptor_t conv_desc1;
CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc1));
// cudnnSetConvolution2dDescriptor_v5(…)
CUDNN_CALL(cudnnSetConvolution2dDescriptor(
conv_desc1,
pad_h1, pad_w1, str_h1, str_w1, dil_h1, dil_w1,
CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

const int pad_h2 = (filt_w2 - 1)/2;
const int pad_w2 = (filt_w2 - 1)/2;
const int str_h2 = 1;
const int str_w2 = 1;
const int dil_h2 = 1;
const int dil_w2 = 1;
std::cout << "pad_h: " << pad_h2 << std::endl;
std::cout << "pad_w: " << pad_w2 << std::endl;
std::cout << "str_h: " << str_h2 << std::endl;
std::cout << "str_w: " << str_w2 << std::endl;
std::cout << "dil_h: " << dil_h2 << std::endl;
std::cout << "dil_w: " << dil_w2 << std::endl;
std::cout << std::endl;

cudnnConvolutionDescriptor_t conv_desc2;
CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc2));
// cudnnSetConvolution2dDescriptor_v5(…)
CUDNN_CALL(cudnnSetConvolution2dDescriptor(
conv_desc2,
pad_h2, pad_w2, str_h2, str_w2, dil_h2, dil_w2,
CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
// output
int out_n1;
int out_c1;
int out_h1;
int out_w1;
int out_n2;
int out_c2;
int out_h2;
int out_w2;

CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
conv_desc1, in_desc1, filt_desc1,
&out_n1, &out_c1, &out_h1, &out_w1));
CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
conv_desc2, in_desc2, filt_desc2,
&out_n2, &out_c2, &out_h2, &out_w2));

std::cout << "out_n: " << out_n1 << std::endl;
std::cout << "out_c: " << out_c1 << std::endl;
std::cout << "out_h: " << out_h1 << std::endl;
std::cout << "out_w: " << out_w1 << std::endl;
std::cout << std::endl;

cudnnTensorDescriptor_t out_desc1;
CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc1));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
out_desc1, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
out_n1, out_c1, out_h1, out_w1));
float *out_data1;
CUDA_CALL(cudaMallocHost(
&out_data1, out_n1 * out_c1 * out_h1 * out_w1 * sizeof(float)));

cudnnTensorDescriptor_t out_desc2;
CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc2));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
out_desc2, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
out_n2, out_c2, out_h2, out_w2));
float *out_data2;
CUDA_CALL(cudaMallocHost(
&out_data2, out_n2 * out_c2 * out_h2 * out_w2 * sizeof(float)));
// algorithm
cudnnConvolutionFwdAlgo_t algo1, algo2;
CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
cudnn1,
in_desc1, filt_desc1, conv_desc1, out_desc1,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo1));

CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
cudnn2,
in_desc2, filt_desc2, conv_desc2, out_desc2,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo2));

// force the FFT algorithm for both convolutions, overriding the algorithms returned above
algo1 = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
//CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
algo2 = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
std::cout << "Convolution algorithm stream1: " << algo1 << std::endl;
std::cout << "Convolution algorithm stream2: " << algo2 << std::endl;
std::cout << std::endl;

// workspace
size_t ws_size1, ws_size2;
CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
cudnn1, in_desc1, filt_desc1, conv_desc1, out_desc1, algo1, &ws_size1));
CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
cudnn2, in_desc2, filt_desc2, conv_desc2, out_desc2, algo2, &ws_size2));

float *ws_data1, *ws_data2;
CUDA_CALL(cudaMallocHost(&ws_data1, ws_size1));
CUDA_CALL(cudaMallocHost(&ws_data2, ws_size2));

std::cout << "Workspace size stream1: " << ws_size1 << std::endl;
std::cout << "Workspace size stream2: " << ws_size2 << std::endl;
std::cout << std::endl;

// perform
float alpha = 1.f;
float beta = 0.f;
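// initialise the inputs and filters; these <<<...>>> launches do not pass a stream argument, so they run on the default stream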
dev_iota<<<in_w1 * in_h1, in_n1 * in_c1>>>(in_data1);
dev_iota<<<in_w2 * in_h2, in_n2 * in_c2>>>(in_data2);
dev_const<<<filt_w1 * filt_h1, filt_k1 * filt_c1>>>(filt_data1, 1.f);
dev_const<<<filt_w2 * filt_h2, filt_k2 * filt_c2>>>(filt_data2, 1.f);
int conv_repeat = 1;
for (int i = 0; i < conv_repeat; i++){
CUDNN_CALL(cudnnConvolutionForward(
cudnn1,
&alpha, in_desc1, in_data1, filt_desc1, filt_data1,
conv_desc1, algo1, ws_data1, ws_size1,
&beta, out_desc1, out_data1));

CUDNN_CALL(cudnnConvolutionForward(
    cudnn2,
    &alpha, in_desc2, in_data2, filt_desc2, filt_data2,
    conv_desc2, algo2, ws_data2, ws_size2,
    &beta, out_desc2, out_data2));

}

cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);

// results
std::cout << "in_data:" << std::endl;
//print(in_data, in_n, in_c, in_h, in_w);

std::cout << "filt_data:" << std::endl;
//print(filt_data, filt_k, filt_c, filt_h, filt_w);

std::cout << "out_data:" << std::endl;
//print(out_data, out_n, out_c, out_h, out_w);

// finalizing
CUDA_CALL(cudaFreeHost(ws_data1));
CUDA_CALL(cudaFreeHost(ws_data2));
CUDA_CALL(cudaFreeHost(out_data1));
CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc1));
CUDA_CALL(cudaFreeHost(out_data2));
CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc2));
CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc1));
CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc2));
CUDA_CALL(cudaFreeHost(filt_data1));
CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc1));
CUDA_CALL(cudaFreeHost(filt_data2));
CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc2));
CUDA_CALL(cudaFreeHost(in_data1));
CUDA_CALL(cudaFreeHost(in_data2));
CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc1));
CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc2));
CUDNN_CALL(cudnnDestroy(cudnn1));
CUDNN_CALL(cudnnDestroy(cudnn2));
return 0;
}

Hi,

Could you please share the NVIDIA profiler output as well so we can help better?
Also, can you provide details on the platform you are using (a small snippet for printing the CUDA/cuDNN versions is sketched after this list):
o Linux distro and version
o GPU type
o Nvidia driver version
o CUDA version
o CUDNN version
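
If it is easier, the CUDA and cuDNN versions can also be printed from a tiny standalone program; the driver version itself is easiest to read from nvidia-smi. The snippet below is only a sketch (file and program names are arbitrary) and assumes it is compiled and linked against the same CUDA toolkit and cuDNN used to build the example above, e.g. nvcc check_versions.cpp -lcudnn -o check_versions:

#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

int main() {
  // CUDA version supported by the installed driver, and the runtime version,
  // reported as integers (e.g. 10000 means CUDA 10.0).
  int driver_version = 0, runtime_version = 0;
  cudaDriverGetVersion(&driver_version);
  cudaRuntimeGetVersion(&runtime_version);
  std::cout << "CUDA driver version:  " << driver_version << std::endl;
  std::cout << "CUDA runtime version: " << runtime_version << std::endl;
  // Version of the cudnn.h header this file was compiled against...
  std::cout << "cudnn.h version: " << CUDNN_MAJOR << "."
            << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL << std::endl;
  // ...and version of the libcudnn that is actually loaded at run time.
  std::cout << "cudnnGetVersion(): " << cudnnGetVersion() << std::endl;
  return 0;
}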

Thanks

1. I am using Linux CentOS 7
2. GPU type: Tesla K40c
3. nvcc 10.0.130 as the NVIDIA version
4. CUDA version: 10.0.130
5. cuDNN: I am not sure; I think it is the same as CUDA

Here is the result from the profiler:

==31335== NVPROF is profiling process 31335, command: ./conv
==31335== Warning: Profiling results might be incorrect with current version of nvcc compiler used to compile cuda app. Compile with nvcc compiler 9.0 or later version to get correct profiling results. Ignore this warning if code is already compiled with the recommended nvcc version
==31335== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==31335== Profiling application: ./conv
==31335== Profiling result:
==31335== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K40c (0)"
Kernel: void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float, float*, int2, int, int)
1 sm_efficiency Multiprocessor Activity 5.26% 5.26% 5.26%
1 achieved_occupancy Achieved Occupancy 0.029510 0.029510 0.029510
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 45.679MB/s 45.679MB/s 45.679MB/s
1 sm_efficiency_instance Multiprocessor Activity 5.26% 5.26% 5.26%
1 dram_read_throughput Device Memory Read Throughput 943.42MB/s 943.42MB/s 943.42MB/s
1 dram_write_throughput Device Memory Write Throughput 3.7289MB/s 3.7289MB/s 3.7289MB/s
1 gst_throughput Global Store Throughput 57.798MB/s 57.798MB/s 57.798MB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 41.99% 41.99% 41.99%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 79.03% 79.03% 79.03%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
2 sm_efficiency Multiprocessor Activity 14.42% 17.43% 15.93%
2 achieved_occupancy Achieved Occupancy 0.126549 0.127685 0.127117
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 509.89MB/s 1.0659GB/s 881.10MB/s
2 sm_efficiency_instance Multiprocessor Activity 14.42% 17.43% 15.93%
2 dram_read_throughput Device Memory Read Throughput 82.928MB/s 165.24MB/s 112.70MB/s
2 dram_write_throughput Device Memory Write Throughput 0.00000B/s 5.3502MB/s 3.4151MB/s
2 gst_throughput Global Store Throughput 1.9917GB/s 4.2634GB/s 3.4418GB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 25.00% 25.00% 25.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: dev_iota(float*)
2 sm_efficiency Multiprocessor Activity 17.59% 21.79% 19.69%
2 achieved_occupancy Achieved Occupancy 0.160337 0.170173 0.165255
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 1.9316GB/s 1.9878GB/s 1.9593GB/s
2 sm_efficiency_instance Multiprocessor Activity 17.59% 21.79% 19.69%
2 dram_read_throughput Device Memory Read Throughput 171.56MB/s 186.94MB/s 179.14MB/s
2 dram_write_throughput Device Memory Write Throughput 0.00000B/s 20.184MB/s 10.236MB/s
2 gst_throughput Global Store Throughput 1.9316GB/s 1.9878GB/s 1.9593GB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_r2c_16x16(float2*, float const *, int, int, int, int, int, int, int, int)
2 sm_efficiency Multiprocessor Activity 4.52% 4.76% 4.64%
2 achieved_occupancy Achieved Occupancy 0.124755 0.124826 0.124790
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 887.43MB/s 1.0165GB/s 958.04MB/s
2 sm_efficiency_instance Multiprocessor Activity 4.52% 4.76% 4.64%
2 dram_read_throughput Device Memory Read Throughput 83.125MB/s 529.99MB/s 324.34MB/s
2 dram_write_throughput Device Memory Write Throughput 0.00000B/s 3.6141MB/s 1.6632MB/s
2 gst_throughput Global Store Throughput 887.43MB/s 1.0165GB/s 958.04MB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 25.00% 25.00% 25.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_r2c_32x32<float, bool=0, unsigned int=5, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1 sm_efficiency Multiprocessor Activity 4.07% 4.07% 4.07%
1 achieved_occupancy Achieved Occupancy 0.248071 0.248071 0.248071
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 3.6647GB/s 3.6647GB/s 3.6647GB/s
1 sm_efficiency_instance Multiprocessor Activity 4.07% 4.07% 4.07%
1 dram_read_throughput Device Memory Read Throughput 334.56MB/s 334.56MB/s 334.56MB/s
1 dram_write_throughput Device Memory Write Throughput 6.8982MB/s 6.8982MB/s 6.8981MB/s
1 gst_throughput Global Store Throughput 3.6647GB/s 3.6647GB/s 3.6647GB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 50.00% 50.00% 50.00%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2 const *, float2 const *, float2, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
2 sm_efficiency Multiprocessor Activity 88.19% 93.36% 90.78%
2 achieved_occupancy Achieved Occupancy 0.273947 0.298514 0.286230
2 gld_requested_throughput Requested Global Load Throughput 135.05MB/s 181.35MB/s 169.21MB/s
2 gst_requested_throughput Requested Global Store Throughput 22.508MB/s 30.224MB/s 28.201MB/s
2 sm_efficiency_instance Multiprocessor Activity 88.19% 93.36% 90.78%
2 dram_read_throughput Device Memory Read Throughput 78.227MB/s 151.31MB/s 97.391MB/s
2 dram_write_throughput Device Memory Write Throughput 3.7514MB/s 46.892MB/s 35.579MB/s
2 gst_throughput Global Store Throughput 90.033MB/s 120.90MB/s 112.80MB/s
2 gld_throughput Global Load Throughput 540.20MB/s 725.38MB/s 676.82MB/s
2 shared_efficiency Shared Memory Efficiency 90.00% 90.00% 90.00%
2 gld_efficiency Global Memory Load Efficiency 25.00% 25.00% 25.00%
2 gst_efficiency Global Memory Store Efficiency 25.00% 25.00% 25.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1 sm_efficiency Multiprocessor Activity 5.05% 5.05% 5.05%
1 achieved_occupancy Achieved Occupancy 0.249238 0.249238 0.249238
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 2.1110GB/s 2.1110GB/s 2.1110GB/s
1 sm_efficiency_instance Multiprocessor Activity 5.05% 5.05% 5.05%
1 dram_read_throughput Device Memory Read Throughput 508.63MB/s 508.63MB/s 508.63MB/s
1 dram_write_throughput Device Memory Write Throughput 1.9868MB/s 1.9868MB/s 1.9868MB/s
1 gst_throughput Global Store Throughput 2.1110GB/s 2.1110GB/s 2.1110GB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 42.27% 42.27% 42.27%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_c2r_16x16<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)
1 sm_efficiency Multiprocessor Activity 5.11% 5.11% 5.11%
1 achieved_occupancy Achieved Occupancy 0.023291 0.023291 0.023291
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 44.484MB/s 44.484MB/s 44.484MB/s
1 sm_efficiency_instance Multiprocessor Activity 5.11% 5.11% 5.11%
1 dram_read_throughput Device Memory Read Throughput 352.24MB/s 352.24MB/s 352.24MB/s
1 dram_write_throughput Device Memory Write Throughput 7.2626MB/s 7.2626MB/s 7.2626MB/s
1 gst_throughput Global Store Throughput 58.101MB/s 58.101MB/s 58.101MB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 27.49% 27.49% 27.49%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 76.56% 76.56% 76.56%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: dev_const(float*, float)
2 sm_efficiency Multiprocessor Activity 11.57% 19.58% 15.57%
2 achieved_occupancy Achieved Occupancy 0.015620 0.025804 0.020712
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 120.04MB/s 331.14MB/s 225.96MB/s
2 sm_efficiency_instance Multiprocessor Activity 11.57% 19.58% 15.57%
2 dram_read_throughput Device Memory Read Throughput 166.73MB/s 264.91MB/s 215.99MB/s
2 dram_write_throughput Device Memory Write Throughput 13.245MB/s 13.338MB/s 13.291MB/s
2 gst_throughput Global Store Throughput 120.04MB/s 331.14MB/s 225.96MB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)

Could you please verify the issue with the latest cudnn-8.0.0 Preview and/or cudnn-7.6.5 GA?
Let us know in case the issue persists.

Thanks