Convolution not running in parallel

Hi guys,

Could you please take a look at this code? Why are these two convolutions not running concurrently? I used nvvp to view the timeline and they are not running concurrently. I also looked at the achieved_occupancy, and it is not high either.

I appreciate it.
Here is my code:

#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <vector>
#include <unistd.h>

#include <cuda.h>
#include <cudnn.h>

#define CUDA_CALL(f) { \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
std::cout \
<< " Error occurred: " << err << std::endl; \
std::exit(1); \
} \
}

#define CUDNN_CALL(f) { \
cudnnStatus_t err = (f); \
if (err != CUDNN_STATUS_SUCCESS) { \
std::cout \
<< " Error occurred: " << err << std::endl; \
std::exit(1); \
} \
}

__global__ void dev_const(float *px, float k) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
px[tid] = k;
}

__global__ void dev_iota(float *px) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
px[tid] = tid % 10;
}

void print(const float *data, int n, int c, int h, int w) {
std::vector<float> buffer(1 << 20);
CUDA_CALL(cudaMemcpy(
buffer.data(), data,
n * c * h * w * sizeof(float),
cudaMemcpyDeviceToHost));
int a = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < c; ++j) {
std::cout << "n=" << i << ", c=" << j << ":" << std::endl;
for (int k = 0; k < h; ++k) {
for (int l = 0; l < w; ++l) {
std::cout << std::setw(4) << std::right << buffer[a];
++a;
}
std::cout << std::endl;
}
}
}
std::cout << std::endl;
}

int main() {
cudaStream_t stream1, stream2;
cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);

cudnnHandle_t cudnn1, cudnn2;

CUDNN_CALL(cudnnCreate(&cudnn1));
CUDNN_CALL(cudnnCreate(&cudnn2));

CUDNN_CALL(cudnnSetStream(
cudnn1,
stream1));
CUDNN_CALL(cudnnSetStream(
cudnn2,
stream2));

// input
const int in_n1 = 128;
const int in_c1 = 112;
const int in_h1 = 14;
const int in_w1 = 14;

const int in_n2 = 128;
const int in_c2 = 24;
const int in_h2 = 14;
const int in_w2 = 14;
std::cout << "in_n: " << in_n1 << std::endl;
std::cout << "in_c: " << in_c1 << std::endl;
std::cout << "in_h: " << in_h1 << std::endl;
std::cout << "in_w: " << in_w1 << std::endl;
std::cout << std::endl;

cudnnTensorDescriptor_t in_desc1;
CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc1));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
in_desc1, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
in_n1, in_c1, in_h1, in_w1));

cudnnTensorDescriptor_t in_desc2;
CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc2));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
in_desc2, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
in_n2, in_c2, in_h2, in_w2));

float *in_data1;
CUDA_CALL(cudaMallocHost(
&in_data1, in_n1 * in_c1 * in_h1 * in_w1 * sizeof(float)));
float *in_data2;
CUDA_CALL(cudaMallocHost(
&in_data2, in_n2 * in_c2 * in_h2 * in_w2 * sizeof(float)));
// filter
const int filt_k1 = 112;
const int filt_c1 = in_c1;
const int filt_h1 = 3;
const int filt_w1 = 3;

const int filt_k2 = 64;
const int filt_c2 = in_c2;
const int filt_h2 = 5;
const int filt_w2 = 5;
std::cout << "filt_k: " << filt_k1 << std::endl;
std::cout << "filt_c: " << filt_c1 << std::endl;
std::cout << "filt_h: " << filt_h1 << std::endl;
std::cout << "filt_w: " << filt_w1 << std::endl;
std::cout << std::endl;

cudnnFilterDescriptor_t filt_desc1;
CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc1));
CUDNN_CALL(cudnnSetFilter4dDescriptor(
filt_desc1, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
filt_k1, filt_c1, filt_h1, filt_w1));

cudnnFilterDescriptor_t filt_desc2;
CUDNN_CALL(cudnnCreateFilterDescriptor(&filt_desc2));
CUDNN_CALL(cudnnSetFilter4dDescriptor(
filt_desc2, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
filt_k2, filt_c2, filt_h2, filt_w2));

float *filt_data1;
CUDA_CALL(cudaMallocHost(
&filt_data1, filt_k1 * filt_c1 * filt_h1 * filt_w1 * sizeof(float)));
float *filt_data2;
CUDA_CALL(cudaMallocHost(
&filt_data2, filt_k2 * filt_c2 * filt_h2 * filt_w2 * sizeof(float)));
// convolution

const int pad_h1 = (filt_w1 - 1)/2;
const int pad_w1 = (filt_w1 - 1)/2;
const int str_h1 = 1;
const int str_w1 = 1;
const int dil_h1 = 1;
const int dil_w1 = 1;
std::cout << "pad_h: " << pad_h1 << std::endl;
std::cout << "pad_w: " << pad_w1 << std::endl;
std::cout << "str_h: " << str_h1 << std::endl;
std::cout << "str_w: " << str_w1 << std::endl;
std::cout << "dil_h: " << dil_h1 << std::endl;
std::cout << "dil_w: " << dil_w1 << std::endl;
std::cout << std::endl;

cudnnConvolutionDescriptor_t conv_desc1;
CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc1));
// cudnnSetConvolution2dDescriptor_v5(…)
CUDNN_CALL(cudnnSetConvolution2dDescriptor(
conv_desc1,
pad_h1, pad_w1, str_h1, str_w1, dil_h1, dil_w1,
CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));

const int pad_h2 = (filt_w2 - 1)/2;
const int pad_w2 = (filt_w2 - 1)/2;
const int str_h2 = 1;
const int str_w2 = 1;
const int dil_h2 = 1;
const int dil_w2 = 1;
std::cout << "pad_h: " << pad_h2 << std::endl;
std::cout << "pad_w: " << pad_w2 << std::endl;
std::cout << "str_h: " << str_h2 << std::endl;
std::cout << "str_w: " << str_w2 << std::endl;
std::cout << "dil_h: " << dil_h2 << std::endl;
std::cout << "dil_w: " << dil_w2 << std::endl;
std::cout << std::endl;

cudnnConvolutionDescriptor_t conv_desc2;
CUDNN_CALL(cudnnCreateConvolutionDescriptor(&conv_desc2));
// cudnnSetConvolution2dDescriptor_v5(…)
CUDNN_CALL(cudnnSetConvolution2dDescriptor(
conv_desc2,
pad_h2, pad_w2, str_h2, str_w2, dil_h2, dil_w2,
CUDNN_CONVOLUTION, CUDNN_DATA_FLOAT));
// output
int out_n1;
int out_c1;
int out_h1;
int out_w1;
int out_n2;
int out_c2;
int out_h2;
int out_w2;

CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
conv_desc1, in_desc1, filt_desc1,
&out_n1, &out_c1, &out_h1, &out_w1));
CUDNN_CALL(cudnnGetConvolution2dForwardOutputDim(
conv_desc2, in_desc2, filt_desc2,
&out_n2, &out_c2, &out_h2, &out_w2));

std::cout << "out_n: " << out_n1 << std::endl;
std::cout << "out_c: " << out_c1 << std::endl;
std::cout << "out_h: " << out_h1 << std::endl;
std::cout << "out_w: " << out_w1 << std::endl;
std::cout << std::endl;

cudnnTensorDescriptor_t out_desc1;
CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc1));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
out_desc1, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
out_n1, out_c1, out_h1, out_w1));
float *out_data1;
CUDA_CALL(cudaMallocHost(
&out_data1, out_n1 * out_c1 * out_h1 * out_w1 * sizeof(float)));

cudnnTensorDescriptor_t out_desc2;
CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc2));
CUDNN_CALL(cudnnSetTensor4dDescriptor(
out_desc2, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
out_n2, out_c2, out_h2, out_w2));
float *out_data2;
CUDA_CALL(cudaMallocHost(
&out_data2, out_n2 * out_c2 * out_h2 * out_w2 * sizeof(float)));
// algorithm
cudnnConvolutionFwdAlgo_t algo1, algo2;
CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
cudnn1,
in_desc1, filt_desc1, conv_desc1, out_desc1,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo1));

CUDNN_CALL(cudnnGetConvolutionForwardAlgorithm(
cudnn2,
in_desc2, filt_desc2, conv_desc2, out_desc2,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo2));

// force the FFT algorithm for both convolutions, overriding the algorithms returned above
algo1 = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
//CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
algo2 = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
std::cout << "Convolution algorithm stream1: " << algo1 << std::endl;
std::cout << "Convolution algorithm stream2: " << algo2 << std::endl;
std::cout << std::endl;

// workspace
size_t ws_size1, ws_size2;
CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
cudnn1, in_desc1, filt_desc1, conv_desc1, out_desc1, algo1, &ws_size1));
CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(
cudnn2, in_desc2, filt_desc2, conv_desc2, out_desc2, algo2, &ws_size2));

float *ws_data1, *ws_data2;
CUDA_CALL(cudaMallocHost(&ws_data1, ws_size1));
CUDA_CALL(cudaMallocHost(&ws_data2, ws_size2));

std::cout << "Workspace size stream1: " << ws_size1 << std::endl;
std::cout << "Workspace size stream2: " << ws_size2 << std::endl;
std::cout << std::endl;

// perform
float alpha = 1.f;
float beta = 0.f;
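// initialise the inputs and filters; these <<<...>>> launches do not pass a stream argument, so they run on the default stream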
dev_iota<<<in_w1 * in_h1, in_n1 * in_c1>>>(in_data1);
dev_iota<<<in_w2 * in_h2, in_n2 * in_c2>>>(in_data2);
dev_const<<<filt_w1 * filt_h1, filt_k1 * filt_c1>>>(filt_data1, 1.f);
dev_const<<<filt_w2 * filt_h2, filt_k2 * filt_c2>>>(filt_data2, 1.f);
int conv_repeat = 1;
for (int i = 0; i < conv_repeat; i++){
CUDNN_CALL(cudnnConvolutionForward(
cudnn1,
&alpha, in_desc1, in_data1, filt_desc1, filt_data1,
conv_desc1, algo1, ws_data1, ws_size1,
&beta, out_desc1, out_data1));

CUDNN_CALL(cudnnConvolutionForward(
    cudnn2,
    &alpha, in_desc2, in_data2, filt_desc2, filt_data2,
    conv_desc2, algo2, ws_data2, ws_size2,
    &beta, out_desc2, out_data2));

}

cudaStreamSynchronize(stream1);
cudaStreamSynchronize(stream2);

// results
std::cout << "in_data:" << std::endl;
//print(in_data, in_n, in_c, in_h, in_w);

std::cout << "filt_data:" << std::endl;
//print(filt_data, filt_k, filt_c, filt_h, filt_w);

std::cout << "out_data:" << std::endl;
//print(out_data, out_n, out_c, out_h, out_w);

// finalizing
CUDA_CALL(cudaFreeHost(ws_data1));
CUDA_CALL(cudaFreeHost(ws_data2));
CUDA_CALL(cudaFreeHost(out_data1));
CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc1));
CUDA_CALL(cudaFreeHost(out_data2));
CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc2));
CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc1));
CUDNN_CALL(cudnnDestroyConvolutionDescriptor(conv_desc2));
CUDA_CALL(cudaFreeHost(filt_data1));
CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc1));
CUDA_CALL(cudaFreeHost(filt_data2));
CUDNN_CALL(cudnnDestroyFilterDescriptor(filt_desc2));
CUDA_CALL(cudaFreeHost(in_data1));
CUDA_CALL(cudaFreeHost(in_data2));
CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc1));
CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc2));
CUDNN_CALL(cudnnDestroy(cudnn1));
CUDNN_CALL(cudnnDestroy(cudnn2));
return 0;
}

Hi,

Could you please share the NVIDIA profiler output as well so we can help better?
Also, can you provide details on the platform you are using (a small snippet for printing the CUDA/cuDNN versions is sketched after this list):
o Linux distro and version
o GPU type
o Nvidia driver version
o CUDA version
o CUDNN version
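
If it is easier, the CUDA and cuDNN versions can also be printed from a tiny standalone program; the driver version itself is easiest to read from nvidia-smi. The snippet below is only a sketch (file and program names are arbitrary) and assumes it is compiled and linked against the same CUDA toolkit and cuDNN used to build the example above, e.g. nvcc check_versions.cpp -lcudnn -o check_versions:

#include <iostream>
#include <cuda_runtime.h>
#include <cudnn.h>

int main() {
  // CUDA version supported by the installed driver, and the runtime version,
  // reported as integers (e.g. 10000 means CUDA 10.0).
  int driver_version = 0, runtime_version = 0;
  cudaDriverGetVersion(&driver_version);
  cudaRuntimeGetVersion(&runtime_version);
  std::cout << "CUDA driver version:  " << driver_version << std::endl;
  std::cout << "CUDA runtime version: " << runtime_version << std::endl;
  // Version of the cudnn.h header this file was compiled against...
  std::cout << "cudnn.h version: " << CUDNN_MAJOR << "."
            << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL << std::endl;
  // ...and version of the libcudnn that is actually loaded at run time.
  std::cout << "cudnnGetVersion(): " << cudnnGetVersion() << std::endl;
  return 0;
}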

Thanks

1. I am using Linux CentOS 7
2. GPU type: Tesla K40c
3. nvcc 10.0.130 as the NVIDIA version
4. CUDA version: 10.0.130
5. cuDNN: I am not sure; I think it is the same as CUDA

Here is the result from the profiler:

==31335== NVPROF is profiling process 31335, command: ./conv
==31335== Warning: Profiling results might be incorrect with current version of nvcc compiler used to compile cuda app. Compile with nvcc compiler 9.0 or later version to get correct profiling results. Ignore this warning if code is already compiled with the recommended nvcc version
==31335== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
==31335== Profiling application: ./conv
==31335== Profiling result:
==31335== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K40c (0)"
Kernel: void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float, float*, int2, int, int)
1 sm_efficiency Multiprocessor Activity 5.26% 5.26% 5.26%
1 achieved_occupancy Achieved Occupancy 0.029510 0.029510 0.029510
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 45.679MB/s 45.679MB/s 45.679MB/s
1 sm_efficiency_instance Multiprocessor Activity 5.26% 5.26% 5.26%
1 dram_read_throughput Device Memory Read Throughput 943.42MB/s 943.42MB/s 943.42MB/s
1 dram_write_throughput Device Memory Write Throughput 3.7289MB/s 3.7289MB/s 3.7289MB/s
1 gst_throughput Global Store Throughput 57.798MB/s 57.798MB/s 57.798MB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 41.99% 41.99% 41.99%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 79.03% 79.03% 79.03%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, float2 const *, int, int)
2 sm_efficiency Multiprocessor Activity 14.42% 17.43% 15.93%
2 achieved_occupancy Achieved Occupancy 0.126549 0.127685 0.127117
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 509.89MB/s 1.0659GB/s 881.10MB/s
2 sm_efficiency_instance Multiprocessor Activity 14.42% 17.43% 15.93%
2 dram_read_throughput Device Memory Read Throughput 82.928MB/s 165.24MB/s 112.70MB/s
2 dram_write_throughput Device Memory Write Throughput 0.00000B/s 5.3502MB/s 3.4151MB/s
2 gst_throughput Global Store Throughput 1.9917GB/s 4.2634GB/s 3.4418GB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 25.00% 25.00% 25.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: dev_iota(float*)
2 sm_efficiency Multiprocessor Activity 17.59% 21.79% 19.69%
2 achieved_occupancy Achieved Occupancy 0.160337 0.170173 0.165255
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 1.9316GB/s 1.9878GB/s 1.9593GB/s
2 sm_efficiency_instance Multiprocessor Activity 17.59% 21.79% 19.69%
2 dram_read_throughput Device Memory Read Throughput 171.56MB/s 186.94MB/s 179.14MB/s
2 dram_write_throughput Device Memory Write Throughput 0.00000B/s 20.184MB/s 10.236MB/s
2 gst_throughput Global Store Throughput 1.9316GB/s 1.9878GB/s 1.9593GB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_r2c_16x16(float2*, float const *, int, int, int, int, int, int, int, int)
2 sm_efficiency Multiprocessor Activity 4.52% 4.76% 4.64%
2 achieved_occupancy Achieved Occupancy 0.124755 0.124826 0.124790
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 887.43MB/s 1.0165GB/s 958.04MB/s
2 sm_efficiency_instance Multiprocessor Activity 4.52% 4.76% 4.64%
2 dram_read_throughput Device Memory Read Throughput 83.125MB/s 529.99MB/s 324.34MB/s
2 dram_write_throughput Device Memory Write Throughput 0.00000B/s 3.6141MB/s 1.6632MB/s
2 gst_throughput Global Store Throughput 887.43MB/s 1.0165GB/s 958.04MB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 25.00% 25.00% 25.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_r2c_32x32<float, bool=0, unsigned int=5, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1 sm_efficiency Multiprocessor Activity 4.07% 4.07% 4.07%
1 achieved_occupancy Achieved Occupancy 0.248071 0.248071 0.248071
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 3.6647GB/s 3.6647GB/s 3.6647GB/s
1 sm_efficiency_instance Multiprocessor Activity 4.07% 4.07% 4.07%
1 dram_read_throughput Device Memory Read Throughput 334.56MB/s 334.56MB/s 334.56MB/s
1 dram_write_throughput Device Memory Write Throughput 6.8982MB/s 6.8982MB/s 6.8981MB/s
1 gst_throughput Global Store Throughput 3.6647GB/s 3.6647GB/s 3.6647GB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 50.00% 50.00% 50.00%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fermiPlusCgemmLDS128_batched<bool=1, bool=0, bool=0, bool=0, int=4, int=4, int=4, int=3, int=3, bool=1, bool=0>(float2* const *, float2 const *, float2 const *, float2, float2 const *, float2 const *, int, int, int, int, int, int, __int64, __int64, __int64, float2 const *, float2 const *, float2, float2, int)
2 sm_efficiency Multiprocessor Activity 88.19% 93.36% 90.78%
2 achieved_occupancy Achieved Occupancy 0.273947 0.298514 0.286230
2 gld_requested_throughput Requested Global Load Throughput 135.05MB/s 181.35MB/s 169.21MB/s
2 gst_requested_throughput Requested Global Store Throughput 22.508MB/s 30.224MB/s 28.201MB/s
2 sm_efficiency_instance Multiprocessor Activity 88.19% 93.36% 90.78%
2 dram_read_throughput Device Memory Read Throughput 78.227MB/s 151.31MB/s 97.391MB/s
2 dram_write_throughput Device Memory Write Throughput 3.7514MB/s 46.892MB/s 35.579MB/s
2 gst_throughput Global Store Throughput 90.033MB/s 120.90MB/s 112.80MB/s
2 gld_throughput Global Load Throughput 540.20MB/s 725.38MB/s 676.82MB/s
2 shared_efficiency Shared Memory Efficiency 90.00% 90.00% 90.00%
2 gld_efficiency Global Memory Load Efficiency 25.00% 25.00% 25.00%
2 gst_efficiency Global Memory Store Efficiency 25.00% 25.00% 25.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int)
1 sm_efficiency Multiprocessor Activity 5.05% 5.05% 5.05%
1 achieved_occupancy Achieved Occupancy 0.249238 0.249238 0.249238
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 2.1110GB/s 2.1110GB/s 2.1110GB/s
1 sm_efficiency_instance Multiprocessor Activity 5.05% 5.05% 5.05%
1 dram_read_throughput Device Memory Read Throughput 508.63MB/s 508.63MB/s 508.63MB/s
1 dram_write_throughput Device Memory Write Throughput 1.9868MB/s 1.9868MB/s 1.9868MB/s
1 gst_throughput Global Store Throughput 2.1110GB/s 2.1110GB/s 2.1110GB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 42.27% 42.27% 42.27%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: void fft2d_c2r_16x16<float, bool=0>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*)
1 sm_efficiency Multiprocessor Activity 5.11% 5.11% 5.11%
1 achieved_occupancy Achieved Occupancy 0.023291 0.023291 0.023291
1 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 gst_requested_throughput Requested Global Store Throughput 44.484MB/s 44.484MB/s 44.484MB/s
1 sm_efficiency_instance Multiprocessor Activity 5.11% 5.11% 5.11%
1 dram_read_throughput Device Memory Read Throughput 352.24MB/s 352.24MB/s 352.24MB/s
1 dram_write_throughput Device Memory Write Throughput 7.2626MB/s 7.2626MB/s 7.2626MB/s
1 gst_throughput Global Store Throughput 58.101MB/s 58.101MB/s 58.101MB/s
1 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
1 shared_efficiency Shared Memory Efficiency 27.49% 27.49% 27.49%
1 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
1 gst_efficiency Global Memory Store Efficiency 76.56% 76.56% 76.56%
1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)
Kernel: dev_const(float*, float)
2 sm_efficiency Multiprocessor Activity 11.57% 19.58% 15.57%
2 achieved_occupancy Achieved Occupancy 0.015620 0.025804 0.020712
2 gld_requested_throughput Requested Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 gst_requested_throughput Requested Global Store Throughput 120.04MB/s 331.14MB/s 225.96MB/s
2 sm_efficiency_instance Multiprocessor Activity 11.57% 19.58% 15.57%
2 dram_read_throughput Device Memory Read Throughput 166.73MB/s 264.91MB/s 215.99MB/s
2 dram_write_throughput Device Memory Write Throughput 13.245MB/s 13.338MB/s 13.291MB/s
2 gst_throughput Global Store Throughput 120.04MB/s 331.14MB/s 225.96MB/s
2 gld_throughput Global Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s
2 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00%
2 gld_efficiency Global Memory Load Efficiency 0.00% 0.00% 0.00%
2 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
2 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1)

Could you please verify the issue with the latest cudnn-8.0.0 Preview and/or cudnn-7.6.5 GA?
Let us know in case the issue persists.

Thanks