I am trying to implement convolution backward for data with the NHWC data format, but I get a CUDNN_STATUS_NOT_SUPPORTED error. If I use the NCHW data format instead, the program runs normally. How can I solve this problem?
The following code:
#include <cuda.h>
#include <cudnn.h>
#include <stdio.h>
#include <sys/time.h>
#include "def.h"
void cudnn_convBwdData(convMess cm)
{
fprintf(stdout, "\033[32mCUDNN Convolution Backward for Data ......\033[0m\n");
int i, j, k;
int N, C, inH, inW;
int K, fltH, fltW;
int outH, outW;
DataType *h_indiff, *h_flt, *h_outdiff;
DataType *d_indiff, *d_flt, *d_outdiff;
DataType alpha, beta;
struct timeval stime, etime;
double gflops, time_ms;
N = cm->N_;
C = cm->C_;
inH = cm->inH_;
inW = cm->inW_;
K = cm->K_;
fltH = cm->fltH_;
fltW = cm->fltW_;
outH = cm->outH_;
outW = cm->outW_;
alpha = cm->alpha_;
beta = cm->beta_;
h_flt = cm->flt_;
h_outdiff = cm->outdiff_;
h_indiff = cm->indiff_;
int indim[] = {N, inH, inW, C};
int indimInv[] = {inH*inW*C, inW*C, C, 1};
int insize = N*inH*inW*C;
int fltdim[] = {K, fltH, fltW, C};
int fltdimInv[] = {fltH*fltW*C, fltW*C, C, 1};
int fltsize = K*fltH*fltW*C;
int outdim[] = {N, outH, outW, K};
int outdimInv[] = {outH*outW*K, outW*K, K, 1};
int outsize = N*outH*outW*K;
int pad[] = {cm->padh_, cm->padw_};
int stride[] = {cm->strideh_, cm->stridew_};
int upscaleA[] = {1, 1};
cudnnHandle_t handle;
cudnnTensorDescriptor_t inDesc, outDesc;
cudnnFilterDescriptor_t fltDesc;
cudnnConvolutionDescriptor_t convDesc;
cudnnConvolutionBwdDataAlgo_t bwdDataAlgo;
#ifdef ROTATE
cudnnConvolutionMode_t convMode = CUDNN_CONVOLUTION;
#endif
#ifdef NOROTATE
cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION;
#endif
size_t free_byte;
size_t total_byte;
size_t bwdData_sizeInBytes_;
void *workSpace;
cudaEvent_t scuda, ecuda;
cudnnCreate(&handle);
CUDA_CHECK(cudaMalloc((void **)&d_flt, fltsize*sizeof(DataType)));
CUDA_CHECK(cudaMemcpy(d_flt, h_flt, fltsize*sizeof(DataType), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc((void **)&d_outdiff, outsize*sizeof(DataType)));
CUDA_CHECK(cudaMemcpy(d_outdiff, h_outdiff, outsize*sizeof(DataType), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc((void **)&d_indiff, insize*sizeof(DataType)));
CUDA_CHECK(cudaMemcpy(d_indiff, h_indiff, insize*sizeof(DataType), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaEventCreate(&scuda));
CUDA_CHECK(cudaEventCreate(&ecuda));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&inDesc)); // input descriptor
CUDNN_CHECK(cudnnSetTensor4dDescriptor(inDesc, CUDNN_TENSOR_NHWC, CUDNN_DataType,
N, C, inH, inW));
CUDNN_CHECK(cudnnCreateTensorDescriptor(&outDesc)); // output descriptor
CUDNN_CHECK(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NHWC, CUDNN_DataType,
N, K, outH, outW));
CUDNN_CHECK(cudnnCreateFilterDescriptor(&fltDesc)); // filter descriptor
CUDNN_CHECK(cudnnSetFilter4dDescriptor(fltDesc, CUDNN_DataType, CUDNN_TENSOR_NHWC,
K, C, fltH, fltW));
CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&convDesc)); // convolution descriptor
//CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(convDesc, 2, pad, stride,
// upscaleA, convMode, CUDNN_DataType));
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(convDesc, pad[0], pad[1], stride[0], stride[1],
upscaleA[0], upscaleA[1], convMode, CUDNN_DataType));
CUDA_CHECK(cudaMemGetInfo(&free_byte, &total_byte));
// get convolution backward data algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle, fltDesc, outDesc,
convDesc, inDesc,
//CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE,
//CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
free_byte, &bwdDataAlgo));
// get convolution backward data workspace size
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle, fltDesc, outDesc,
convDesc, inDesc, bwdDataAlgo, &bwdData_sizeInBytes_));
if(bwdData_sizeInBytes_ > free_byte){ // fall back if the workspace does not fit in free device memory
bwdDataAlgo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle, fltDesc, outDesc,
convDesc, inDesc, bwdDataAlgo, &bwdData_sizeInBytes_));
}
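// Allocate the device workspace required by the selected backward-data algorithm.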
CUDA_CHECK(cudaMalloc((void **)&workSpace, bwdData_sizeInBytes_));
CUDNN_CHECK(cudnnConvolutionBackwardData(handle, &alpha, // warm-up run
fltDesc, d_flt, outDesc, d_outdiff,
convDesc, bwdDataAlgo, workSpace, bwdData_sizeInBytes_, &beta,
inDesc, d_indiff));
#ifdef PERF
gettimeofday(&stime, NULL);
CUDA_CHECK(cudaEventRecord(scuda, 0));
for(j = 0; j < CYCLE; j++){
CUDNN_CHECK(cudnnConvolutionBackwardData(handle, &alpha,
fltDesc, d_flt, outDesc, d_outdiff,
convDesc, bwdDataAlgo, workSpace, bwdData_sizeInBytes_, &beta,
inDesc, d_indiff));
}
CUDA_CHECK(cudaEventRecord(ecuda, 0));
CUDA_CHECK(cudaEventSynchronize(scuda));
CUDA_CHECK(cudaEventSynchronize(ecuda));
gettimeofday(&etime, NULL);
time_ms = TIME_MS(stime, etime)/CYCLE;
/* gflops = (1.0*((2*fltH*fltW-1)*C+(C-1))*outH/1e3*outW*K/1e3*N)/time_ms; */
/* fprintf(stdout, "Elapsed-time: %8.2f ms, Perf %7.2f gflops, %6.2f%% peak!\n", */
/* time_ms, gflops, gflops/1e3/PEAK_PERF*100); */
#endif
CUDA_CHECK(cudaMemcpy(h_indiff, d_indiff, insize*sizeof(DataType), cudaMemcpyDeviceToHost));
// clean environment
GPUFREE(d_flt);
GPUFREE(d_outdiff);
GPUFREE(d_indiff);
GPUFREE(workSpace);
CUDA_CHECK(cudaEventDestroy(scuda));
CUDA_CHECK(cudaEventDestroy(ecuda));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(inDesc));
CUDNN_CHECK(cudnnDestroyTensorDescriptor(outDesc));
CUDNN_CHECK(cudnnDestroyFilterDescriptor(fltDesc));
CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(convDesc));
CUDNN_CHECK(cudnnDestroy(handle));
}
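To narrow down where the NHWC case fails, the backward-data algorithms cuDNN reports for the same descriptors can be listed with cudnnFindConvolutionBackwardDataAlgorithm. The sketch below is only a diagnostic idea, not part of the failing program: the helper name print_bwdData_algos and the requested count of 8 are made up, and the descriptors and the CUDNN_CHECK macro are assumed to be the same ones used above.
/* Diagnostic sketch: list every backward-data algorithm cuDNN reports for
 * the same NHWC descriptors. Assumes handle, fltDesc, outDesc, convDesc and
 * inDesc are set up exactly as in cudnn_convBwdData() above. */
void print_bwdData_algos(cudnnHandle_t handle,
cudnnFilterDescriptor_t fltDesc,
cudnnTensorDescriptor_t outDesc,
cudnnConvolutionDescriptor_t convDesc,
cudnnTensorDescriptor_t inDesc)
{
int requested = 8, returned = 0, i;
cudnnConvolutionBwdDataAlgoPerf_t perf[8];
/* cudnnFindConvolutionBackwardDataAlgorithm benchmarks the algorithms and
 * fills in a per-algorithm status; the call itself can succeed even when
 * some algorithms are not supported for this descriptor combination. */
CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(handle, fltDesc, outDesc,
convDesc, inDesc, requested, &returned, perf));
for(i = 0; i < returned; i++){
fprintf(stdout, "algo %d: status = %s, time = %.3f ms, workspace = %zu bytes\n",
(int)perf[i].algo, cudnnGetErrorString(perf[i].status),
perf[i].time, perf[i].memory);
}
}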