#include <cuda_runtime.h>
#include <cudnn.h>
#include <stdio.h>
#include <stdlib.h>
// Reads up to `size` raw floats from the binary file `filename` into `pbuf`.
//
// pbuf     - destination buffer with room for at least `size` floats.
// size     - number of floats requested; nothing is done when size <= 0.
// filename - path of the file to read; it is opened read-only in binary mode.
//
// The function has no return value, so failures are reported on stderr.
// Any element that could not be read (file missing, or file shorter than
// requested) is set to 0.0f so the caller never sees uninitialised memory.
void teReadDataFromDisk(float* pbuf, int size, const char* filename)
{
    if (pbuf == NULL || size <= 0)
    {
        return;
    }

    const size_t wanted = (size_t)size;
    size_t got = 0;
    FILE* pFile = NULL;

    if (filename != NULL)
    {
#if defined(_MSC_VER)
        // MSVC flags plain fopen as deprecated; keep using fopen_s there.
        if (fopen_s(&pFile, filename, "rb") != 0)
        {
            pFile = NULL;
        }
#else
        pFile = fopen(filename, "rb");
#endif
    }

    if (pFile == NULL)
    {
        fprintf(stderr, "teReadDataFromDisk: cannot open '%s'\n",
                filename != NULL ? filename : "(null)");
    }
    else
    {
        got = fread(pbuf, sizeof(float), wanted, pFile);
        fclose(pFile);
        if (got < wanted)
        {
            fprintf(stderr, "teReadDataFromDisk: '%s' holds %lu of %lu requested floats\n",
                    filename, (unsigned long)got, (unsigned long)wanted);
        }
    }

    // Zero whatever was not filled from disk.
    for (size_t i = got; i < wanted; ++i)
    {
        pbuf[i] = 0.0f;
    }
}
void main()
{
const int BB = 1, CC = 128, WW = 16, HH = 48;
const int XSIZE = BB * CC * WW * HH;
const int WSIZE = CC * CC * 9;
cudaError_t cudaerr = cudaSetDevice(0);
cudnnHandle_t handle = nullptr;
cudnnStatus_t cudnnerr = cudnnCreate(&handle);
float fAlpha = 1.0f, fBeta = 0.0f;
cudnnTensorDescriptor_t xDesc, yDesc;
cudnnFilterDescriptor_t wDesc;
cudnnConvolutionDescriptor_t convDesc;
cudnnerr = cudnnCreateTensorDescriptor(&xDesc);
cudnnerr = cudnnCreateTensorDescriptor(&yDesc);
cudnnerr = cudnnCreateFilterDescriptor(&wDesc);
cudnnerr = cudnnCreateConvolutionDescriptor(&convDesc);
cudnnerr = cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, BB, CC, HH, WW);
cudnnerr = cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, BB, CC, HH, WW);
cudnnerr = cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, CC, CC, 3, 3);
cudnnerr = cudnnSetConvolutionGroupCount(convDesc, 1);
//cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
cudnnerr = cudnnSetConvolution2dDescriptor(convDesc, 1, 1, 1, 1, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
//int b, c, w, h;
//cudnnerr = cudnnGetConvolution2dForwardOutputDim(convDesc, xDesc, wDesc, &b, &c, &h, &w);
cudnnConvolutionFwdAlgo_t algo = (cudnnConvolutionFwdAlgo_t)0;
size_t workspace_size = 0
cudnnerr = cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, wDesc, convDesc, yDesc,
(cudnnConvolutionFwdAlgo_t)1, &workspace_size);//1769552
float *x_h = new float[XSIZE];
float *w_h = new float[WSIZE];
float *y_h1 = new float[XSIZE];
float *y_h2 = new float[XSIZE];
float *x_d, *w_d, *y_d, *workspace_d;
cudaerr = cudaMalloc(&x_d, XSIZE*4);
cudaerr = cudaMalloc(&y_d, XSIZE*4);
cudaerr = cudaMalloc(&w_d, WSIZE*4);
cudaerr = cudaMalloc(&workspace_d, workspace_size);
teReadDataFromDisk(x_h, XSIZE, "aaxx");
teReadDataFromDisk(w_h, WSIZE, "aaww");
cudaerr = cudaMemcpy(x_d, x_h, XSIZE * 4, cudaMemcpyHostToDevice);
cudaerr = cudaMemcpy(w_d, w_h, WSIZE * 4, cudaMemcpyHostToDevice);
cudnnerr = cudnnConvolutionForward(handle,
&fAlpha, xDesc, x_d,
wDesc, w_d,
convDesc, algo, workspace_d, workspace_size,
&fBeta, yDesc, y_d);
cudaerr = cudaMemcpy(y_h1, y_d, XSIZE *4, cudaMemcpyDeviceToHost);
for (size_t ii = 0; ii < 20; ii++)
{
printf("%f,", y_h1[ii]);
}
printf("\n\n");
algo = (cudnnConvolutionFwdAlgo_t)1;
cudnnerr = cudnnConvolutionForward(handle,
&fAlpha, xDesc, x_d,
wDesc, w_d,
convDesc, algo, workspace_d, workspace_size,
&fBeta, yDesc, y_d);
cudaerr = cudaMemcpy(y_h2, y_d, XSIZE *4, cudaMemcpyDeviceToHost);
for (size_t ii = 0; ii < 20; ii++)
{
printf("%f,", y_h2[ii]);
}
printf("\n\n");
//destroy descriptors.
cudnnDestroy(handle);
system("Pause");
}
// Attachment (not part of the program): input&&weight.zip (885.7 KB)