Hi, I have recently started using cuDNN and am comparing its results with my simple CPU implementation.
I found that the results do not match when I run the backward activation (tanh), and I could not figure out the reason.
Here is the output of my code (attached below).
CPU 2.44045
GPU 2.43986
CPU -1.16093
GPU -1.16092
CPU -0.0392796
GPU -0.0391668
CPU -1.94606
GPU -1.94574
My code looks like this. I wonder whether the offset is expected. I may have made some stupid mistake, since I am really new to this. Thank you in advance.
#include <cuda_runtime.h>
#include <cudnn.h>
#include <assert.h>
#include <cmath>
#include <cstdlib>
#include <iostream>
using namespace std;
int main() {
float x[4] = {0}; //doesn’t matter here
float y[4] = {-0.137502,-0.0577537,-0.254379,-0.125099};
float dx[4] = {0};
float dy[4] = {2.48688,-1.16481,-0.0418766,-1.97667};
cudnnHandle_t cudnnHandle;
cudnnCreate(&cudnnHandle);
float alpha=1,beta=0;
float *dev_x, *dev_y, *dev_dx, *dev_dy;
cudaMalloc((void **)&dev_x,sizeof(float) * 4);
cudaMalloc((void **)&dev_dx,sizeof(float) * 4);
cudaMalloc((void **)&dev_y,sizeof(float) * 4);
cudaMalloc((void **)&dev_dy,sizeof(float) * 4);
cudaMemcpy(dev_x, x, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dx, dx, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_y, y, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dy, dy, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudnnActivationDescriptor_t actDesc;
cudnnCreateActivationDescriptor(&actDesc);
cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_TANH,
CUDNN_PROPAGATE_NAN, 0.0);
cudnnTensorDescriptor_t yDesc;
cudnnCreateTensorDescriptor(&yDesc);
cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 2, 1,
1, 2);
auto error = cudnnActivationBackward(cudnnHandle, actDesc, &alpha, yDesc,
dev_y, yDesc, dev_dy, yDesc,
dev_x, &beta, yDesc, dev_dx);
assert(CUDNN_STATUS_SUCCESS == error);
cudaMemcpy( x, dev_x,sizeof(float) * 4, cudaMemcpyDeviceToHost);
cudaMemcpy( y, dev_y,sizeof(float) * 4, cudaMemcpyDeviceToHost);
cudaMemcpy( dx, dev_dx,sizeof(float) * 4, cudaMemcpyDeviceToHost);
cudaMemcpy( dy, dev_dy,sizeof(float) * 4, cudaMemcpyDeviceToHost);
for(int i=0;i<4;++i){
cout<<"CPU " <<(1-pow(tanh(y[i]),2))*dy[i]<<"\n";
cout<<"GPU " <<dx[i]<<endl;
}
return 0;
}