Tanh Activation Backward Precision?

Hi, I am recently using cudnn and comparing results with my simple CPU implementation.
I found that the results mismatch when I do the backward activation (tanh), and I could not figure out the reason.

Here is the output from my code (attached below).
CPU 2.44045
GPU 2.43986
CPU -1.16093
GPU -1.16092
CPU -0.0392796
GPU -0.0391668
CPU -1.94606
GPU -1.94574

My code looks like this. I wonder if the offset is expected. I may have made some stupid mistake, since I am really new to this. Thank you in advance.
#include <cuda_runtime.h>
#include <cudnn.h>

#include <assert.h>
#include <cmath>
#include <iostream>

using namespace std;

int main() {
float x[4] = {0}; //doesn’t matter here
float y[4] = {-0.137502,-0.0577537,-0.254379,-0.125099};
float dx[4] = {0};
float dy[4] = {2.48688,-1.16481,-0.0418766,-1.97667};

cudnnHandle_t cudnnHandle;
cudnnCreate(&cudnnHandle);
float alpha=1,beta=0;
float *dev_x, *dev_y, *dev_dx, *dev_dy;
cudaMalloc((void **)&dev_x,sizeof(float) * 4);
cudaMalloc((void **)&dev_dx,sizeof(float) * 4);
cudaMalloc((void **)&dev_y,sizeof(float) * 4);
cudaMalloc((void **)&dev_dy,sizeof(float) * 4);
cudaMemcpy(dev_x, x, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dx, dx, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_y, y, sizeof(float) * 4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dy, dy, sizeof(float) * 4, cudaMemcpyHostToDevice);

cudnnActivationDescriptor_t actDesc;
cudnnCreateActivationDescriptor(&actDesc);
cudnnSetActivationDescriptor(actDesc, CUDNN_ACTIVATION_TANH,
                             CUDNN_PROPAGATE_NAN, 0.0);
cudnnTensorDescriptor_t yDesc;
cudnnCreateTensorDescriptor(&yDesc);
cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 2, 1,
                           1, 2);

auto error = cudnnActivationBackward(cudnnHandle, actDesc, &alpha, yDesc,
                                     dev_y, yDesc, dev_dy, yDesc,
                                     dev_x, &beta, yDesc, dev_dx);
assert(CUDNN_STATUS_SUCCESS == error);

cudaMemcpy( x, dev_x,sizeof(float) * 4, cudaMemcpyDeviceToHost);
cudaMemcpy( y, dev_y,sizeof(float) * 4, cudaMemcpyDeviceToHost);
cudaMemcpy( dx, dev_dx,sizeof(float) * 4, cudaMemcpyDeviceToHost);
cudaMemcpy( dy, dev_dy,sizeof(float) * 4, cudaMemcpyDeviceToHost);

for(int i=0;i<4;++i){
    cout<<"CPU " <<(1-pow(tanh(y[i]),2))*dy[i]<<"\n";
    cout<<"GPU " <<dx[i]<<endl;
}

return 0;

}

The differences between the CPU and GPU results are:
0.00059
0.00001
0.0001128
0.00032

This minor variation is expected.

Thanks

1 Like