CUDA kernel failed to execute without error report.

Hi everyone,

I am writing my own MD simulation code and I have encountered a problem:

grid.x = angles.size() / 1024;
    if(angles.size() % 1024)grid.x++;
    calcAngleForceCUDA<<<grid,thread>>>(atom_coords_gpu,forces_gpu,angle_param_gpu,angles.size());
    error = cudaDeviceSynchronize();
    if(error != cudaSuccess){
        std::cout<<"angle calc error"<<cudaGetErrorString(error)<<std::endl;
        exit(-1);
    }

and here’s the kernel:

__global__ void calcAngleForceCUDA(float *atom_coords,float *forces_vectors,float *angle_params,int size){
    // params[0]
    // r12 r32 r13  dr12 dr32 dr13


    int index = blockIdx.x * blockDim.x + threadIdx.x;
    //printf("Hello from %d\n",index);
    if(index >= size)return;

    //read in data TODO
    float theta,sinTheta,cosTheta;
    float dpotdtheta;
    float d12,d32,d13;
    float r12[3],r32[3],r13[3],dr12[3],dr32[3],dr13[3];
    r12[0] = atom_coords[(int)angle_params[index * 7] * 3 + 0] - atom_coords[(int)angle_params[index * 7 + 1] * 3 + 0];
    r12[1] = atom_coords[(int)angle_params[index * 7] * 3 + 1] - atom_coords[(int)angle_params[index * 7 + 1] * 3 + 1];
    r12[2] = atom_coords[(int)angle_params[index * 7] * 3 + 2] - atom_coords[(int)angle_params[index * 7 + 1] * 3 + 2];

    r32[0] = atom_coords[(int)angle_params[index * 7 + 2] * 3 + 0] - atom_coords[(int)angle_params[index * 7 + 1] * 3 + 0];
    r32[1] = atom_coords[(int)angle_params[index * 7 + 2] * 3 + 1] - atom_coords[(int)angle_params[index * 7 + 1] * 3 + 1];
    r32[2] = atom_coords[(int)angle_params[index * 7 + 2] * 3 + 2] - atom_coords[(int)angle_params[index * 7 + 1] * 3 + 2];

    r13[0] = atom_coords[(int)angle_params[index * 7] * 3 + 0] - atom_coords[(int)angle_params[index * 7 + 2] * 3 + 0];
    r13[1] = atom_coords[(int)angle_params[index * 7] * 3 + 1] - atom_coords[(int)angle_params[index * 7 + 2] * 3 + 1];
    r13[2] = atom_coords[(int)angle_params[index * 7] * 3 + 2] - atom_coords[(int)angle_params[index * 7 + 2] * 3 + 2];

    float n_x,n_y,n_z;
    float dot12_32 = r12[0] * r32[0] + r12[1] * r32[1] + r12[2] * r32[2];
    n_x = r12[1] * r32[2] - r12[2] * r32[1];
    n_y = r12[2] * r32[0] - r12[0] * r32[2];
    n_z = r12[0] * r32[1] - r12[1] * r32[0];
    theta = atanf(sqrtf(n_x * n_x + n_y * n_y + n_z * n_z) / dot12_32);
    sinTheta = sin(theta);
    cosTheta = cos(theta);

    //Theta0 :
    float KTheta = angle_params[index * 7 + 3];
    float Theta0 = forces_vectors[index * 7 + 4];

    float bradley_constant = angle_params[index * 7 + 5];
    float bradley_rest_lenth = angle_params[index * 7 + 6];

    dpotdtheta = 2.0 * KTheta * (theta - Theta0);
    d12 = sqrtf(r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2]);
    d32 = sqrtf(r32[0] * r32[0] + r32[1] * r32[1] + r32[2] * r32[2]);
    d13 = sqrtf(r13[0] * r13[0] + r13[1] * r13[1] + r13[2] * r13[2]);

    dr12[0] = r12[0] / d12;
    dr12[1] = r12[1] / d12;
    dr12[2] = r12[2] / d12;

    dr13[0] = r13[0] / d13;
    dr13[1] = r13[1] / d13;
    dr13[2] = r13[2] / d13;

    dr32[0] = r32[0] / d32;
    dr32[1] = r32[1] / d32;
    dr32[2] = r32[2] / d32;

    float dtheta1[3] = {
            (dr12[0] * cosTheta - dr32[0]) / (sinTheta * d12) * (-dpotdtheta) - dr13[0] * 2.0 * bradley_constant * (d13 - bradley_rest_lenth),
            (dr12[1] * cosTheta - dr32[1]) / (sinTheta * d12) * (-dpotdtheta) - dr13[1] * 2.0 * bradley_constant * (d13 - bradley_rest_lenth),
            (dr12[2] * cosTheta - dr32[2]) / (sinTheta * d12) * (-dpotdtheta) - dr13[2] * 2.0 * bradley_constant * (d13 - bradley_rest_lenth)
    };
    __threadfence_block();
    float dtheta3[3] = {
            (dr32[0] * cosTheta - dr12[0]) / (sinTheta * d32) * (-dpotdtheta) + dr13[0] * 2.0 * bradley_constant * (d13 - bradley_rest_lenth),
            (dr32[1] * cosTheta - dr12[1]) / (sinTheta * d32) * (-dpotdtheta) + dr13[1] * 2.0 * bradley_constant * (d13 - bradley_rest_lenth),
            (dr32[2] * cosTheta - dr12[2]) / (sinTheta * d32) * (-dpotdtheta) + dr13[2] * 2.0 * bradley_constant * (d13 - bradley_rest_lenth)
    };
    __threadfence_block();

    float dtheta2[3] = {
            -dtheta1[0] - dtheta3[0],
            -dtheta1[1] - dtheta3[1],
            -dtheta1[2] - dtheta3[2]
    };

       forces_vectors[(int)angle_params[index * 7] * 3 + 0] += dtheta1[0];
       forces_vectors[(int)angle_params[index * 7] * 3 + 1] += dtheta1[1];
       forces_vectors[(int)angle_params[index * 7] * 3 + 2] += dtheta1[2];


       forces_vectors[(int)angle_params[index * 7 +1] * 3 + 0] += dtheta2[0];
       forces_vectors[(int)angle_params[index * 7 +1] * 3 + 1] += dtheta2[1];
       forces_vectors[(int)angle_params[index * 7 +1] * 3 + 2] += dtheta2[2];

       forces_vectors[(int)angle_params[index * 7 + 2] * 3 + 0] += dtheta3[0];
       forces_vectors[(int)angle_params[index * 7 + 2] * 3 + 1] += dtheta3[1];
       forces_vectors[(int)angle_params[index * 7 + 2] * 3 + 2] += dtheta3[2];



}

Interestingly, the kernel doesn’t run at all (printf didn’t run at all and the values are not affected) and no errors were reported, the program exited with 0.

The only workaround is to split the last part of assigning values to forces_vectors into two parts. When dtheta1[0],dtheta1[1],dtheta1[2],dtheta2[0],dtheta2[1] is in that part, then we can’t have dtheta2[3] and others. So it was put into two kernels to ensure it works. I looked at the site and found no explanation. Any idea what happened? Thanks.

Hi RedAlice64,

Well, I think you should copy the data from host to device with function cudaMemcpy first before calling the kernel function.

Thanks,

You don’t need to explicitly use cudaMemcpy as long as you use unified memory. Now, I don’t know if @RedAlice64 is using unified memory as this portion isn’t shown but this be why.