Parallelizing a function with three nested for loops

Hello,
I have to parallelize this function with CUDA, but my attempt does not give the same result as the serial version.
How should I write the kernel?
Greetings.

/*
 * Serial reference implementation.
 *
 * For every offset k = 0, P3, 2*P3, ... below P16*P3, accumulates one float
 * sum over all (i, j) pairs with i in [0, p2) and j in [0, p1), and stores it
 * in P14[k / P3]. Finally prints P14[0].
 *
 * Per pair (i, j):
 *   - the "i" point is (P7[k+i], P8[k+i], P9[k+i]); its asp/vol values come
 *     from P15[P11[i]];
 *   - the "j" point is (P4[j], P5[j], P6[j]); its asp/vol values come from
 *     P15[P10[j]];
 *   - the term added is
 *       (asp_i*vol_j + QASP*|P12[i]|*vol_j + asp_j*vol_i + QASP*|P13[j]|*vol_i)
 *       * exp(-d2 / (2*G_D_2))
 *     where d2 is the squared distance between the two points.
 *
 * QASP and G_D_2 are defined elsewhere in the project.
 *
 * NOTE(review): P3 is assumed to be positive; with P3 <= 0 the outer loop
 * either does not run or does not terminate -- confirm with callers.
 * NOTE(review): P14[0] is printed even when the outer loop ran zero times.
 */
void func (int p1, int p2, int P3, float *P4, float *P5, float *P6, float *P7, float *P8, float *P9, int* P10, int* P11, float *P12 ,float *P13, float *P14, struct struct1 *P15, int P16){
    /* Loop bound computed once instead of on every iteration. */
    const int total = P16 * P3;

    for (int k = 0; k < total; k += P3)
    {
        /* Accumulator for this k; each k starts from zero. */
        float sum = 0;

        for (int i = 0; i < p2; i++) {
            const int idxI = P11[i];
            const float xi = P7[k + i];
            const float yi = P8[k + i];
            const float zi = P9[k + i];
            const float aspI = P15[idxI].asp;
            const float volI = P15[idxI].vol;

            for (int j = 0; j < p1; j++) {
                const int idxJ = P10[j];
                const float aspJ = P15[idxJ].asp;
                const float volJ = P15[idxJ].vol;

                const float dx = (P4[j]) - xi;
                const float dy = (P5[j]) - yi;
                const float dz = (P6[j]) - zi;

                /* Squares are kept in separate float temporaries so the
                 * rounding matches a straightforward float evaluation. */
                const float dx2 = dx * dx;
                const float dy2 = dy * dy;
                const float dz2 = dz * dz;
                const float dist2 = dx2 + dy2 + dz2;

                /* The term is rounded to float before being accumulated.
                 * fabs/exp are intentionally left as-is so the numeric
                 * result is unchanged. */
                const float term = ((aspI * volJ) + (QASP * fabs(P12[i]) *  volJ) + (aspJ * volI) + (QASP * fabs(P13[j]) * volI)) * exp(-dist2/(2*G_D_2));
                sum += term;
            }
        }

        /* k is always a multiple of P3, so k / P3 is the outer iteration number. */
        P14[k/P3] = sum;
    }

    printf("Result: %f\n",P14[0]);
}
#include <cstdio>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Skeleton kernel that maps a 3D launch onto a (k, i, j) index triple.
//
// Each thread derives one global coordinate per launch dimension. Threads whose
// coordinates fall outside sizeX x sizeY x sizeZ exit immediately. For the
// remaining threads:
//   k = (global x) * p3
//   i =  global y
//   j =  global z
// The body that would consume k, i, j is left as a placeholder.
__global__ void kernel(unsigned int sizeX, unsigned int sizeY, unsigned int sizeZ,unsigned int p3) {
  const unsigned int gx = threadIdx.x + blockIdx.x * blockDim.x;
  const unsigned int gy = threadIdx.y + blockIdx.y * blockDim.y;
  const unsigned int gz = threadIdx.z + blockIdx.z * blockDim.z;

  // The grid is rounded up to whole blocks, so surplus threads must bail out.
  const bool inRange = (gx < sizeX) && (gy < sizeY) && (gz < sizeZ);
  if (!inRange) {
    return;
  }

  unsigned int k = gx * p3;
  unsigned int i = gy;
  unsigned int j = gz;
  /* do something using k, i, j */
}

// Host driver: launches `kernel` over a 3D grid covering a
// sizeK x sizeI x sizeJ index space and waits for it to finish.
//
// Returns 0 on success, 1 if the launch or the kernel execution reports a
// CUDA error (the error string is written to stderr).
int main() {
  const unsigned int sizeK = 10; // p16
  const unsigned int sizeI = 10; // p2
  const unsigned int sizeJ = 10; // p1
  const unsigned int    p3 =  2;

  const dim3 block(8,8,4); // 8*8*4 = 256[threads/block]
  // Ceil-divide each extent by the block size so every index is covered;
  // the kernel's bounds check discards the surplus threads.
  const dim3 grid(
         (sizeK + block.x -1)/ block.x,
         (sizeI + block.y -1)/ block.y,
         (sizeJ + block.z -1)/ block.z);

  kernel<<<grid,block>>>(sizeK, sizeI, sizeJ, p3);

  // A launch does not return an error code itself: configuration errors are
  // reported by cudaGetLastError(), execution errors by the next
  // synchronizing call. Synchronizing also keeps the process alive until the
  // kernel has actually finished.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    return 1;
  }
  return 0;
}

Thank you very much!
I’m going to work with it and see if I can get it running.
I’m going to start by adding the cudaMemcpy(…) calls and try it.
Greetings.