Difficulty translating a sequential loop into parallel (CUDA) code

I have migrated all of my neural network functions to CUDA, but one operation remains that I cannot parallelize.
Please help. I am trying to translate the following Java code to CUDA, and the problem is computing the output index.

// Rearranges an image stored in HWC order (row, column, channel) into a
// vector of flattened sizeKernel x sizeKernel patches (an im2col-style
// reordering). Output order: patch-major (row of patches, then column of
// patches), then row j / column k inside the patch, then channel c.
// NOTE(review): assumes rows and columns are multiples of sizeKernel — confirm.
int index = 0, indexInput;

// Walk the image patch by patch.
for (int h = 0; h < rows; h += sizeKernel) {
for (int w = 0; w < columns; w += sizeKernel) {
// (j, k) iterate over the pixels inside the current patch.
for (int j = 0; j < sizeKernel; j++) {
for (int k = 0; k < sizeKernel; k++) {
// Offset of source pixel (h + j, w + k), channel 0, in the HWC layout.
indexInput = (h + j) * depth * columns + (w + k) * depth;
// Copy all channels of this pixel. `index` only ever advances by one,
// so the destination position is implicit in the loop nesting order —
// this sequential dependency is what makes the code hard to parallelize.
for (int c = 0; c < depth; c++, index++, indexInput++) {
result[index] = data[indexInput];
}
}
}
}
}

Here is my attempted translation:

“extern "C"\n” +
global void imageVector(const float* restrict A, float* C, int rows, int columns, int depth, int sizeKernel)\n” +
“{\n” +
" const int h = (blockDim.x * blockIdx.x + threadIdx.x) * sizeKernel;\n" +
" const int w = (blockDim.y * blockIdx.y + threadIdx.y) * sizeKernel;\n" +
" const int z = blockDim.z * blockIdx.z + threadIdx.z;\n" +
" if (h < rows && w < columns && z < sizeKernel)\n" +
" {\n" +
" int index = columns * sizeKernel * h + sizeKernel * w + z;\n" +
" for (int k = 0; k < sizeKernel; k++) {\n" +
" int indexInput = (h + z) * depth * columns + (w + k) * depth;\n" +
" for (int c = 0; c < depth; c++, index++, indexInput++) {\n" +
" C[index] = A[indexInput];\n" +
" }\n" +
" }\n" +
" }\n" +
“}\n” +

Problem solved — the output index can be computed directly from the loop variables:

// Direct (closed-form) output index for element (h, w, z, k, c), replacing
// the sequential index++ of the original loop. Strides, innermost-first:
// one channel step is 1, one k step is depth, one patch row (z) is
// sizeKernel*depth, one patch is sizeKernel^2*depth, one row of patches is
// sizeKernel^2*depth*columns/sizeKernel.
int rowStride = sizeKernel * depth;                       // advance z by one
int patchStride = rowStride * sizeKernel;                 // advance one patch
int patchRowStride = patchStride * columns / sizeKernel;  // advance one row of patches
int index = h / sizeKernel * patchRowStride + w / sizeKernel * patchStride + z * rowStride + k * depth + c;