```
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
using namespace std;
__global__ void Cal(float* Input, float* Output, int xx, int yy, int zz)
{
int X = threadIdx.x + blockDim.x * blockIdx.x;
int Y = threadIdx.y + blockDim.y * blockIdx.y;
int Z = threadIdx.z + blockDim.z * blockIdx.z;
if (X < xx && Y < yy && Z< zz)
{
Output[X + Y * xx + Z* xx*yy] = Input[X + Y * xx + Z * xx * yy];
}
}
int main()
{
int z = 2;
int y = 3;
int x = 4;
int num = x * y * z;
float* h_input = (float*)malloc(sizeof(float) * num);
float* h_output = (float*)malloc(sizeof(float) * num);
float* d_input;
float* d_output;
cudaMalloc((void**)&d_input, sizeof(float) * num);
cudaMalloc((void**)&d_output, sizeof(float) * num);
for (int i = 0; i < num; i++)
{
h_input[i] = i + 1;
}
for (int i = 0; i < z; i++)
{
for (int j = 0; j < y; j++)
{
for (int k = 0; k < x; k++)
{
cout << h_input[i * y * x + j * x + k] << " ";
}
cout << endl;
}
cout << endl << endl;;
}
cudaMemcpy(d_input, h_input, sizeof(float) * num, cudaMemcpyHostToDevice);
Cal << <(6, 6, 6), (512, 512, 512) >> > (d_input, d_output, x, y, z);
cudaMemcpy(h_output, d_output, sizeof(float) * num, cudaMemcpyDeviceToHost);
for (int i = 0; i < z; i++)
{
for (int j = 0; j < y; j++)
{
for (int k = 0; k < x; k++)
{
cout <<h_output[i * y * x + j * x + k] << " ";
}
cout << endl;
}
cout << endl << endl;;
}
return 0;
}
```

I used this code to realise 3 dimension calculation, but failed. the result is:

original data:

1 2 3 4

5 6 7 8

9 10 11 12

13 14 15 16

17 18 19 20

21 22 23 24

result:

1 2 3 4

0 0 0 0

0 0 0 0

0 0 0 0

0 0 0 0

0 0 0 0

It can be seen that only the first row of the first block has the right answer. So can you give me an example how to solve this problem? thanks!

Besides, I want to figure out how to do high-dimensional calculation in CUDA, is there any useful tutorials online? Last time an advisor of yours recommended one website but i found it can not be openned.