#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
using namespace std;
__global__ void Cal(float* Input, float* Output, int xx, int yy, int zz)
{
int X = threadIdx.x + blockDim.x * blockIdx.x;
int Y = threadIdx.y + blockDim.y * blockIdx.y;
int Z = threadIdx.z + blockDim.z * blockIdx.z;
if (X < xx && Y < yy && Z< zz)
{
Output[X + Y * xx + Z* xx*yy] = Input[X + Y * xx + Z * xx * yy];
}
}
int main()
{
int z = 2;
int y = 3;
int x = 4;
int num = x * y * z;
float* h_input = (float*)malloc(sizeof(float) * num);
float* h_output = (float*)malloc(sizeof(float) * num);
float* d_input;
float* d_output;
cudaMalloc((void**)&d_input, sizeof(float) * num);
cudaMalloc((void**)&d_output, sizeof(float) * num);
for (int i = 0; i < num; i++)
{
h_input[i] = i + 1;
}
for (int i = 0; i < z; i++)
{
for (int j = 0; j < y; j++)
{
for (int k = 0; k < x; k++)
{
cout << h_input[i * y * x + j * x + k] << " ";
}
cout << endl;
}
cout << endl << endl;;
}
cudaMemcpy(d_input, h_input, sizeof(float) * num, cudaMemcpyHostToDevice);
Cal << <(6, 6, 6), (512, 512, 512) >> > (d_input, d_output, x, y, z);
cudaMemcpy(h_output, d_output, sizeof(float) * num, cudaMemcpyDeviceToHost);
for (int i = 0; i < z; i++)
{
for (int j = 0; j < y; j++)
{
for (int k = 0; k < x; k++)
{
cout <<h_output[i * y * x + j * x + k] << " ";
}
cout << endl;
}
cout << endl << endl;;
}
return 0;
}
I used this code to realise 3 dimension calculation, but failed. the result is:
original data:
1 2 3 4
5 6 7 8
9 10 11 12
13 14 15 16
17 18 19 20
21 22 23 24
result:
1 2 3 4
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
It can be seen that only the first row of the first block has the right answer. So can you give me an example how to solve this problem? thanks!
Besides, I want to figure out how to do high-dimensional calculation in CUDA, is there any useful tutorials online? Last time an advisor of yours recommended one website but i found it can not be openned.