How to realise a 3-dimensional calculation?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>

using namespace std;

__global__ void Cal(float* Input, float* Output, int xx, int yy, int zz)
{
	int X = threadIdx.x + blockDim.x * blockIdx.x;
	int Y = threadIdx.y + blockDim.y * blockIdx.y;
	int Z = threadIdx.z + blockDim.z * blockIdx.z;

	if (X < xx && Y < yy & Z< zz)
	{
		Output[X + Y * xx + Z* xx*yy] = Input[X + Y * xx + Z * xx * yy];
	}
}

int main()
{
	int z = 2;
	int y = 3;
	int x = 4;

	int num = x * y * z;

	float* h_input = (float*)malloc(sizeof(float) * num);
	float* h_output = (float*)malloc(sizeof(float) * num);

	float* d_input;
	float* d_output;
	cudaMalloc((void**)&d_input, sizeof(float) * num);
	cudaMalloc((void**)&d_output, sizeof(float) * num);
	for (int i = 0; i < num; i++)
	{
		h_input[i] = i + 1;
	}
	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout << h_input[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	cudaMemcpy(d_input, h_input, sizeof(float) * num, cudaMemcpyHostToDevice);

	Cal << <(6, 6, 6), (512, 512, 512) >> > (d_input, d_output, x, y, z);

	cudaMemcpy(h_output, d_output, sizeof(float) * num, cudaMemcpyDeviceToHost);

	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout <<h_output[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	return 0;
}

I used this code to realise a 3-dimensional calculation, but it failed. The result is:
original data:
1 2 3 4
5 6 7 8
9 10 11 12

13 14 15 16
17 18 19 20
21 22 23 24

result:
1 2 3 4
0 0 0 0
0 0 0 0

0 0 0 0
0 0 0 0
0 0 0 0
It can be seen that only the first row of the first block has the right answer. So can you give me an example of how to solve this problem? Thanks!
Besides, I want to figure out how to do higher-dimensional calculations in CUDA. Are there any useful tutorials online? Last time one of your advisors recommended a website, but I found it could not be opened.

You are missing an & in the kernel's if condition.

if (X < xx && Y < yy & Z< zz) should be
if (X < xx && Y < yy && Z< zz)


You are observant, but that does not help the result. I have fixed it.

Here is the result after fixing that problem:
1 2 3 4
5 6 7 8
9 10 11 12

13 14 15 16
17 18 19 20
21 22 23 24

1 2 3 4
0 0 0 0
0 0 0 0

0 0 0 0
0 0 0 0
0 0 0 0
Still the same.

I know the indexing I wrote was wrong, because it did not locate every thread. But I don't know what to do to make it right.

When I try to compile this code, I see many compiler warnings: warning #174-D: expression has no effect.
Then, when I print the block dimensions and grid dimensions within the kernel, the block dimension is (512,1,1) and the grid dimension is (6,1,1), which I assume is not what you intended.
Blocks of size 512x512x512 are not allowed anyway.
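
The reason is that, without dim3, the parentheses in the launch configuration are just the C++ comma operator. A minimal illustration (my own snippet, not from the original post):

#include <iostream>

int main()
{
	// (6, 6, 6) is not a dim3 here: the comma operator evaluates each operand
	// and yields only the last one, which is why the compiler warns that the
	// expression has no effect.
	int g = (6, 6, 6);        // g == 6
	int b = (512, 512, 512);  // b == 512
	std::cout << g << " " << b << std::endl;  // prints: 6 512
	// So Cal << <(6, 6, 6), (512, 512, 512) >> > is effectively Cal<<<6, 512>>>,
	// i.e. gridDim (6,1,1) and blockDim (512,1,1).
	return 0;
}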

If you properly pass grid and block dimensions using dim3, and use blocks of size 6x6x6, the code works fine for me.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>

using namespace std;

__global__ void Cal(float* Input, float* Output, int xx, int yy, int zz)
{
	int X = threadIdx.x + blockDim.x * blockIdx.x;
	int Y = threadIdx.y + blockDim.y * blockIdx.y;
	int Z = threadIdx.z + blockDim.z * blockIdx.z;

    // if(threadIdx.x + threadIdx.y + threadIdx.z + blockIdx.x + blockIdx.y + blockIdx.z == 0){
    //     printf("%d %d %d, %d %d %d\n", blockDim.x, blockDim.y, blockDim.z, gridDim.x, gridDim.y, gridDim.z);
    // }

	if (X < xx && Y < yy && Z< zz)
	{
		Output[X + Y * xx + Z* xx*yy] = Input[X + Y * xx + Z * xx * yy];
	}
}

int main()
{
	int z = 2;
	int y = 3;
	int x = 4;

	int num = x * y * z;

	float* h_input = (float*)malloc(sizeof(float) * num);
	float* h_output = (float*)malloc(sizeof(float) * num);

	float* d_input;
	float* d_output;
	cudaMalloc((void**)&d_input, sizeof(float) * num);
	cudaMalloc((void**)&d_output, sizeof(float) * num);
	for (int i = 0; i < num; i++)
	{
		h_input[i] = i + 1;
	}
	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout << h_input[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	cudaMemcpy(d_input, h_input, sizeof(float) * num, cudaMemcpyHostToDevice);

	//Cal << <(6, 6, 6), (512, 512, 512) >> > (d_input, d_output, x, y, z);
    Cal <<<dim3(512, 512, 512), dim3(6, 6, 6)>>> (d_input, d_output, x, y, z);

	cudaMemcpy(h_output, d_output, sizeof(float) * num, cudaMemcpyDeviceToHost);

	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout <<h_output[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	return 0;
}
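
As a side note (a sketch of my own, not needed for this small example): for a general x*y*z problem you would usually pick a small 3D block and derive the grid size from the problem size, so that every element is covered by at least one thread. Using the same x, y, z, d_input and d_output as above:

	// Sketch: cover an x*y*z volume with 8x8x4 blocks (512 threads per block).
	dim3 block(8, 8, 4);
	dim3 grid((x + block.x - 1) / block.x,
	          (y + block.y - 1) / block.y,
	          (z + block.z - 1) / block.z);
	Cal<<<grid, block>>>(d_input, d_output, x, y, z);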

I know what was wrong. I guess it was the grid size and block size I chose that kept my code from getting the right results.
Cal <<<dim3(512, 512, 512), dim3(6, 6, 6)>>> (d_input, d_output, x, y, z);
I want to know why a block size of (512,512,512) is not allowed. Is it because it exceeds the actual number of device threads in my NVIDIA GPU? I have watched some tutorials online, and they explained that the largest possible block size was up to (1024,1024,1024).
So:
Can you tell me exactly where I went wrong? Many thanks!

See Table 15 in the CUDA C++ Programming Guide.

For thread blocks, the maximum allowed x-dimension is 1024, maximum y-dimension is 1024, and maximum z-dimension is 64.
Additionally, the maximum allowed total number of threads per block is 1024.

dim3(512,512,512) has more than 64 threads in the z-dimension, and 512x512x512 > 1024, so it does not work.

On the other hand, dim3(64,8,2) would be a valid partitioning with a total of 1024 threads.
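
If you want to check the limits of your own GPU at runtime instead of looking them up in the table, here is a minimal sketch (my addition, using the standard cudaGetDeviceProperties call on device 0):

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);  // query device 0
	printf("maxThreadsPerBlock: %d\n", prop.maxThreadsPerBlock);  // typically 1024
	printf("maxThreadsDim:      %d %d %d\n",
	       prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);  // typically 1024 1024 64
	printf("maxGridSize:        %d %d %d\n",
	       prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
	return 0;
}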


Thank you for your help. Now I have a better understanding of CUDA programming. Really, thank you!
