How to realise a 3-dimensional calculation?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>

using namespace std;

__global__ void Cal(float* Input, float* Output, int xx, int yy, int zz)
{
	int X = threadIdx.x + blockDim.x * blockIdx.x;
	int Y = threadIdx.y + blockDim.y * blockIdx.y;
	int Z = threadIdx.z + blockDim.z * blockIdx.z;

	if (X < xx && Y < yy & Z< zz)
	{
		Output[X + Y * xx + Z* xx*yy] = Input[X + Y * xx + Z * xx * yy];
	}
}

int main()
{
	int z = 2;
	int y = 3;
	int x = 4;

	int num = x * y * z;

	float* h_input = (float*)malloc(sizeof(float) * num);
	float* h_output = (float*)malloc(sizeof(float) * num);

	float* d_input;
	float* d_output;
	cudaMalloc((void**)&d_input, sizeof(float) * num);
	cudaMalloc((void**)&d_output, sizeof(float) * num);
	for (int i = 0; i < num; i++)
	{
		h_input[i] = i + 1;
	}
	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout << h_input[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	cudaMemcpy(d_input, h_input, sizeof(float) * num, cudaMemcpyHostToDevice);

	Cal << <(6, 6, 6), (512, 512, 512) >> > (d_input, d_output, x, y, z);

	cudaMemcpy(h_output, d_output, sizeof(float) * num, cudaMemcpyDeviceToHost);

	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout <<h_output[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	return 0;
}

I used this code to realise a 3-dimensional calculation, but it failed. The result is:
original data:
1 2 3 4
5 6 7 8
9 10 11 12

13 14 15 16
17 18 19 20
21 22 23 24

result:
1 2 3 4
0 0 0 0
0 0 0 0

0 0 0 0
0 0 0 0
0 0 0 0
It can be seen that only the first row of the first block has the right answer. So can you give me an example of how to solve this problem? Thanks!
Besides, I want to figure out how to do higher-dimensional calculations in CUDA. Are there any useful tutorials online? Last time one of your advisors recommended a website, but I found it could not be opened.

You are missing an & in the kernel's if condition.

if (X < xx && Y < yy & Z< zz) should be
if (X < xx && Y < yy && Z< zz)


You are observant, but that does not help the result. I have fixed it.

Here is the result after fixing that problem:
1 2 3 4
5 6 7 8
9 10 11 12

13 14 15 16
17 18 19 20
21 22 23 24

1 2 3 4
0 0 0 0
0 0 0 0

0 0 0 0
0 0 0 0
0 0 0 0
Still the same.

I know the indexing I wrote was wrong, because it did not locate every thread. But I don't know what to do to make it right.

When I try to compile this code, I see many compiler warnings: warning #174-D: expression has no effect.
Then, when I print the block dimensions and grid dimensions within the kernel, the block dimension is (512,1,1) and the grid dimension is (6,1,1), which I assume is not what you intended.
Blocks of size 512x512x512 are not allowed anyway.
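
The reason is that, without dim3, the parentheses in the launch configuration are just the C++ comma operator. A minimal illustration (my own snippet, not from the original post):

#include <iostream>

int main()
{
	// (6, 6, 6) is not a dim3 here: the comma operator evaluates each operand
	// and yields only the last one, which is why the compiler warns that the
	// expression has no effect.
	int g = (6, 6, 6);        // g == 6
	int b = (512, 512, 512);  // b == 512
	std::cout << g << " " << b << std::endl;  // prints: 6 512
	// So Cal << <(6, 6, 6), (512, 512, 512) >> > is effectively Cal<<<6, 512>>>,
	// i.e. gridDim (6,1,1) and blockDim (512,1,1).
	return 0;
}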

If you properly pass grid and block dimensions using dim3, and use blocks of size 6x6x6, the code works fine for me.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>

using namespace std;

__global__ void Cal(float* Input, float* Output, int xx, int yy, int zz)
{
	int X = threadIdx.x + blockDim.x * blockIdx.x;
	int Y = threadIdx.y + blockDim.y * blockIdx.y;
	int Z = threadIdx.z + blockDim.z * blockIdx.z;

    // if(threadIdx.x + threadIdx.y + threadIdx.z + blockIdx.x + blockIdx.y + blockIdx.z == 0){
    //     printf("%d %d %d, %d %d %d\n", blockDim.x, blockDim.y, blockDim.z, gridDim.x, gridDim.y, gridDim.z);
    // }

	if (X < xx && Y < yy && Z< zz)
	{
		Output[X + Y * xx + Z* xx*yy] = Input[X + Y * xx + Z * xx * yy];
	}
}

int main()
{
	int z = 2;
	int y = 3;
	int x = 4;

	int num = x * y * z;

	float* h_input = (float*)malloc(sizeof(float) * num);
	float* h_output = (float*)malloc(sizeof(float) * num);

	float* d_input;
	float* d_output;
	cudaMalloc((void**)&d_input, sizeof(float) * num);
	cudaMalloc((void**)&d_output, sizeof(float) * num);
	for (int i = 0; i < num; i++)
	{
		h_input[i] = i + 1;
	}
	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout << h_input[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	cudaMemcpy(d_input, h_input, sizeof(float) * num, cudaMemcpyHostToDevice);

	//Cal << <(6, 6, 6), (512, 512, 512) >> > (d_input, d_output, x, y, z);
    Cal <<<dim3(512, 512, 512), dim3(6, 6, 6)>>> (d_input, d_output, x, y, z);

	cudaMemcpy(h_output, d_output, sizeof(float) * num, cudaMemcpyDeviceToHost);

	for (int i = 0; i < z; i++)
	{
		for (int j = 0; j < y; j++)
		{
			for (int k = 0; k < x; k++)
			{
				cout <<h_output[i * y * x + j * x + k] << " ";
			}
			cout << endl;
		}
		cout << endl << endl;
	}

	return 0;
}
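
As a side note (a sketch of my own, not needed for this small example): for a general x*y*z problem you would usually pick a small 3D block and derive the grid size from the problem size, so that every element is covered by at least one thread. Using the same x, y, z, d_input and d_output as above:

	// Sketch: cover an x*y*z volume with 8x8x4 blocks (512 threads per block).
	dim3 block(8, 8, 4);
	dim3 grid((x + block.x - 1) / block.x,
	          (y + block.y - 1) / block.y,
	          (z + block.z - 1) / block.z);
	Cal<<<grid, block>>>(d_input, d_output, x, y, z);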

I know what was wrong. I guess it was the grid size and block size I chose that kept my code from getting the right results.
Cal <<<dim3(512, 512, 512), dim3(6, 6, 6)>>> (d_input, d_output, x, y, z);
I want to know why a block size of (512,512,512) is not allowed. Is it because it exceeds the actual number of device threads in my NVIDIA GPU? I have watched some tutorials online, and they explained that the largest possible block size was up to (1024,1024,1024).
So:
Can you tell me exactly where I went wrong? Many thanks!

See Table 15 in the CUDA C++ Programming Guide.

For thread blocks, the maximum allowed x-dimension is 1024, maximum y-dimension is 1024, and maximum z-dimension is 64.
Additionally, the maximum allowed total number of threads per block is 1024.

dim3(512,512,512) has more than 64 threads in the z-dimension, and 512x512x512 > 1024, so it does not work.

On the other hand, dim3(64,8,2) would be a valid partitioning with a total of 1024 threads.
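
If you want to check the limits of your own GPU at runtime instead of looking them up in the table, here is a minimal sketch (my addition, using the standard cudaGetDeviceProperties call on device 0):

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);  // query device 0
	printf("maxThreadsPerBlock: %d\n", prop.maxThreadsPerBlock);  // typically 1024
	printf("maxThreadsDim:      %d %d %d\n",
	       prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);  // typically 1024 1024 64
	printf("maxGridSize:        %d %d %d\n",
	       prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
	return 0;
}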


Thank you for your help. Now I have a better understanding of CUDA programming. Really, thank you!
