Hello. Below is my Hello World kernel code for CUDA.
/**
 * Debug kernel: every thread prints its own thread index and block index.
 *
 * Launch layout: only the x components of threadIdx and blockIdx are
 * reported, so this is meant for a 1D grid of 1D blocks.
 * Shared memory: none.
 *
 * Device-side printf is a debugging aid. CUDA does not guarantee the order in
 * which lines coming from different blocks appear, because blocks may be
 * scheduled in any order; the printed order must not be read as an execution
 * timeline.
 */
__global__ void _cuHelloWorld()
{
    // threadIdx.x and blockIdx.x are unsigned; convert explicitly so the
    // argument types match the %d conversions in the format string.
    const int threadIndex = static_cast<int>(threadIdx.x);
    const int blockIndex = static_cast<int>(blockIdx.x);

    printf("threadindex : %d, blockindex : %d\r\n", threadIndex, blockIndex);
}
/**
 * Host-side launcher for the _cuHelloWorld kernel.
 *
 * Prints two banner lines on the host, launches _cuHelloWorld with 7 blocks
 * of 7 threads on the default stream, then blocks until the device has
 * finished. Launch and execution failures are reported on stderr; the
 * function still returns normally because its signature has no way to
 * propagate a status to the caller.
 */
void cuHelloWorld()
{
    printf("Hello World by GPU origin function\r\n");
    printf("<<<7,7>>> block 7 thread 7\r\n");

    _cuHelloWorld<<<7,7>>>();

    // A kernel launch yields no status of its own; configuration errors are
    // only visible through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        fprintf(stderr, "_cuHelloWorld launch failed: %s\r\n",
                cudaGetErrorString(launchStatus));
    }

    // CPU waits for CUDA completion. Errors raised while the kernel was
    // running are surfaced by this call, so its result must not be dropped.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (syncStatus != cudaSuccess) {
        fprintf(stderr, "_cuHelloWorld execution failed: %s\r\n",
                cudaGetErrorString(syncStatus));
    }
}
And the output on the console was as follows:
[100%] Built target CUDARunOrderVisualization
ubuntu@tegra-ubuntu:~/Desktop/MyZEDSample/MySample/CUDAPractice/CUDARunOrderVisualization/build$ ./CUDARunOrderVisualization
Hello World by CPU
Hello World by GPU origin function
<<<7,7>>> block 7 thread 7
threadindex : 0, blockindex : 0
threadindex : 1, blockindex : 0
threadindex : 2, blockindex : 0
threadindex : 3, blockindex : 0
threadindex : 4, blockindex : 0
threadindex : 5, blockindex : 0
threadindex : 6, blockindex : 0
threadindex : 0, blockindex : 1
threadindex : 1, blockindex : 1
threadindex : 2, blockindex : 1
threadindex : 3, blockindex : 1
threadindex : 4, blockindex : 1
threadindex : 5, blockindex : 1
threadindex : 6, blockindex : 1
threadindex : 0, blockindex : 4
threadindex : 1, blockindex : 4
threadindex : 2, blockindex : 4
threadindex : 3, blockindex : 4
threadindex : 4, blockindex : 4
threadindex : 5, blockindex : 4
threadindex : 6, blockindex : 4
threadindex : 0, blockindex : 2
threadindex : 1, blockindex : 2
threadindex : 2, blockindex : 2
threadindex : 3, blockindex : 2
threadindex : 4, blockindex : 2
threadindex : 5, blockindex : 2
threadindex : 6, blockindex : 2
threadindex : 0, blockindex : 5
threadindex : 1, blockindex : 5
threadindex : 2, blockindex : 5
threadindex : 3, blockindex : 5
threadindex : 4, blockindex : 5
threadindex : 5, blockindex : 5
threadindex : 6, blockindex : 5
threadindex : 0, blockindex : 6
threadindex : 1, blockindex : 6
threadindex : 2, blockindex : 6
threadindex : 3, blockindex : 6
threadindex : 4, blockindex : 6
threadindex : 5, blockindex : 6
threadindex : 6, blockindex : 6
threadindex : 0, blockindex : 3
threadindex : 1, blockindex : 3
threadindex : 2, blockindex : 3
threadindex : 3, blockindex : 3
threadindex : 4, blockindex : 3
threadindex : 5, blockindex : 3
threadindex : 6, blockindex : 3
As you can see above, the blocks do not appear to be running in parallel; they seem to run in a specific order.
My Environment :
- Jetson TK1
- Ubuntu 14.04 (JetPack 3.1)
- CUDA version 6.5
Please note that I am a complete beginner at CUDA.