The following experimental program is supposed to print block and grid dimensions only once.
Once the variables isBlockDimPrinted
and isGridDimPrinted
set to true
by one thread, the other threads should not have found them set to false
.
If each thread doesn’t execute its own copy of the kernel code, then why are threads not following the if-else condition?
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdio.h>
__device__ bool isBlockDimPrinted = false;
__device__ bool isGridDimPrinted = false;
__global__ void MatMulKernel()
{
if(!isBlockDimPrinted){
printf("Block Dim (%d, %d, %d), ", blockDim.x, blockDim.y, blockDim.z);
//printf("\n");
isBlockDimPrinted = true;
}
if(!isGridDimPrinted){
printf("Grid Dim (%d, %d, %d), ", gridDim.x, gridDim.y, gridDim.z);
//printf("\n");
isGridDimPrinted = true;
}
printf("BlockIdx (%d, %d, %d)\t", blockIdx.x, blockIdx.y, blockIdx.z);
//printf("\n");
printf("threadIdx (%d, %d, %d)\t", threadIdx.x, threadIdx.y, threadIdx.z);
//printf("\n");
}
int main()
{
dim3 dimBlock(1,2,3);
dim3 dimGrid(3, 2, 1);
MatMulKernel<<<dimGrid, dimBlock>>>();
cudaDeviceSynchronize();
return 0;
}
However, the output is unexpected as the blockDim
and gridDim
are being printed multiple times:
user_name@192:~/CUDA_lab$ nano thread_id_test_matrix.cu
user_name@192:~/CUDA_lab$ nvcc thread_id_test_matrix.cu -o exe
user_name@192:~/CUDA_lab$ ./exe
Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Block Dim (1, 2, 3), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), Grid Dim (3, 2, 1), BlockIdx (1, 1, 0) BlockIdx (1, 1, 0) BlockIdx (1, 1, 0) BlockIdx (1, 1, 0) BlockIdx (1, 1, 0) BlockIdx (1, 1, 0) BlockIdx (2, 0, 0) BlockIdx (2, 0, 0) BlockIdx (2, 0, 0) BlockIdx (2, 0, 0) BlockIdx (2, 0, 0) BlockIdx (2, 0, 0) BlockIdx (0, 0, 0) BlockIdx (0, 0, 0) BlockIdx (0, 0, 0) BlockIdx (0, 0, 0) BlockIdx (0, 0, 0) BlockIdx (0, 0, 0) BlockIdx (1, 0, 0) BlockIdx (1, 0, 0) BlockIdx (1, 0, 0) BlockIdx (1, 0, 0) BlockIdx (1, 0, 0) BlockIdx (1, 0, 0) BlockIdx (0, 1, 0) BlockIdx (0, 1, 0) BlockIdx (0, 1, 0) BlockIdx (0, 1, 0) BlockIdx (0, 1, 0) BlockIdx (0, 1, 0) BlockIdx (2, 1, 0) BlockIdx (2, 1, 0) BlockIdx (2, 1, 0) BlockIdx (2, 1, 0) BlockIdx (2, 1, 0) BlockIdx (2, 1, 0) threadIdx (0, 0, 0) threadIdx (0, 1, 0) threadIdx (0, 0, 1) threadIdx (0, 1, 1) threadIdx (0, 0, 2) threadIdx (0, 1, 2) threadIdx (0, 0, 0) threadIdx (0, 1, 0) threadIdx (0, 0, 1) threadIdx (0, 1, 1) threadIdx (0, 0, 2) threadIdx (0, 1, 2) threadIdx (0, 0, 0) threadIdx (0, 1, 0) threadIdx (0, 0, 1) threadIdx (0, 1, 1) threadIdx (0, 0, 2) threadIdx (0, 1, 2) threadIdx (0, 0, 0) threadIdx (0, 1, 0) threadIdx (0, 0, 1) threadIdx (0, 1, 1) threadIdx (0, 0, 2) threadIdx (0, 1, 2) threadIdx (0, 0, 0) threadIdx (0, 1, 0) threadIdx (0, 0, 1) threadIdx (0, 1, 1) threadIdx (0, 0, 2) threadIdx (0, 1, 2) threadIdx (0, 0, 0) threadIdx (0, 1, 0) threadIdx (0, 0, 1) threadIdx (0, 1, 1) threadIdx (0, 0, 2) threadIdx (0, 1, 2)
user_name@192:~/CUDA_lab$
Why is that happening?
How can I fix this?