In this following code segment, I was trying to exit all the threads from the infinite loop, where only thread 0 of blockId 0 update the *end=32 which is initially get the value from blockDim.x (*end=16). But this program stuck in this infinite loop. I think, therads of blockId 1 are not getting the updated value, so they stuck in the infinite loop. But from the description of CUDA tutorial it is clearly mention that any thing in device memory is visible from all the threads of different blocks.
I am not sure weather there are multiple copies of *end for the two blocks. Or is there any other solution, so that all the threads of different blocks can exit this infinite loop, only by the updating the *end by the therad of only one block of threads.
I was trying to do this program using device , where device qualifier was used for start and end but same thing happening.
Below is the description of my device.
There are 2 devices supporting CUDA
Device 0: “GeForce GTX 560 Ti”
CUDA Driver Version: 4.0
CUDA Capability Major/Minor version number: 2.1
Device 1: “Quadro 600”
CUDA Driver Version: 4.0
CUDA Capability Major/Minor version number: 2.1
But this program ran for folowing the following device and did not stuck in the infinite loop…
Device 0: “GeForce 320M”
CUDA Driver Version: 4.0
CUDA Capability Major/Minor version number: 1.2
Here, is the code. Can anyone please give me a solution.
#define N 32
__global__ void test_program (int *start, int *end, int *first_element, int *last_element){
*start=0;
*end=blockDim.x; //*end get the initial value 16
int idx;
int u_idx;
while(1){
u_idx=blockIdx.x*blockDim.x+threadIdx.x;
//only threads 0 to 15 of blockId 0 can satisfy this condition.
if(u_idx>=*start && u_idx < *end){
idx=threadIdx.x;
//only thread 0 of blockId 0 updates *end to 32
if(idx==0){
*end=32;
}
}
__syncthreads();
//exit condition
if(*end==32){
break;
}
}
}
int main(void)
{
float elapsed_time_ms = 0.0f;
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
int *start_d;
int *end_d;
int *first_element;
int *last_element;
cudaMalloc ((void**)&start_d,sizeof(int));
cudaMalloc ((void**)&end_d,sizeof(int));
cudaMalloc ((void**)&first_element,sizeof(int));
cudaMalloc ((void**)&last_element,sizeof(int));
int nBlocks = 2;
int blockSize=16;
cudaEventRecord( start, 0);
//execution configuration: 2 blocks of threads, each block has 16 threads
test_program <<< nBlocks, blockSize >>> (start_d,end_d,first_element,last_element);
cudaEventRecord( stop, 0);
cudaEventSynchronize( stop );
cudaEventElapsedTime( &elapsed_time_ms, start, stop );
printf("time %f\n",elapsed_time_ms);
cudaFree(start_d);
cudaFree(end_d);
cudaFree(first_element);
cudaFree(last_element);
}