Hi,
I’m trying to load an int from global memory and use it in a subsequent if
condition. For some reason, the if
condition is being evaluated incorrectly. Here is a small sample where the problem reproduces:
__global__ void DummyKernel(uint32_t* pings_ptr) {
uint32_t pings = 0;
uint32_t threshold = 2;
asm("ld.volatile.global.u32 %0,[%1];" : "=r"(pings) : "l"(pings_ptr));
for (uint32_t count = 0; count < 4; count++)
{
if(threadIdx.x==0) {
while (count > threshold + pings) {
asm("ld.volatile.global.u32 %0,[%1];" : "=r"(pings) : "l"(pings_ptr));
}
printf("Spin done. %u must now be <= %u\n", count, threshold + pings);
}
__syncthreads();
}
}
int main() {
CUDACHECK(cudaSetDevice(0));
cudaStream_t cuda_stream;
CUDACHECK(cudaStreamCreateWithFlags(&cuda_stream, cudaStreamNonBlocking));
uint32_t* pings;
CUDACHECK(cudaMalloc(&pings, sizeof(uint32_t)));
CUDACHECK(cudaMemset(pings, 0, sizeof(uint32_t)));
int num_threads = 128;
int num_threadblocks = 1;
DummyKernel<<<num_threadblocks, num_threads, 0, cuda_stream>>>(pings);
CUDACHECK(cudaStreamSynchronize(cuda_stream));
}
When I run the above code, I get
Spin done. 0 must now be <= 2
Spin done. 1 must now be <= 2
Spin done. 2 must now be <= 2
Spin done. 3 must now be <= 2
Note that the third and fourth lines should not be printed.
If I comment out the asm
line within the while loop, then it works as expected.
What am I doing wrong?
Thanks,
Indu