Hi,
I have found a strange behaviour in our CUDA code. So far I have not succeeded in creating a small code snippet that reproduces the error. Therefore I just want to ask whether someone can imagine where the problem might be located.
In short, our code performs an integer division:
c = a / b
where all variables are of type int.
This division works in nearly all cases as expected. However, we find a situation where this division gives the wrong result (753 / 251 = 4, instead of 753 / 251 = 3).
This problem is really strange, and I think we can rule out race conditions in this part of the code, since the variables involved are either local variables or variables which never change.
We could circumvent this problem by transforming the division to a floating-point division:
c = a * float(1.0f/b)
With this modification the result is correct (c=3).
Our suspicion is that there may be a problem with integer divisions when the division is exact (without remainder), since we have not observed this problem in any other case. However, I can hardly imagine this to be a compiler bug. My hope is that someone at least has an idea what might be causing this problem.
ADDITION:
We now succeeded in reproducing this problem:
(Note: block_idx = 0, since 753 % 251 = 0.
Hence (block_id_2d - block_idx) / blks_x = 753 / 251 should be 3, but the kernel reports 4.)
#include <stdio.h>
// Reproducer kernel for the unexpected integer-division result
// (753 / 251 reported as 4 instead of 3).
//
// Reads a linear block number from block_offset[0], splits it into an x and a
// y coordinate for a grid that is blocks_x blocks wide, and prints both the
// modulo and the division result with device-side printf.
//
// blocks_x     : grid width in blocks; used as the divisor for both the modulo
//                and the division (not checked against zero here).
// block_offset : pointer that is dereferenced at index 0 inside the kernel, so
//                it must be readable from device code.
__global__ void div_issue( int blocks_x, int* block_offset)
{
int blks_x = blocks_x;
// linear (flattened) number of the block in the 2D grid
int block_id_2d = block_offset[0];
// x-coordinate of the block in the absolute grid
int block_idx = block_id_2d % blks_x;
// y-coordinate of the block in the absolute grid; the remainder is subtracted
// first, so the numerator is an exact multiple of blks_x
int block_idy = (block_id_2d - block_idx) / blks_x;
// print operands and results so the wrong quotient is visible in the output
printf("%d mod %d = %d \n", block_id_2d, blks_x, block_idx);
printf("%d / %d = %d \n", block_id_2d - block_idx, blks_x, block_idy);
}
// Workaround that gave the correct result (3): replace the block_idy line in the kernel with
//int block_idy = int(block_id_2d - block_idx) * float(1.0f/blks_x);
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Runs the div_issue reproducer once on every CUDA device in the system.
//
// For each device: prints the device name, uploads the value 753 into a
// one-int device buffer, launches div_issue with a single thread and a grid
// width of 251, waits for the kernel, and releases the buffer.
//
// Returns 0 on success and 1 if no device is present or any CUDA call fails.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    // Query the real device count instead of assuming exactly two GPUs.
    int device_count = 0;
    if (!cuda_ok(cudaGetDeviceCount(&device_count), "cudaGetDeviceCount"))
        return 1;
    if (device_count == 0)
    {
        fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }

    for (int i = 0; i < device_count; i++)
    {
        if (!cuda_ok(cudaSetDevice(i), "cudaSetDevice"))
            return 1;

        cudaDeviceProp prop;
        if (!cuda_ok(cudaGetDeviceProperties(&prop, i), "cudaGetDeviceProperties"))
            return 1;
        printf("using device %s :\n\n", prop.name);

        int block_offset_host[1];
        block_offset_host[0] = 753;

        int* block_offset_dev = NULL;
        if (!cuda_ok(cudaMalloc(&block_offset_dev, sizeof(int)), "cudaMalloc"))
            return 1;

        bool ok = cuda_ok(cudaMemcpy(block_offset_dev, block_offset_host, sizeof(int),
                                     cudaMemcpyHostToDevice),
                          "cudaMemcpy");
        if (ok)
        {
            div_issue <<<1, 1 >>>( 251 , block_offset_dev);
            // Launch-configuration errors only show up via cudaGetLastError();
            // the synchronize surfaces in-kernel faults and flushes the
            // device-side printf output before the next host printf.
            ok = cuda_ok(cudaGetLastError(), "div_issue launch")
                 && cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        }

        // Release the buffer even when an earlier step failed, so nothing
        // leaks on the device before we bail out.
        ok = cuda_ok(cudaFree(block_offset_dev), "cudaFree") && ok;
        if (!ok)
            return 1;

        printf("\n");
    }
    return 0;
}