 # CUDA & modulos

I wrote some pieces of code using the modulo operators, and noticed that these, especially in comparisons, give weird results.

Example:

``````#include <stdio.h>

#include <assert.h>

#include <math.h>

#define BLOCKSIZE 512

__global__ void kfinddiv(int *d_a, int k)

{

if (idx+2 < k/2)

{

if (k%(idx+2) == 0)

*d_a = idx+2;

}

}

int main()

{

int input;

scanf("%d", &input);

dim3 dimGrid(int(ceil(float(input/BLOCKSIZE))), int(ceil(float(input/BLOCKSIZE))));

dim3 dimBlock(BLOCKSIZE);

printf("%d\n", int(ceil(float(input/BLOCKSIZE))));

int *d_a;

int *h_a;

h_a = (int*)malloc(sizeof(int));

cudaMalloc(&d_a, sizeof(int));

*h_a = 0;

cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);

kfinddiv<<< dimGrid, dimBlock >>>(d_a, input);

cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);

printf("%d\n", *h_a);

return 0;

}
``````

This works fine when compiled with -deviceemu, but as soon as it starts running on the real deal the modulo seems to be wrongly evaluated.

Does anyone know whether, or when this will be fixed. (I’m 100% sure it is the modulo operator: add an if(idx == 0 && k%(idx+2)) at the end and use an even number as imput and you’ll see what I mean. then try removing the modulo portion)

I am confused you index computation

``````long long idx = gridDim.x*blockIdx.y+blockDim.x*blockIdx.x+threadIdx.x;
``````

If you want to sweep all elements of 2-D data, then it should be

``````long long idx = (gridDim.x*blockIdx.y+blockDim.x)*blockIdx.x+threadIdx.x;
``````

second you have race condition on updating *d_a

``````if (idx+2 < k/2)

{

if (k%(idx+2) == 0)

*d_a = idx+2;

}
``````

Could you provide CPU version, then we can check your parallelized version?