CUDA & modulos

I wrote some pieces of code using the modulo operators, and noticed that these, especially in comparisons, give weird results.

Example:

#include <stdio.h>

#include <assert.h>

#include <math.h>

#define BLOCKSIZE 512

__global__ void kfinddiv(int *d_a, int k)

{

	long long idx = gridDim.x*blockIdx.y+blockDim.x*blockIdx.x+threadIdx.x;

	if (idx+2 < k/2)

	{

		if (k%(idx+2) == 0)

			*d_a = idx+2;

	}

}

int main()

{

	int input;

	scanf("%d", &input);

	

	dim3 dimGrid(int(ceil(float(input/BLOCKSIZE))), int(ceil(float(input/BLOCKSIZE))));

	dim3 dimBlock(BLOCKSIZE);

	

	printf("%d\n", int(ceil(float(input/BLOCKSIZE))));

	

	int *d_a;

	int *h_a;

	h_a = (int*)malloc(sizeof(int));

	cudaMalloc(&d_a, sizeof(int));

	*h_a = 0;

	cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);

	

	kfinddiv<<< dimGrid, dimBlock >>>(d_a, input);

	

	cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);

	printf("%d\n", *h_a);

	return 0;

}

This works fine when compiled with -deviceemu, but as soon as it starts running on the real deal the modulo seems to be wrongly evaluated.

Does anyone know whether, or when this will be fixed. (I’m 100% sure it is the modulo operator: add an if(idx == 0 && k%(idx+2)) at the end and use an even number as imput and you’ll see what I mean. then try removing the modulo portion)

I am confused you index computation

long long idx = gridDim.x*blockIdx.y+blockDim.x*blockIdx.x+threadIdx.x;

If you want to sweep all elements of 2-D data, then it should be

long long idx = (gridDim.x*blockIdx.y+blockDim.x)*blockIdx.x+threadIdx.x;

second you have race condition on updating *d_a

if (idx+2 < k/2)

	{

		if (k%(idx+2) == 0)

			*d_a = idx+2;

	}

Could you provide CPU version, then we can check your parallelized version?