I’ve been trying to identify the source of a race condition in my CUDA kernel for days but I have had no luck. I am trying to implement a simple parallel prime sieve but it fails to produce correct results:

41.cu

```
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define B 4096
#define T 256
#define N (B*T)
#define checkCudaErrors(error) {\
if (error != cudaSuccess) {\
printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(error));\
exit(1);\
}\
}\
__global__ void prime_sieve(int *primes) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i > 1 && i<N) {
for (int j=2; j<N/2; j++) {
if (i*j > 1 && i*j < N) {
primes[i*j] = 0;
}
}
}
}
int main() {
int *h_primes=(int*)malloc(N * sizeof(int));
int *d_primes;
for (int i=0; i<N; i++) {
h_primes[i]=i;
}
h_primes[0] = h_primes[1] = 0;
checkCudaErrors(cudaMalloc( (void**)&d_primes, N*sizeof(int)));
checkCudaErrors(cudaMemcpy(d_primes,h_primes,N*sizeof(int),cudaMemcpyHostToDevice));
prime_sieve<<<B,T>>>(d_primes);
checkCudaErrors(cudaMemcpy(h_primes,d_primes,N*sizeof(int),cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_primes));
int size = 0;
int total = 0;
for (int i=0; i<N; i++) {
if (h_primes[i]) {
size++;
}
total++;
}
printf("\n");
printf("Length = %d\tPrimes = %d\n",total,size);
free(h_primes);
return 0;
}
```

cuda-memcheck does not report any errors for racecheck, initcheck or synccheck. I don’t understand why but if I remove the i*j > 1 conditional from the kernel inner if clause, an illegal memory access error is thrown but why would i*j be negative?. Even though they are signed integers, they are always within the integer limit bounds.

Compilation is done via

nvcc 41.cu -o 41.o -arch=sm_30

On a GTX 780 Ti with CUDA version 8.0.61 on Ubuntu 16.04

When I run the program, the output is always deterministic but incorrect:

Length = 1048576 Primes = 0

Can anyone see any problems with my kernel code?

Help much appreciated