I’ve been trying to identify the source of a race condition in my CUDA kernel for days but I have had no luck. I am trying to implement a simple parallel prime sieve but it fails to produce correct results:
41.cu
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define B 4096
#define T 256
#define N (B*T)
#define checkCudaErrors(error) {\
if (error != cudaSuccess) {\
printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(error));\
exit(1);\
}\
}\
__global__ void prime_sieve(int *primes) {
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i > 1 && i<N) {
for (int j=2; j<N/2; j++) {
if (i*j > 1 && i*j < N) {
primes[i*j] = 0;
}
}
}
}
int main() {
int *h_primes=(int*)malloc(N * sizeof(int));
int *d_primes;
for (int i=0; i<N; i++) {
h_primes[i]=i;
}
h_primes[0] = h_primes[1] = 0;
checkCudaErrors(cudaMalloc( (void**)&d_primes, N*sizeof(int)));
checkCudaErrors(cudaMemcpy(d_primes,h_primes,N*sizeof(int),cudaMemcpyHostToDevice));
prime_sieve<<<B,T>>>(d_primes);
checkCudaErrors(cudaMemcpy(h_primes,d_primes,N*sizeof(int),cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_primes));
int size = 0;
int total = 0;
for (int i=0; i<N; i++) {
if (h_primes[i]) {
size++;
}
total++;
}
printf("\n");
printf("Length = %d\tPrimes = %d\n",total,size);
free(h_primes);
return 0;
}
cuda-memcheck does not report any errors for racecheck, initcheck or synccheck. I don’t understand why but if I remove the ij > 1 conditional from the kernel inner if clause, an illegal memory access error is thrown but why would ij be negative?. Even though they are signed integers, they are always within the integer limit bounds.
Compilation is done via
nvcc 41.cu -o 41.o -arch=sm_30
On a GTX 780 Ti with CUDA version 8.0.61 on Ubuntu 16.04
When I run the program, the output is always deterministic but incorrect:
Length = 1048576 Primes = 0
Can anyone see any problems with my kernel code?
Help much appreciated