Hello,

While running a simulation on a GTX 480, I noticed that the results vary from run to run. As I couldn’t find the cause for this, I reduced the problem to the simplest kernel that still exhibits this strange behaviour:

```
// Number of elements in the device arrays (also the total number of threads launched).
// Parenthesized so the macro expands safely inside any expression, e.g. `x / N` or `x % N`.
#define N (1024*1024)
// Advances one per-thread pseudo-random state by two xor-shift steps and
// accumulates the new state into that thread's running sum.
//
// sums  : per-element accumulators, updated in place (sums[i] += new state)
// seeds : per-element generator states, overwritten with the new state
//
// Launch layout: indexing uses only the x dimension, one thread per element.
// There is no bounds check, so the launch must supply exactly as many threads
// as there are elements in both arrays.
__global__ void indeterministic(unsigned long* sums, unsigned int* seeds) {
    const int element = threadIdx.x + blockDim.x * blockIdx.x;

    unsigned int state = seeds[element];
    state = state ^ (state << 17);
    state = state ^ (state >> 14);

    // Publish the new state first, then fold it into the accumulator.
    seeds[element] = state;
    sums[element] = sums[element] + state;
}
// Terminates the program with a diagnostic if a CUDA runtime call failed.
// Without this, a failed allocation, copy or kernel launch goes unnoticed and
// the program prints whatever happens to be in the result buffer.
static void checkCuda(cudaError_t status, const char* what, int line) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error at line %d (%s): %s\n",
                line, what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}
#define CUDA_CHECK(call) checkCuda((call), #call, __LINE__)

// Runs the same experiment twice and prints one total per pass.
//
// Each pass zeroes the device accumulators, reloads the identical seeds
// (1..N), launches the kernel 100000 times, and prints the sum of all
// accumulators. Because both passes start from the same state, the two
// printed values are expected to be identical.
//
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call, kernel
// launch or host allocation fails.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const size_t count = N;
    const unsigned int threadsPerBlock = 256;
    // The kernel has no bounds check, so the grid must cover exactly `count`
    // threads; this relies on `count` being a multiple of threadsPerBlock.
    const unsigned int blocks = (unsigned int)(count / threadsPerBlock);

    unsigned long* sums = NULL;
    unsigned int* seeds = NULL;
    CUDA_CHECK(cudaMalloc(&sums, count * sizeof(unsigned long)));
    CUDA_CHECK(cudaMalloc(&seeds, count * sizeof(unsigned int)));

    // Host buffers are allocated once and reused by both passes.
    unsigned int* h_seeds = (unsigned int*) malloc(count * sizeof(unsigned int));
    unsigned long* h_sums = (unsigned long*) malloc(count * sizeof(unsigned long));
    if (h_seeds == NULL || h_sums == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(h_sums);
        free(h_seeds);
        cudaFree(seeds);
        cudaFree(sums);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < count; i++)
        h_seeds[i] = (unsigned int)(i + 1);

    for (int j = 0; j < 2; j++) {
        // Reset device state so every pass starts from identical inputs.
        CUDA_CHECK(cudaMemset(sums, 0, count * sizeof(unsigned long)));
        CUDA_CHECK(cudaMemcpy(seeds, h_seeds, count * sizeof(unsigned int),
                              cudaMemcpyHostToDevice));

        for (int i = 0; i < 100000; i++) {
            indeterministic<<<blocks, threadsPerBlock>>>(sums, seeds);
            // A kernel launch returns nothing; launch failures are only
            // visible through cudaGetLastError().
            CUDA_CHECK(cudaGetLastError());
        }
        // Surface errors raised while the queued kernels were executing
        // before the results are trusted.
        CUDA_CHECK(cudaDeviceSynchronize());

        CUDA_CHECK(cudaMemcpy(h_sums, sums, count * sizeof(unsigned long),
                              cudaMemcpyDeviceToHost));

        unsigned long sum = 0;
        for (size_t i = 0; i < count; i++)
            sum += h_sums[i];
        printf("%lu\n", sum);
    }

    free(h_sums);
    free(h_seeds);
    CUDA_CHECK(cudaFree(sums));
    CUDA_CHECK(cudaFree(seeds));
    return(0);
}
```

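Sometimes, when running this code, runaway values appear in the printed results. Both passes of the outer loop start from the same seeds and zeroed sums, so the two printed values should always be identical.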

I’d appreciate any help.