Hello all,
Could any of you please run this code (see below) and check whether there is any garbage in the output? Judging from the kernel, I expected the ‘out’ array to be filled with ones, but on my setup (OS X 10.6.2, CUDA 2.3, GF9400) ‘out’ contains garbage numbers for some reason. If I change ‘lId / 2’ to ‘lId’, or decrease the shared memory size along with the number of threads, the bug disappears.
Example output:
$ make clean; make; release/test_cuda
nan
0: 1.000000
1: -165970779523315961984400771799580672.000000
2: -169949038053089037631926039430826033152.000000
3: -138290592331389744016850419712.000000
4: 35736483056626696192.000000
5: 2638969508086798374919131611799748608.000000
6: nan
7: -85049812401595674729494317898999529472.000000
8: nan
9: -0.000000
10: nan
11: -0.000000
12: 340280805175398982284751963493419188224.000000
13: -10382680480738939701846136061952000.000000
14: -1.468738
…
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
extern "C"{
/*
 * test - fill a block-wide scratch tile with ones, then have every thread
 * copy the element at index (threadIdx.x / 2) of that tile to out[threadIdx.x].
 *
 * Launch layout: indexing uses threadIdx.x only, so the kernel is meant for a
 * single 1-D block. The tile holds 128 floats; threads with threadIdx.x >= 128
 * neither touch the tile nor write to `out`.
 *
 * out: device pointer with room for at least min(blockDim.x, 128) floats.
 */
__global__ void test(float *out)
{
    // The tile MUST be __shared__. Without the qualifier each thread gets its
    // own private array, initialises only its own slot sMem[lId], and then
    // reads sMem[lId / 2] -- a slot that (for every thread but 0) was never
    // written in that thread's private copy, which yields garbage.
    __shared__ float sMem[128];

    const int lId = threadIdx.x;
    const bool inTile = lId < 128;   // protect the tile from over-wide blocks

    if (inTile)
        sMem[lId] = 1.0f;

    // Barrier stays outside the branch so every thread of the block reaches it;
    // it makes all writes to the tile visible before any cross-thread read.
    __syncthreads();

    if (inTile)
        out[lId] = sMem[lId / 2];
}
}
/*
 * Host driver: selects a device, launches `test` with a single block of N
 * threads, copies the N results back, and prints their sum followed by every
 * element as "index: value".
 *
 * Returns 0 on success, EXIT_FAILURE if the host buffer cannot be allocated.
 * CUDA failures are routed through the cutil checking helpers.
 */
int main( int argc, char** argv)
{
    // Honour an explicit "device" command-line flag; otherwise use the device
    // that cutGetMaxGflopsDeviceId() picks.
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));

    const int N = 128;   // thread count of the single block == number of outputs

    float *h_out_buf = (float*)malloc(N * sizeof(float));
    if (h_out_buf == NULL) {
        // Previously unchecked: a failed malloc would have been handed straight
        // to cudaMemcpy as the destination pointer.
        fprintf(stderr, "failed to allocate host buffer for %d floats\n", N);
        return EXIT_FAILURE;
    }

    float *d_out_buf;
    cutilSafeCall(cudaMalloc((void**)&d_out_buf, N * sizeof(float)));

    test<<<1, N>>>(d_out_buf);
    // A kernel launch returns no status itself; cutilCheckMsg is called here to
    // check for a pending error (NOTE(review): exact behaviour is defined in cutil).
    cutilCheckMsg("test");

    // Blocking copy back to the host buffer.
    cutilSafeCall(cudaMemcpy(h_out_buf, d_out_buf, N * sizeof(float), cudaMemcpyDeviceToHost));

    float sum = 0;
    for (int i = 0; i < N; i++)
        sum += h_out_buf[i];
    printf("%f\n", sum);

    for (int i = 0; i < N; i++)
        printf("%d: %f\n", i, h_out_buf[i]);

    cutilSafeCall(cudaFree(d_out_buf));
    free(h_out_buf);
    cudaThreadExit();
    return 0;
}