Hi,

I currently work at a company that doesn’t allow me to download the CUDA SDK. Therefore I can’t run the device query test or the memory bandwidth test.

So I wrote a (very) simple kernel, which builds without any problem but doesn’t give me the expected results. Instead, I get a zero-filled vector.

This kernel computes the sum of two vectors of size N (single-precision floating point) and stores the result in a third vector.

The code is very simple, so I don’t think I’ve made a mistake in it. I don’t understand what could be wrong… perhaps a hardware problem, or a careless mistake…

I use toolkit v3.2 and NVIDIA driver 6.14.12.5957, dated 01/09/2010, under Windows XP.

Thank you very much for the attention you’ll give to my request.

R.Portalez

Here is my code :

main.cpp

[codebox]#include <stdio.h>
#include <stdlib.h>

/* Defined in kernel.cu; C linkage keeps the symbol name unmangled so the two files link. */
extern "C" void addGPU(int N, float* a, float* b, float* c) ;

/*
 * Test driver: fills two N-element float vectors with 0, 1, ..., N-1,
 * asks addGPU() for their element-wise sum and prints every result next to
 * the value expected from the host-side computation.
 *
 * Returns 0 on normal completion, 1 if a host buffer could not be allocated.
 */
int main(int argc, char** argv)
{
    const int N = 5 ;

    /* The buffers hold floats, so size them with sizeof(float), not sizeof(int). */
    float* a  = (float*) malloc(N * sizeof(float)) ;
    float* b  = (float*) malloc(N * sizeof(float)) ;
    float* cc = (float*) malloc(N * sizeof(float)) ;

    if (a == NULL || b == NULL || cc == NULL)
    {
        fprintf(stderr, "main: host allocation failed\n") ;
        free(a) ;   /* free(NULL) is a no-op, so no per-pointer test is needed */
        free(b) ;
        free(cc) ;
        return 1 ;
    }

    for (int i = 0 ; i < N ; ++i)
    {
        a[i]  = float(i) ;
        b[i]  = float(i) ;
        cc[i] = 0.f ;      /* stays 0 if the GPU never writes the result */
    }

    addGPU(N, a, b, cc) ;

    /* Show the result so a wrong (e.g. all-zero) vector is visible immediately. */
    for (int i = 0 ; i < N ; ++i)
    {
        printf("c[%d] = %g (expected %g)\n", i, cc[i], a[i] + b[i]) ;
    }

    free(a) ;
    free(b) ;
    free(cc) ;
    return 0 ;
}

[/codebox]

kernel.cu

[codebox]
/*
 * Element-wise vector sum: c[k] = a[k] + b[k] for every 0 <= k < N.
 *
 * Grid-stride loop over a 1D launch: each thread starts at its global index
 * (threadIdx.x + blockIdx.x * blockDim.x) and advances by the total number
 * of launched threads (blockDim.x * gridDim.x), so any grid/block size covers
 * all N elements and threads whose start index is >= N do nothing.
 *
 * N : number of elements in each vector
 * a : first input vector  (read only)
 * b : second input vector (read only)
 * c : output vector
 * The pointers are dereferenced on the device, so they must be device-accessible.
 */
__global__ void kernel_addGPU(int N, const float* a, const float* b, float* c)
{
    const int step = blockDim.x * gridDim.x ;

    for (int k = threadIdx.x + blockIdx.x * blockDim.x ; k < N ; k += step)
    {
        c[k] = a[k] + b[k] ;
    }
}

#include <stdio.h>   /* fprintf: kernel.cu has no include block of its own */

/*
 * Host wrapper around kernel_addGPU: copies a and b to the device, launches
 * the kernel with 1 block of 8 threads, and copies the sum back into c.
 *
 * N       : number of elements in each vector; nothing is done when N <= 0
 * a, b    : host input vectors of N floats
 * c       : host output vector of N floats
 *
 * Every CUDA call is checked. On the first failure the remaining steps are
 * skipped, the CUDA error string is written to stderr, and c is left
 * untouched. Without these checks a failing runtime (for example a driver
 * too old for the toolkit) silently leaves c unchanged.
 */
extern "C" void addGPU(int N, float* a, float* b, float* c)
{
    if (N <= 0)
    {
        return ;
    }

    const size_t size = (size_t) N * sizeof(float) ;
    float *d_a = 0, *d_b = 0, *d_c = 0 ;   /* null so cleanup is safe on early failure */
    cudaError_t err ;

    err = cudaMalloc((void**) &d_a, size) ;
    if (err == cudaSuccess) err = cudaMalloc((void**) &d_b, size) ;
    if (err == cudaSuccess) err = cudaMalloc((void**) &d_c, size) ;

    if (err == cudaSuccess) err = cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) ;
    if (err == cudaSuccess) err = cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) ;

    if (err == cudaSuccess)
    {
        kernel_addGPU <<< 1, 8 >>> (N, d_a, d_b, d_c) ;
        /* A launch returns no status itself; launch failures are read back here. */
        err = cudaGetLastError() ;
    }

    /* cudaThreadSynchronize is the synchronisation call kept from the original
       (toolkit v3.2); its return value reports errors raised while the kernel ran. */
    if (err == cudaSuccess) err = cudaThreadSynchronize () ;

    if (err == cudaSuccess) err = cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) ;

    if (err != cudaSuccess)
    {
        fprintf(stderr, "addGPU: CUDA error: %s\n", cudaGetErrorString(err)) ;
    }

    /* cudaFree accepts a null pointer, so this is valid on every path. */
    cudaFree(d_a) ;
    cudaFree(d_b) ;
    cudaFree(d_c) ;
}[/codebox]