Error on accessing GPU

Hello all, I have a code that should work right but all of my codes that has device codes somehow do some weird
things. Does anyone have a clue of what is happening. Can this be a driver problem? If so, can somebody
teach me how to solve it??? Thank you.

Code:

#include “…/common/book.h”

#define N 50

global void add(int *a, int *b, int *c) {
int tid = blockIdx.x; //handle the data at this index
if (tid <N)
c[tid] = a[tid] + b[tid];
}

int main( void ) {
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c; //for GPU memory allocation

//memory allocation on GPU
cudaMalloc( (void**)&dev_a, N*sizeof(int));
cudaMalloc( (void**)&dev_b, N*sizeof(int)); 
cudaMalloc( (void**)&dev_c, N*sizeof(int)); 

//fill in a, b
for (int i=0; i<N; i++) {
a[i] = -i;
b[i] = i*i;
}

//copy of arrays 'a', 'b' on GPU

cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);

add<<<1, 1>>>(dev_a, dev_b, dev_c);
cudaThreadSynchronize();	

//copy the array 'c' back from the GPU to CPU

cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);

//display results

for (int i=0; i<N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}

//free the memory allocated on the GPU
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;

}

Errors: This is the result from following file

My computer setting is
Win 7
Intel Core2
Geforce 9800GT

I did the following to see what is happening and the following happened. It seems like cudaMemcpyDeviceToHost is not working since
memory in “c” does not change at all. Does anyone have clue for this?

#include “…/common/book.h”

#define N 10

global void add(int *a, int *b, int *c) {
int tid = threadIdx.x; //handle the data at this index
if (tid <N)
c[tid] = a[tid] + b[tid];
__syncthreads();
}

int main( void ) {
int *a, *b, *c;
int *dev_a, *dev_b, *dev_c; //for GPU memory allocation

//memory allocation on Host

a = (int *)malloc(sizeof(int)*N);
b = (int *)malloc(sizeof(int)*N);
c = (int *)malloc(sizeof(int)*N);

//debugging
for (int i=0; i<N; i++) {
printf( “%d\n”,c[i] );
}
printf( “\n” );
printf( “\n” );

//memory allocation on GPU
cudaMalloc( (int **)&dev_a, N*sizeof(int));
cudaMalloc( (int **)&dev_b, N*sizeof(int)); 
cudaMalloc( (int **)&dev_c, N*sizeof(int)); 

//fill in a, b
for (int i=0; i<N; i++) {
a[i] = -i;
b[i] = i*i;
}

//copy of arrays 'a', 'b' on GPU
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);

add<<<1, 1>>>(dev_a, dev_b, dev_c);
cudaThreadSynchronize();

//debugging
for (int i=0; i<N; i++) {
printf( “%d\n”,c[i] );
}
printf( “\n” );
printf( “\n” );

//copy the array 'c' back from the GPU to CPU

cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);

//debugging
for (int i=0; i<N; i++) {
printf( “%d\n”,c[i] );
}
printf( “\n” );
printf( “\n” );

//display results

for (int i=0; i<N; i++) {
printf( "%d + %d = %d\n", a[i], b[i], c[i] );
}

//free the memory allocated on the GPU
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;

}

hi,

you are calling with 1 block of 1 thread. did you want to call 1 block of N threads?

even with this, I would expect the first place in the array c[0] to be with the correct result.

the function cudaMemcopy returns an error code. could you get that code? maybe it will give a clue as to the problem.

good luck,