Cuda C.

Hi all I have created a program for vector addition in CUDA C and executing it in visual studio 2010. Program executed perfectly but it never enters into global void add(int *a, int *b, int *c, int n) function and the statements inside this function are not executed.

Why the program is not entering into global function? Kinldy reply.
Program is as follows

//#include “cuda.h”
#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <malloc.h>
#include <assert.h>
#define SIZE 10

global
void add(int *a, int *b,int *c,int count)
{
int n=threadIdx.x;
if(n<count)
c[n]=a[n]+b[n];
}

int main()
{
int i,*a,*b,*c,*d;
int *d_a,*d_b,*d_c,t;

a=(int *)malloc(SIZE*sizeof(int));
b=(int *)malloc(SIZE*sizeof(int));
c=(int *)malloc(SIZE*sizeof(int));


t=SIZE*sizeof(int);

cudaMalloc((void **)&d_a,t);
cudaMalloc((void **)&d_b,t);
cudaMalloc((void **)&d_c,t);

for(i=0;i<SIZE;i++)
{
	a[i]=4;
	b[i]=5;
	c[i]=0;
}

cudaMemcpy(d_a,a,SIZE*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_b,b,SIZE*sizeof(int),cudaMemcpyHostToDevice);
//cudaMemcpy(d_c,c,SIZE*sizeof(int),cudaMemcpyHostToDevice);

int block_size = 128;
int grid_size = SIZE / block_size;

hello<<<grid_size,block_size>>>(d_a,d_b,d_c,SIZE);

cudaMemcpy(c,d_c,SIZE*sizeof(int),cudaMemcpyDeviceToHost);


for(i=0;i<SIZE;i++)
{
	printf("c[%d]=%d\n",i,c[i]);
}
free(a);
free(b);
free(c);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

// system(“pause”);

}

Please Please reply

global
void add(int *a, int *b,int *c,int count) ;

hello<<<grid_size,block_size>>>(d_a,d_b,d_c,SIZE);

???

cudaGetLastError() before and after kernel launches is best

and perhaps post on the programming and performance board, if it is not directly gdb related

its by mistake
actual is as follows

add<<<grid_size,block_size>>>(d_a,d_b,d_c,SIZE);

Please somebody guide me on this