Copying memory from host to device and vice versa didn't work

Dear all,

I'm new to CUDA. I've just written a small, basic kernel, which is pasted below. It doesn't work: neither the copy from host memory to device memory nor the copy from device memory back to host memory appears to have any effect. None of the API calls involved returns an error code. I'm running this on Windows 7 x64 with an NVIDIA GeForce GT 525M, CUDA Toolkit 4.0 and Visual Studio 2008 SP1. The expected output is 30, but the program prints 0. Any ideas are appreciated.

#include "stdafx.h"
#include "CUDAResult.h"

/**
 * CUDA kernel that adds two unsigned integers.
 *
 * Reads one value through each of `a` and `b` and writes their sum through
 * `c`. The kernel does no thread indexing, so every launched thread performs
 * the same single store; a <<<1,1>>> launch is sufficient.
 *
 * @param a  Pointer to the first operand; must be dereferenceable on the device.
 * @param b  Pointer to the second operand; must be dereferenceable on the device.
 * @param c  Pointer that receives `*a + *b`; must be writable on the device.
 */
__global__ void Add(unsigned int* a, unsigned int* b, unsigned int* c)
{
	*c = *a + *b;
}

int _tmain(int argc, _TCHAR* argv)
CUDAResult res = cuInit(0);

	CUdevice cuDevice = 0;
	res = cuDeviceGet(&cuDevice, 0);

	//Write a wrapper for destroying the context automatically.
	CUcontext cuContext;
	res = cuCtxCreate(&cuContext, 0, cuDevice);
	unsigned int a = 10, b = 20, c = 0;
	//Write a wrapper for releasing memory automatically.
	unsigned int a1;
	res = cuMemAlloc(&a1, sizeof(unsigned int));
	res = cuMemcpyHtoD(a1, &a, sizeof(unsigned int));
	unsigned int b1;
	res = cuMemAlloc(&b1, sizeof(unsigned int));
	res = cuMemcpyHtoD(b1, &b, sizeof(unsigned int));
	unsigned int c1;
	res = cuMemAlloc(&c1, sizeof(unsigned int));
	Add<<<1,1>>>(&a1, &b1, &c1);
	res = cuMemcpyDtoH(&c, c1, sizeof(unsigned int));
	printf("%d", c);
	return -1;

return 0;