Kernel functions do not seem to run

Hi,
I have been running a simple code stated below, which simply adds two vectors to a third one using a kernel running on the device.
I do not get the desired results, it seems that the kernel is not run at all.
The results produced looks like an uninitialized block of memory.
I am running on windows XP with NVIDIA GEForce 9300 GS.
Any ideas?
thanks,
Elad

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#include <cuda.h>

// Device code
global void VecAdd(BYTE* A, BYTE* B, BYTE* C, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N)
C[i] = A[i]+B[i];
}
// Host code
int HelloCUDA()
{
cudaSetDevice(1);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

int n;
int N = 100; 
size_t size = N;
// Allocate input vectors h_A and h_B in host memory 
BYTE* h_A = (BYTE*)malloc(size); 
BYTE*  h_B = (BYTE*)malloc(size); 
BYTE* h_C = (BYTE*)malloc(size);

// Initialize input vectors ... 
memset(h_A,1,size);
memset(h_B,1,size);
memset(h_C,0,size);


// Allocate vectors in device memory 

BYTE* stam; 
int Err = cudaMalloc((void**)&stam, size); 
BYTE* d_A; 
Err = cudaMalloc((void**)&d_A, size); 
BYTE* d_B; 
Err = cudaMalloc((void**)&d_B, size); 
BYTE* d_C; 
Err = cudaMalloc((void**)&d_C, size); 

// Copy vectors from host memory to device memory 
Err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 
Err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 

// Invoke kernel 
VecAdd<<<5, 5>>>(d_A, d_B, d_C, N); 
// Copy result from device memory to host memory 
// h_C contains the result in host memory 
Err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 


    printf("input\n");
for (int i=0;i<size;i++)
	printf("%d ",h_A[i]);
printf("result\n");
for (int i=0;i<size;i++)
	printf("%d ",h_C[i]);

// Free device memory
cudaFree(d_A); 
cudaFree(d_B); 
cudaFree(d_C);
cudaFree(stam);
// Free host memory ... 
free(h_A);
free(h_B);
free(h_C);

return 0;

}

You may want to add CudaError_t cudaErr = cudaGetLastError(); immediately following the kernel invocation, and go from there.

You may want to add CudaError_t cudaErr = cudaGetLastError(); immediately following the kernel invocation, and go from there.