Problem with CUDA code

main function
// Host-side snippet: allocates 256*256*256-element buffers on the host and on
// the device, launches the init kernel over the device buffers, copies both
// buffers back to the host and prints element 1 of each.
// `weight` and `volumn` are not declared in this snippet; presumably they are
// short* / float* device pointers declared elsewhere -- confirm.
// NOTE: none of the CUDA API calls or the kernel launch below has its status
// checked, so a failed allocation, launch or copy goes unnoticed here. The host
// buffers come from new[] without initialization, so if the copies do not
// succeed the printed values are whatever happened to be in that memory.
float *test_volumn;
test_volumn=new float[256*256*256];
short *test_weight=new short[256*256*256];
cudaMalloc(&weight,256*256*256*sizeof(short));
cudaMalloc(&volumn,256*256*256*sizeof(float));
// One block per (x, y) pair of a 256 x 256 grid, 256 threads per block:
// 256*256*256 threads in total, one per buffer element.
dim3 dimblock(256);
dim3 dimgrid(256,256);
save_output_image_init_dev<<<dimgrid,dimblock>>>(weight,volumn);
// cudaMemcpy on the default stream blocks until the kernel above has finished.
cudaMemcpy(test_volumn,volumn,256*256*256*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(test_weight,weight,256*256*256*sizeof(short),cudaMemcpyDeviceToHost);
cout<<test_weight[1]<<endl;
cout<<test_volumn[1];

kernel function
// Init kernel: each thread writes exactly one element, setting weight[index]
// to 0 and volumn[index] to -1.
// Launch layout assumed by the index computation: a 2D grid indexed by
// (blockIdx.x, blockIdx.y) and a 1D block indexed by threadIdx.x, with
// blockIdx.x scaled by 65536 (= 256*256) and blockIdx.y by 256. With a
// (256, 256) grid and 256 threads per block, index covers [0, 256*256*256).
// There is no bounds check: both buffers must hold at least as many elements
// as the launch configuration addresses.
// NOTE(review): if `reconstruction` is a class rather than a namespace, this
// definition is not valid CUDA -- a __global__ function cannot be a class
// member. Confirm against the declaration, which is not shown here.
__global__ void reconstruction::save_output_image_init_dev(short *weight,float *volumn)
{
    int _high=blockIdx.x;
    int _wide=blockIdx.y;
    int _long=threadIdx.x;
    int index=_high*65536+_wide*256+_long;
    weight[index]=0;
    volumn[index]=-1.0f;
}

When I run it on a GT9600 (compute capability 1.1), the output is 0 -1, which is correct.
But when I use another computer equipped with a GT630 (compute capability 2.1), the output changes: it is no longer 0 -1. The output seems to be random and is not the same from run to run, which is wrong.
Why?

Try adding status checks for all CUDA API calls, and all kernel launches.

I’m new to this and I use VS2010. Can you tell me how to add status checks for all CUDA API calls and all kernel launches in VS2010?