i make a programe,but it cannot get a right result a programe debug

///
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include <template_kernel.cu>

#define data_size 100
extern “C”
void computeGold( float* reference, float* idata, const unsigned int len);

int
main( int argc, char** argv)
{
float dev_A, dev_B, dev_C;
float
a = (float
) malloc(data_size);
float
b = (float*) malloc(data_size);
float* c = (float*) malloc(data_size);
float* d = (float*) malloc(data_size);
for( unsigned int i = 0; i < data_size; ++i)
{
a[i] = (float) i;
}
// initalize the memory
for( unsigned int j = 0; j < data_size; ++j)
{
b[j] = (float) j;
}

CUDA_SAFE_CALL(cudaMemcpy( dev_A, a, data_size, cudaMemcpyHostToDevice ));
CUDA_SAFE_CALL(cudaMemcpy( dev_B, b, data_size, cudaMemcpyHostToDevice ));
dim3 grid( 1, 1, 1);
dim3 threads( data_size, 1, 1);

VectorMul<<< grid, threads,data_size >>>( dev_A, dev_B, dev_C );
//VectorMul<<< 1,data_size >>>( dev_A, dev_B, dev_C );

CUDA_SAFE_CALL( cudaMemcpy( c, dev_C, data_size,cudaMemcpyDeviceToHost) );

for(int m=0;m<data_size;++m)
{
printf(“gpuresult:%f\n”,c[m]);

}

CUT_EXIT(argc, argv);
}

///

global void
VectorMul( float* dev_A, float* dev_B, float* output )
{
//int idx = threadIdx.x;

//output[idx] = dev_A[idx] * dev_B[idx];

for(int kk=0;kk<data_size;++kk)
{output[kk] = dev_A[kk] * dev_B[kk];
}
}

why not i can not get the right result?

sizeof(float)!=1.

since you are not using any shared memory in your kernel

data_size is unnecessary here

VectorMul<<< grid, threads,data_size >>>( dev_A, dev_B, dev_C );

put the data_size as a parameter in the kernel call.