///
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <cutil.h>
#include <template_kernel.cu>
#define data_size 100
extern “C”
void computeGold( float* reference, float* idata, const unsigned int len);
int
main( int argc, char** argv)
{
float dev_A, dev_B, dev_C;
float a = (float) malloc(data_size);
float b = (float*) malloc(data_size);
float* c = (float*) malloc(data_size);
float* d = (float*) malloc(data_size);
for( unsigned int i = 0; i < data_size; ++i)
{
a[i] = (float) i;
}
// initalize the memory
for( unsigned int j = 0; j < data_size; ++j)
{
b[j] = (float) j;
}
CUDA_SAFE_CALL(cudaMemcpy( dev_A, a, data_size, cudaMemcpyHostToDevice ));
CUDA_SAFE_CALL(cudaMemcpy( dev_B, b, data_size, cudaMemcpyHostToDevice ));
dim3 grid( 1, 1, 1);
dim3 threads( data_size, 1, 1);
VectorMul<<< grid, threads,data_size >>>( dev_A, dev_B, dev_C );
//VectorMul<<< 1,data_size >>>( dev_A, dev_B, dev_C );
CUDA_SAFE_CALL( cudaMemcpy( c, dev_C, data_size,cudaMemcpyDeviceToHost) );
for(int m=0;m<data_size;++m)
{
printf(“gpuresult:%f\n”,c[m]);
}
CUT_EXIT(argc, argv);
}
///
global void
VectorMul( float* dev_A, float* dev_B, float* output )
{
//int idx = threadIdx.x;
//output[idx] = dev_A[idx] * dev_B[idx];
for(int kk=0;kk<data_size;++kk)
{output[kk] = dev_A[kk] * dev_B[kk];
}
}
why not i can not get the right result?