I am writing a very basic version of Vector Vector Multiplication using CUDA. The code is as follows
Length of each vector : 100
Number of Grid Taken : 1
Number of Block : 1 ( 10 by 10 )
Input values are hard coded also. But while running , i am getting wrong result. Can any body point out , where i am making mistake.
While running with device emulation mode [ -deviceemu ], the same program is producing correct result.
please help me out…
thanks in advance .
with regards
sam
--------------------source-----------------------------
#include<stdio.h>
#include<cuda.h>
#define LEN 100
global void VectVect(float *dMatA, float *dMatB, int length, float *device_result)
{
int tidx = threadIdx.x;
int tidy = threadIdx.y;
float tempResult = 0.0f;
tempResult = dMatA[ (10 * tidx) + tidy ] * dMatB[ (10 * tidx) + tidy ];
__syncthreads();
device_result[0] += tempResult;
__syncthreads();
}//end of VectVect device function
int main(int argc, char* argv)
{
float *dMatA, *dMatB;
float *hMatA, *hMatB;
float *dresult, *hresult;
int length = LEN, count = 0;
//allocation host memory
hMatA = (float*) malloc( LEN * sizeof(float));
hMatB = (float*) malloc( LEN * sizeof(float));
hresult = (float*) malloc( sizeof(float));
//allocation device memory
cudaMalloc( (void**)&dMatA, LEN* sizeof(float));
cudaMalloc( (void**)&dMatB, LEN* sizeof(float));
cudaMalloc( (void**)&dresult, sizeof(float));
// assinging value to host vectors
for( count = 0; count < LEN ; count++ )
hMatA[count] = hMatB[count] = 2.00f;
// copying host vector to device vector
cudaMemcpy((void*)dMatA, (void*)hMatA, LEN* sizeof(float) , cudaMemcpyHostToDevice );
cudaMemcpy((void*)dMatB, (void*)hMatB, LEN* sizeof(float) , cudaMemcpyHostToDevice );
// defining thread grid and block
dim3 dimGrid(1,1);
dim3 dimBlock(10,10);
hresult[0] = 0.00f;
cudaMemcpy((void*)dresult, (void*)hresult, sizeof(float) , cudaMemcpyHostToDevice );
//calling device kernel
VectVect<<<dimGrid, dimBlock>>>( dMatA, dMatB, length, dresult );
//retriving result from device
cudaMemcpy((void*)hresult, (void*)dresult, sizeof(float) , cudaMemcpyDeviceToHost );
printf( " Result : %f ", hresult[0]);
cudaFree(dMatA);
cudaFree(dMatB);
cudaFree(dresult);
free(hMatA);
free(hMatB);
free(hresult);
}// end of main
---------------------------output I got ------------
Result : 4.00
---------------------------expected output------------
Result : 400.00