how to measure the execution time of matrix multiplication program

i want to know the execution time of matrix multiplication program .

i wrote the following code in visual studio 2010.

#include<cuda.h>
#include<stdio.h>

int main(void) {
//capture the start time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
void MatrixMultiplication(float , float , float , int);
const int Width = 5;
float M[Width
Width], N[Width
Width], P[Width
Width];
for(int i = 0; i < (WidthWidth) ; i++) {
M[i] = 5;
N[i] = 10;
P[i] = 0;
}
MatrixMultiplication(M, N, P, Width);
for(int i = 0; i < (Width
Width) ; i++) {
printf("%f \n", P[i]);
}
int quit;
scanf("%d",&quit);
return 0;
}

//Matrix multiplication kernel - thread specification
global void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width) {
//2D Thread ID
int tx = threadIdx.x;
int ty = threadIdx.y;

//Pvalue stores the Pd element that is computed by the thread
float Pvalue = 0;

for(int k = 0; k < Width ; ++k) {
    float Mdelement = Md[ty*Width + k];
    float Ndelement = Nd[k*Width + tx];
    Pvalue += (Mdelement*Ndelement);
}

Pd[ty*Width + tx] = Pvalue;

}

void MatrixMultiplication(float *M, float N, float P, int Width) {
int size = Width
Width
sizeof(float);
float *Md, *Nd, *Pd;

//Transfer M and N to device memory
cudaMalloc((void**)&Md, size);
cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);
cudaMalloc((void**)&Nd, size);
cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);

//Allocate P on the device
cudaMalloc((void**)&Pd,size);

//Setup the execution configuration
dim3 dimBlock(Width,Width);
dim3 dimGrid(1,1);

//Launch the device computation threads!
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);

//Transfer P from device to host
cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);

//get stop time, and display the timing results
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,start,stop);
printf("Time to generate: %3.1f ms\n",elapsedTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//Free device matrices
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);

}

when i run the above code, i got the following errors
Error 1 error : identifier “stop” is undefined C:\Users\kittu\Documents\Visual Studio 2010\Projects\matrix multiplication\matrix multiplication\kernel.cu 69 1 matrix multiplication

Error 2 error : identifier “start” is undefined C:\Users\kittu\Documents\Visual Studio 2010\Projects\matrix multiplication\matrix multiplication\kernel.cu 72 1 matrix multiplication

Error 3 error MSB3721: The command ““C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\bin\nvcc.exe” -gencode=arch=compute_10,code=“sm_10,compute_10” --use-local-env --cl-version 2010 -ccbin “C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin” -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include” -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include" -G --keep-dir “Debug” -maxrregcount=0 --machine 32 --compile -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler “/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o “Debug\kernel.cu.obj” “C:\Users\kittu\Documents\Visual Studio 2010\Projects\matrix multiplication\matrix multiplication\kernel.cu”” exited with code 2. C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\BuildCustomizations\CUDA 5.0.targets 592 10 matrix multiplication

Also i got the result of matrix multiplication (list of 25 "250 "s), but i didn’t get the elapsed time.

Tell me where i need to change the code.

Thanks in advance.

regards,
balakrishna.

Hi,

first you should declare cudaEvents (start and stop) in the MatrixMultiplication function or you should pass them as an argument to this function. You can declare both Events as Global variables too…

Second, you should put cudaEventRecord(name_event,0) when you want to start and finish obtain the time. Usually…

cudaEventRecord…
cudaMemcpy
Kernel<<<…>>>
cudaMemcpy
cudaEventRecord…

or if you only wants kernel execution time

cudaEventRecord…
Kernel<<<…>>>
cudaEventRecord…

I can’t help you with the last error (I don’t use Windows) but you should look for a SDK sample.

Good luck!