i want to know the execution time of matrix multiplication program .
i wrote the following code in visual studio 2010.
#include<cuda.h>
#include<stdio.h>
int main(void) {
//capture the start time
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
void MatrixMultiplication(float , float , float , int);
const int Width = 5;
float M[WidthWidth], N[WidthWidth], P[WidthWidth];
for(int i = 0; i < (WidthWidth) ; i++) {
M[i] = 5;
N[i] = 10;
P[i] = 0;
}
MatrixMultiplication(M, N, P, Width);
for(int i = 0; i < (WidthWidth) ; i++) {
printf(“%f \n”, P[i]);
}
int quit;
scanf(“%d”,&quit);
return 0;
}
//Matrix multiplication kernel - thread specification
global void MatrixMulKernel(float *Md, float *Nd, float *Pd, int Width) {
//2D Thread ID
int tx = threadIdx.x;
int ty = threadIdx.y;
//Pvalue stores the Pd element that is computed by the thread
float Pvalue = 0;
for(int k = 0; k < Width ; ++k) {
float Mdelement = Md[ty*Width + k];
float Ndelement = Nd[k*Width + tx];
Pvalue += (Mdelement*Ndelement);
}
Pd[ty*Width + tx] = Pvalue;
}
void MatrixMultiplication(float *M, float N, float P, int Width) {
int size = WidthWidthsizeof(float);
float *Md, *Nd, *Pd;
//Transfer M and N to device memory
cudaMalloc((void**)&Md, size);
cudaMemcpy(Md,M,size,cudaMemcpyHostToDevice);
cudaMalloc((void**)&Nd, size);
cudaMemcpy(Nd,N,size,cudaMemcpyHostToDevice);
//Allocate P on the device
cudaMalloc((void**)&Pd,size);
//Setup the execution configuration
dim3 dimBlock(Width,Width);
dim3 dimGrid(1,1);
//Launch the device computation threads!
MatrixMulKernel<<<dimGrid,dimBlock>>>(Md,Nd,Pd,Width);
//Transfer P from device to host
cudaMemcpy(P,Pd,size,cudaMemcpyDeviceToHost);
//get stop time, and display the timing results
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,start,stop);
printf("Time to generate: %3.1f ms\n",elapsedTime);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//Free device matrices
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
}
when i run the above code, i got the following errors
Error 1 error : identifier “stop” is undefined C:\Users\kittu\Documents\Visual Studio 2010\Projects\matrix multiplication\matrix multiplication\kernel.cu 69 1 matrix multiplication
Error 2 error : identifier “start” is undefined C:\Users\kittu\Documents\Visual Studio 2010\Projects\matrix multiplication\matrix multiplication\kernel.cu 72 1 matrix multiplication
Error 3 error MSB3721: The command ““C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\bin\nvcc.exe” -gencode=arch=compute_10,code="sm_10,compute_10" --use-local-env --cl-version 2010 -ccbin “C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin” -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include” -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.0\include" -G --keep-dir “Debug” -maxrregcount=0 --machine 32 --compile -g -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler “/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd " -o “Debug\kernel.cu.obj” “C:\Users\kittu\Documents\Visual Studio 2010\Projects\matrix multiplication\matrix multiplication\kernel.cu”” exited with code 2. C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\BuildCustomizations\CUDA 5.0.targets 592 10 matrix multiplication
Also i got the result of matrix multiplication (list of 25 "250 "s), but i didn’t get the elapsed time.
Tell me where i need to change the code.
Thanks in advance.
regards,
balakrishna.