Hello
I am trying to understand the basics of CUDA, and I am trying to measure the time execution of my programs.
I have a behavior that I found weird (but it is probably not). Here is my code.
#include <iostream>
#include <math.h>
#include <chrono>
// Kernel computing the matrix product C = A * B
// Error-checking wrapper for CUDA runtime calls: reports file/line on failure.
// do/while(0) makes the macro a single statement, so it is safe in unbraced
// if/else bodies (a bare { ... } followed by ';' would break an else branch).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic for any non-success CUDA status code and, when
// abort == true (the default), terminates the process with that code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Matrix product C = A * B, row-major storage.
// A is dimCy x dimAx, B is dimAx x dimBx (dimBx == dimCx), C is dimCy x dimCx.
// Expected launch: 2D grid where x indexes columns of C and y indexes rows,
// with the grid rounded up to cover the whole of C.
__global__ void MatProd(float* C, const float* A, const float* B, int dimAx, int dimBx, int dimCx, int dimCy)
{
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = blockDim.x * blockIdx.x + threadIdx.x;

    // Bounds guard: the rounded-up grid spawns threads past the matrix edge.
    if (row < dimCy && col < dimCx)
    {
        // Accumulate in double for precision; the store narrows to float.
        double result = 0.0;
        for (int k = 0; k < dimAx; k++)
        {
            result += A[k + dimAx * row] * B[col + dimBx * k];
        }
        C[col + row * dimCx] = static_cast<float>(result);
    }
}
int main(void)
{
    // Times a matrix product C = A * B done in unified (managed) memory,
    // separating the kernel time from the whole-program time.
    std::chrono::high_resolution_clock::time_point programBegin = std::chrono::high_resolution_clock::now();

    // Free sizes.
    int dimAx = 100;
    int dimAy = 100000; // was pow(10, 5): pow returns double, truncation to int is fragile
    int dimBx = 2;
    // Constrained sizes (inner dimensions of the product must match).
    int dimBy = dimAx;
    int dimCx = dimBx;
    int dimCy = dimAy;

    float *d_C, *A, *B;
    size_t sizeA = sizeof(float) * dimAx * dimAy;
    size_t sizeB = sizeof(float) * dimBx * dimBy;
    size_t sizeC = sizeof(float) * dimCx * dimCy;

    // Managed memory is accessible from host and device. Pages first touched
    // on the host (by the initialisation below) must migrate to the GPU on
    // first kernel access, which is what slows the first launch when the
    // initialisation section is enabled.
    gpuErrchk(cudaMallocManaged(&A, sizeA));
    gpuErrchk(cudaMallocManaged(&B, sizeB));

    // -------------------- WHEN THE NEXT SECTION IS COMMENTED LIKE THAT, THE KERNEL EXECUTION IS WAY FASTER --------------------
    /*
    for (int i = 0; i < dimAy; i++)
    {
        for (int j = 0; j < dimAx; j++)
        {
            A[j + dimAx*i] = j + 10 * i;
        }
    }
    for (int i = 0; i < dimBy; i++)
    {
        for (int j = 0; j < dimBx; j++)
        {
            B[j + dimBx*i] = (j + 1)*pow(i, 2);
        }
    }
    */
    // --------------------END OF COMMENTED SECTION --------------------------------------------------------------------------

    // Allocating memory for the result matrix C.
    gpuErrchk(cudaMallocManaged(&d_C, sizeC));

    // Launch configuration: 32x32 blocks, grid rounded up (ceil-div) to cover C.
    int threadPerBlockx = 32;
    int threadPerBlocky = 32;
    int BlockPerGridx = 1 + (dimCx - 1) / threadPerBlockx;
    int BlockPerGridy = 1 + (dimCy - 1) / threadPerBlocky; // fix: was divided by threadPerBlockx
    dim3 BlockPerGrid(BlockPerGridx, BlockPerGridy, 1);
    dim3 ThreadPerBlock(threadPerBlockx, threadPerBlocky, 1);

    gpuErrchk(cudaDeviceSynchronize()); // drain any pending GPU work so the timer starts clean
    std::chrono::high_resolution_clock::time_point ComputationBegin = std::chrono::high_resolution_clock::now();
    MatProd<<<BlockPerGrid, ThreadPerBlock>>>(d_C, A, B, dimAx, dimBx, dimCx, dimCy);
    gpuErrchk(cudaGetLastError());      // catch launch-configuration errors (launches return nothing)
    gpuErrchk(cudaDeviceSynchronize()); // wait for the kernel to finish before stopping the clock
    std::chrono::high_resolution_clock::time_point ComputationEnd = std::chrono::high_resolution_clock::now();
    auto durationComputation = std::chrono::duration_cast<std::chrono::milliseconds>(ComputationEnd - ComputationBegin).count();

    gpuErrchk(cudaFree(A));
    gpuErrchk(cudaFree(B));
    gpuErrchk(cudaFree(d_C));
    gpuErrchk(cudaDeviceSynchronize());

    std::chrono::high_resolution_clock::time_point programEnd = std::chrono::high_resolution_clock::now();
    auto durationProgram = std::chrono::duration_cast<std::chrono::milliseconds>(programEnd - programBegin).count();

    std::cout << "Milliseconds CUDA computation : " << durationComputation << " ms" << std::endl;
    std::cout << "Total elapsed time : " << durationProgram << " ms" << std::endl;
    return 0;
}
In fact, my kernel computes in 79 ms (global execution: 484 ms) when the section from lines 63 to 79 of my code is commented out. But when it is not, the kernel lasts 276 ms (global execution: 719 ms).
But to me, the kernel execution that occurs at line 99 shouldn't be any different? Indeed, when I don't initialise them, the variables should simply hold random values. And since the initialisation part is not inside the kernel, the kernel execution time shouldn't change. It is probably more a C++ issue than a CUDA one; I am not sure.
(Remark: I measure this execution time using the C++ functions at lines 97 & 102.)
The point of my post is also to check whether I understood correctly how to measure time using CPU functions (I think I now understand how to use the CUDA event functions). Around the kernel, I synchronised the CPU and GPU, both before and after the launch. After the kernel: to make sure the CPU actually waits until the kernel has finished before measuring the time. Before the kernel: to make sure the GPU is not lagging behind before I start the measurement. Indeed, if I didn't add that first synchronisation, I would probably start measuring before the GPU is ready to run the computation kernel.
Thanks a lot.