I’ve been trying to get familiar with using CUDA and cuBlas for my application, so I coded some timing into a test code and found that the time it takes to execute cudaFree() depends significantly on the number of calls to the cublasDgemm() routine. The relevant sections of my code are:
//... allocation and initialization of host memory is omitted ...
cudaMallocPitch((void **)&d_A, &pchSzA, msz, p);
cudaMallocPitch((void **)&d_B, &pchSzB, psz, n);
cudaMallocPitch((void **)&d_C, &pchSzC, msz, n);
pchNumA=pchSzA/sizeof(double);
pchNumB=pchSzB/sizeof(double);
pchNumC=pchSzC/sizeof(double);
cudaMemcpy2D(d_A,pchSzA,A,msz,msz,p,cudaMemcpyHostToDevice);
cudaMemcpy2D(d_B,pchSzB,B,psz,psz,n,cudaMemcpyHostToDevice);
for (r = 0; r < LOOP_COUNT; r++) {
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, p, &alpha, d_A, pchNumA, d_B, pchNumB, &beta, d_C, pchNumC);
}
s_initial = std::chrono::high_resolution_clock::now();
if (cudaFree(d_A) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (A): %d",cudaGetLastError());
}
s_final = std::chrono::high_resolution_clock::now();
s_elapsed = (double)std::chrono::duration_cast<std::chrono::milliseconds>
(s_final-s_initial).count();
printf (" == d_A freed, elapsed time is %.5f milliseconds == ", (s_elapsed));
s_initial = dsecnd();
And similar cudaFree() calls for B & C. (I included additional error trapping, but I left it out here to condense the code.)
I get these results:
LOOP_COUNT=1: cudaFree time = 38 milliseconds
LOOP_COUNT=100: cudaFree time = 1864 milliseconds
LOOP_COUNT=10000: cudaFree time = 106580 milliseconds
Has anyone else seen this behavior?
I am using CUDA 5.0 with an NVS 4200M chip.