Here it is.
[codebox] for (j = 0; j < total_iter; j++) {
    /*
     * One benchmark iteration: initialise CUBLAS, build two random host
     * matrices, upload them, run cublasSgemm, download the result, and print
     * the time taken by each phase ("total 1" .. "total 5").
     *
     * Timing notes:
     *  - clock() values are kept as clock_t and only the *difference* is
     *    converted to double. Storing the raw tick count in a float drops
     *    low-order ticks once the process has accumulated enough CPU time,
     *    which distorts the sub-millisecond phases.
     *  - clock() reports processor time used by the program, not wall-clock
     *    time.
     *
     * On any failure the iteration's resources are released and the
     * enclosing function returns EXIT_FAILURE.
     */
    clock_t t_start, t_stop;
    int ok = 0;

    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Start from a known state so the cleanup below is always safe. */
    h_A = 0;
    h_B = 0;
    h_C = 0;
    d_A = 0;
    d_B = 0;
    d_C = 0;

    do {
        /* Allocate host memory for the input matrices.  h_C is allocated
         * later, inside its own timed section; allocating it here as well
         * would leak one buffer per iteration. */
        h_A = (float*) malloc(n2 * sizeof(h_A[0]));
        h_B = (float*) malloc(n2 * sizeof(h_B[0]));
        if (!h_A || !h_B)
        {
            fprintf(stderr, "!!! host memory allocation error (A, B)\n");
            break;
        }

        /* Fill the matrices with test data */
        for (i = 0; i < n2; i++)
        {
            h_A[i] = rand() / (float)RAND_MAX;
            h_B[i] = rand() / (float)RAND_MAX;
        }

        /* total 1: allocate device memory for the matrices */
        t_start = clock();
        status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
        if (status == CUBLAS_STATUS_SUCCESS)
            status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
        if (status == CUBLAS_STATUS_SUCCESS)
            status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
        cudaThreadSynchronize();
        t_stop = clock();
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!! device memory allocation error\n");
            break;
        }
        printf("total 1 = %lf\n", (double)(t_stop - t_start) / CLOCKS_PER_SEC);

        /* total 2: copy A and B from host to device */
        t_start = clock();
        status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
        if (status == CUBLAS_STATUS_SUCCESS)
            status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
        cudaThreadSynchronize();
        t_stop = clock();
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!! device access error (write A, B)\n");
            break;
        }
        printf("total 2 = %lf\n", (double)(t_stop - t_start) / CLOCKS_PER_SEC);

        /* total 3: matrix multiplication using sgemm.
         * NOTE(review): the call describes A as M x N (lda = M), B as N x M
         * (ldb = N) and C as M x M (ldc = M), while every buffer holds n2
         * elements -- confirm that n2 >= M * N and n2 >= M * M.
         * The legacy cublasSgemm returns void; its status is fetched with
         * cublasGetError() once the device has finished. */
        t_start = clock();
        cublasSgemm('n', 'n', M, M, N, alpha, d_A, M, d_B, N, beta, d_C, M);
        cudaThreadSynchronize();
        t_stop = clock();
        status = cublasGetError();
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!! kernel execution error\n");
            break;
        }
        printf("total 3 = %lf\n", (double)(t_stop - t_start) / CLOCKS_PER_SEC);

        /* total 4: allocate host memory for reading back the result */
        t_start = clock();
        h_C = (float*) malloc(n2 * sizeof(h_C[0]));
        cudaThreadSynchronize();
        t_stop = clock();
        if (!h_C)
        {
            fprintf(stderr, "!!! host memory allocation error (C)\n");
            break;
        }
        printf("total 4 = %lf\n", (double)(t_stop - t_start) / CLOCKS_PER_SEC);

        /* total 5: read the result back */
        t_start = clock();
        status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
        cudaThreadSynchronize();
        t_stop = clock();
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!! device access error (read C)\n");
            break;
        }
        printf("total 5 = %lf\n", (double)(t_stop - t_start) / CLOCKS_PER_SEC);

        ok = 1;
    } while (0);

    /* memory cleanup -- runs on success and on failure alike */
    free(h_A);
    free(h_B);
    free(h_C);
    if (d_A) cublasFree(d_A);
    if (d_B) cublasFree(d_B);
    if (d_C) cublasFree(d_C);

    /* Balance the cublasInit() at the top of this iteration. */
    cublasShutdown();

    if (!ok)
        return EXIT_FAILURE;
}
[/codebox]
Notice the difference in the "total 3" time (the cublasSgemm call) between the first iteration and the later ones:
total 1 = 0.000400
total 2 = 0.058846
total 3 = 0.542948
total 4 = 0.000026
total 5 = 0.019047
total 1 = 0.000381
total 2 = 0.058186
total 3 = 0.398009
total 4 = 0.000018
total 5 = 0.019063
total 1 = 0.000344
total 2 = 0.059284
total 3 = 0.398278
total 4 = 0.000017
total 5 = 0.019045
total 1 = 0.000348
total 2 = 0.058342
total 3 = 0.395848
total 4 = 0.000015
total 5 = 0.019466