Here it is.

[codebox] for (j = 0; j < total_iter; j++) {
    /*
     * One benchmark iteration. Each iteration initialises CUBLAS, builds two
     * random n2-element host matrices, and prints the wall time of five
     * phases:
     *   total 1 - device allocation of A, B and C
     *   total 2 - host-to-device copy of A and B
     *   total 3 - the sgemm call itself
     *   total 4 - host allocation of the result buffer
     *   total 5 - device-to-host copy of C
     *
     * Timestamps are kept as clock_t and only the difference is converted to
     * double: storing clock() in a float drops low-order ticks once the
     * process has been running for a while, which distorts the short
     * intervals measured here.
     *
     * Any failure prints a message to stderr and returns EXIT_FAILURE, in the
     * same way as the initialisation check.
     *
     * NOTE(review): cublasInit() runs on every iteration and no matching
     * cublasShutdown() is visible in this fragment - confirm it is called
     * after the loop.
     */
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the input matrices.
     * h_C is allocated later, inside timed phase 4; allocating it here as
     * well leaked one buffer per iteration. */
    h_A = (float*) malloc(n2 * sizeof(h_A[0]));
    h_B = (float*) malloc(n2 * sizeof(h_B[0]));
    if (h_A == NULL || h_B == NULL) {
        fprintf(stderr, "!!! host memory allocation error (A or B)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data in [0, 1] */
    for (i = 0; i < n2; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    /* Phase 1: allocate device memory for the matrices */
    clock_t p1 = clock();
    status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
    if (status == CUBLAS_STATUS_SUCCESS)
        status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
    if (status == CUBLAS_STATUS_SUCCESS)
        status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
    cudaThreadSynchronize();
    clock_t p2 = clock();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!! device memory allocation error\n");
        return EXIT_FAILURE;
    }
    printf("total 1 = %lf\n", (double)(p2 - p1) / CLOCKS_PER_SEC);

    /* Phase 2: copy A and B from host to device */
    clock_t p3 = clock();
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status == CUBLAS_STATUS_SUCCESS)
        status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    cudaThreadSynchronize();
    clock_t p4 = clock();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!! device access error (write A or B)\n");
        return EXIT_FAILURE;
    }
    printf("total 2 = %lf\n", (double)(p4 - p3) / CLOCKS_PER_SEC);

    /* Phase 3: perform matrix multiplication using sgemm.
     * NOTE(review): d_C is only allocated, never written, before this call;
     * if beta is non-zero the result includes uninitialised device memory -
     * confirm beta == 0 or upload C first.
     * NOTE(review): all three buffers hold n2 elements while the call uses
     * M and N as dimensions - confirm n2 covers M*N and M*M. */
    clock_t p5 = clock();
    cublasSgemm('n', 'n', M, M, N, alpha, d_A, M, d_B, N, beta, d_C, M);
    /* The sync makes the timing cover the multiplication itself. */
    cudaThreadSynchronize();
    clock_t p6 = clock();
    /* The legacy sgemm entry point returns void; errors are fetched here. */
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!! kernel execution error\n");
        return EXIT_FAILURE;
    }
    printf("total 3 = %lf\n", (double)(p6 - p5) / CLOCKS_PER_SEC);

    /* Phase 4: allocate host memory for reading back the result */
    clock_t p7 = clock();
    h_C = (float*) malloc(n2 * sizeof(h_C[0]));
    /* No GPU work is issued in this phase; the sync is kept so the timed
     * region has the same shape as the other phases. */
    cudaThreadSynchronize();
    clock_t p8 = clock();
    if (h_C == NULL) {
        fprintf(stderr, "!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
    printf("total 4 = %lf\n", (double)(p8 - p7) / CLOCKS_PER_SEC);

    /* Phase 5: read the result back */
    clock_t p9 = clock();
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    cudaThreadSynchronize();
    clock_t p10 = clock();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }
    printf("total 5 = %lf\n", (double)(p10 - p9) / CLOCKS_PER_SEC);

    /* Memory cleanup: everything allocated in this iteration is released */
    free(h_A);
    free(h_B);
    free(h_C);
    cublasFree(d_A);
    cublasFree(d_B);
    cublasFree(d_C);
}

[/codebox]

Notice the difference in the "total 3" time (the sgemm call): the first iteration is noticeably slower than the following ones.

total 1 = 0.000400

total 2 = 0.058846

total 3 = 0.542948

total 4 = 0.000026

total 5 = 0.019047

total 1 = 0.000381

total 2 = 0.058186

total 3 = 0.398009

total 4 = 0.000018

total 5 = 0.019063

total 1 = 0.000344

total 2 = 0.059284

total 3 = 0.398278

total 4 = 0.000017

total 5 = 0.019045

total 1 = 0.000348

total 2 = 0.058342

total 3 = 0.395848

total 4 = 0.000015

total 5 = 0.019466