Time Measurement for CUBLAS why time (clock()) for CUBLAS is always 0 ms for any array size?

The following is the code I am using for testing CUBLAS. However, i found the CUBLAS always take 0 ms no matter what the array size. :blink: Is it correct or is it some mistake in my code? :wacko:

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Includes, cuda */
#include “cublas.h”

/* Main /
int main(int argc, char
* argv)
{
cublasStatus status;
float* h_A;
float* h_B;
float* h_C;
float* d_A = 0;
float* d_B = 0;
float* d_C = 0;
float alpha = 1.0f;
float beta = 0.0f;
int n2;
int i, size; /* Matrix size */
clock_t time_ffsum_start, time_ffsum_end ;
double looptime_ffsum ;

size = 4000; 



	n2 = size * size;

/* Initialize CUBLAS */
printf("simpleCUBLAS test running..\n");

status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
}

/* Allocate host memory for the matrices */
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0) {
    fprintf (stderr, "!!!! host memory allocation error (A)\n");
    return EXIT_FAILURE;
}
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0) {
    fprintf (stderr, "!!!! host memory allocation error (B)\n");
    return EXIT_FAILURE;
}
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
    fprintf (stderr, "!!!! host memory allocation error ©\n");
    return EXIT_FAILURE;
}

/* Fill the matrices with test data */
for (i = 0; i < n2; i++) {
    h_A[i] = 1.0 ;
    h_B[i] = 1.0 ;
    h_C[i] = 0.0 ;
}

/* Allocate device memory for the matrices */
status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device memory allocation error (A)\n");
    return EXIT_FAILURE;
}
status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device memory allocation error (B)\n");
    return EXIT_FAILURE;
}
status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device memory allocation error ©\n");
    return EXIT_FAILURE;
}

/* Initialize the device matrices with the host matrices */
status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (write A)\n");
    return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (write B)\n");
    return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (write C)\n");
    return EXIT_FAILURE;
}

/* Clear last error */
cublasGetError();

/* Performs operation using cublas */

time_ffsum_start = clock(); // begin timing ffsum loop

cublasSgemm('n', 'n', size, size, size, alpha, d_A, size, d_B, size, beta, d_C, size);

time_ffsum_end = clock(); // end timing ffsum loop



status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
}

/* Allocate host memory for reading back the result from device memory */
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
    fprintf (stderr, "!!!! host memory allocation error ©\n");
    return EXIT_FAILURE;
}

/* Read the result back */
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (read C)\n");
    return EXIT_FAILURE;
}

/*Printing the result*/

looptime_ffsum = (double)(time_ffsum_end - time_ffsum_start);

printf("\t %f \n", looptime_ffsum );

/* for (i = 0; i < n2; i++) {
	 printf("%f \t", *(h_C + i));
 }*/




/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
  status = cublasFree(d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! memory free error (A)\n");
    return EXIT_FAILURE;
}
status = cublasFree(d_B);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! memory free error (B)\n");
    return EXIT_FAILURE;
}
status = cublasFree(d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! memory free error ©\n");
    return EXIT_FAILURE;
}

/* Shutdown */
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
}



if (argc > 1) {
    if (!strcmp(argv[1], "-noprompt") ||
        !strcmp(argv[1], "-qatest") ) 
    {
        return EXIT_SUCCESS;
    }
} 
else
{
    printf("\nPress ENTER to exit...\n");
    getchar();
}

return EXIT_SUCCESS;

}

I am not 100% sure about CUBLAS, but for CUDA kernel launches, the control is returned to your program before the kernel execution is completed on GPU. Try using cudaThreadSynchronize() before the last timing call. Also, time() is only accurate up to a second. For proper timing techniques, lookup “bandwidthTest” project in SDK:

[codebox]

    unsigned int timer = 0;

CUT_SAFE_CALL(cutCreateTimer(&timer));

CUT_SAFE_CALL(cutStartTimer(timer));

// setup execution parameters

     CUDA_CALL();

// check if kernel execution generated and error

CUT_CHECK_ERROR("Kernel execution failed");	

// stop and destroy timer

cudaThreadSynchronize();

CUT_SAFE_CALL(cutStopTimer(timer));

cout<<"\tKernel Processing time: "<<cutGetTimerValue(timer)/1000.<<" (s) \n";

CUT_SAFE_CALL(cutDeleteTimer(timer));

[/codebox]

Cheers!

Thank you Demq. :thumbup:

Yes I need to wait until the threads synchronized and finish the job.

Regards,

Satakarni Bommuluri