Time measurement for CUBLAS: why is the time measured with clock() always 0 ms for CUBLAS, for any array size?

satakarni · March 21, 2009, 9:45am

The following is the code I am using to test CUBLAS. However, I found that the CUBLAS call always takes 0 ms, no matter what the array size is. :blink: Is that correct, or is there a mistake in my code? :wacko:

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Includes, cuda */
#include <cuda_runtime.h>
#include "cublas.h"

/* Main /
int main(int argc, char* argv)
{
cublasStatus status;
float* h_A;
float* h_B;
float* h_C;
float* d_A = 0;
float* d_B = 0;
float* d_C = 0;
float alpha = 1.0f;
float beta = 0.0f;
int n2;
int i, size; /* Matrix size */
clock_t time_ffsum_start, time_ffsum_end ;
double looptime_ffsum ;

size = 4000; 



	n2 = size * size;

/* Initialize CUBLAS */
printf("simpleCUBLAS test running..\n");

status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! CUBLAS initialization error\n");
    return EXIT_FAILURE;
}

/* Allocate host memory for the matrices */
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0) {
    fprintf (stderr, "!!!! host memory allocation error (A)\n");
    return EXIT_FAILURE;
}
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0) {
    fprintf (stderr, "!!!! host memory allocation error (B)\n");
    return EXIT_FAILURE;
}
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
    fprintf (stderr, "!!!! host memory allocation error ©\n");
    return EXIT_FAILURE;
}

/* Fill the matrices with test data */
for (i = 0; i < n2; i++) {
    h_A[i] = 1.0 ;
    h_B[i] = 1.0 ;
    h_C[i] = 0.0 ;
}

/* Allocate device memory for the matrices */
status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device memory allocation error (A)\n");
    return EXIT_FAILURE;
}
status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device memory allocation error (B)\n");
    return EXIT_FAILURE;
}
status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device memory allocation error ©\n");
    return EXIT_FAILURE;
}

/* Initialize the device matrices with the host matrices */
status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (write A)\n");
    return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (write B)\n");
    return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (write C)\n");
    return EXIT_FAILURE;
}

/* Clear last error */
cublasGetError();

/* Performs operation using cublas */

time_ffsum_start = clock(); // begin timing ffsum loop

cublasSgemm('n', 'n', size, size, size, alpha, d_A, size, d_B, size, beta, d_C, size);

time_ffsum_end = clock(); // end timing ffsum loop



status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! kernel execution error.\n");
    return EXIT_FAILURE;
}

/* Allocate host memory for reading back the result from device memory */
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
    fprintf (stderr, "!!!! host memory allocation error ©\n");
    return EXIT_FAILURE;
}

/* Read the result back */
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! device access error (read C)\n");
    return EXIT_FAILURE;
}

/*Printing the result*/

looptime_ffsum = (double)(time_ffsum_end - time_ffsum_start);

printf("\t %f \n", looptime_ffsum );

/* for (i = 0; i < n2; i++) {
	 printf("%f \t", *(h_C + i));
 }*/




/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
  status = cublasFree(d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! memory free error (A)\n");
    return EXIT_FAILURE;
}
status = cublasFree(d_B);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! memory free error (B)\n");
    return EXIT_FAILURE;
}
status = cublasFree(d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! memory free error ©\n");
    return EXIT_FAILURE;
}

/* Shutdown */
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! shutdown error (A)\n");
    return EXIT_FAILURE;
}



if (argc > 1) {
    if (!strcmp(argv[1], "-noprompt") ||
        !strcmp(argv[1], "-qatest") ) 
    {
        return EXIT_SUCCESS;
    }
} 
else
{
    printf("\nPress ENTER to exit...\n");
    getchar();
}

return EXIT_SUCCESS;

}

Demq · March 21, 2009, 3:38pm

I am not 100% sure about CUBLAS, but for CUDA kernel launches, control is returned to your program before the kernel has finished executing on the GPU. Try calling cudaThreadSynchronize() before the last timing call. Also, time() only has a resolution of one second. For proper timing techniques, look up the “bandwidthTest” project in the SDK:

[codebox]

    // Timing pattern built on the SDK's cutil timer helpers: start a timer,
    // issue the GPU work, wait for it to finish, then stop and read the timer.
    unsigned int timer = 0;

CUT_SAFE_CALL(cutCreateTimer(&timer));

CUT_SAFE_CALL(cutStartTimer(timer));

// setup execution parameters
// (CUDA_CALL() stands in for the work being timed - presumably the kernel
// launch or library call; replace with the real call)

     CUDA_CALL();

// check if kernel execution generated an error

CUT_CHECK_ERROR("Kernel execution failed");	

// stop and destroy timer
// Synchronize first: a launch returns to the host before the GPU work has
// completed, so stopping the timer without waiting would miss the kernel time.

cudaThreadSynchronize();

CUT_SAFE_CALL(cutStopTimer(timer));

// The timer value is divided by 1000 and labelled as seconds
// (presumably cutGetTimerValue reports milliseconds - confirm against cutil).
cout<<"\tKernel Processing time: "<<cutGetTimerValue(timer)/1000.<<" (s) \n";

CUT_SAFE_CALL(cutDeleteTimer(timer));

[/codebox]

Cheers!

satakarni · March 21, 2009, 6:21pm

Thank you Demq. External Media

Yes, I need to wait until the threads have synchronized and finished the job.

Regards,

Satakarni Bommuluri

I am not 100% sure about CUBLAS, but for CUDA kernel launches, control is returned to your program before the kernel has finished executing on the GPU. Try calling cudaThreadSynchronize() before the last timing call. Also, time() only has a resolution of one second. For proper timing techniques, look up the “bandwidthTest” project in the SDK:

[codebox]
    // Timing pattern built on the SDK's cutil timer helpers: start a timer,
    // issue the GPU work, wait for it to finish, then stop and read the timer.
    unsigned int timer = 0;

CUT_SAFE_CALL(cutCreateTimer(&timer));

CUT_SAFE_CALL(cutStartTimer(timer));

// setup execution parameters
// (CUDA_CALL() stands in for the work being timed - presumably the kernel
// launch or library call; replace with the real call)

     CUDA_CALL();

// check if kernel execution generated an error

CUT_CHECK_ERROR("Kernel execution failed");	

// stop and destroy timer
// Synchronize first: a launch returns to the host before the GPU work has
// completed, so stopping the timer without waiting would miss the kernel time.

cudaThreadSynchronize();

CUT_SAFE_CALL(cutStopTimer(timer));

// The timer value is divided by 1000 and labelled as seconds
// (presumably cutGetTimerValue reports milliseconds - confirm against cutil).
cout<<"\tKernel Processing time: "<<cutGetTimerValue(timer)/1000.<<" (s) \n";

CUT_SAFE_CALL(cutDeleteTimer(timer));
[/codebox]

Cheers!

Topic		Replies	Views
CUBLAS Level 1 and Level 2 BLAS has 0 computaional time. Is it correct? Assesment of the CUBLAS leve CUDA Programming and Performance	3	3716	April 24, 2009
Help with CUBLAS performance and timing issues, please help... CUDA Programming and Performance	1	3475	December 26, 2008
CUBLAS timing CUDA Programming and Performance	0	10838	July 20, 2010
Evaluate cycle execution time Newbie question CUDA Programming and Performance	1	2186	July 13, 2007
What's the overhead to call CUBLAS APIs? In my applicastion it is 9 ms..... CUDA Programming and Performance	3	4982	October 31, 2007
CUBLAS question cublasGetVector() call CUDA Programming and Performance	3	5646	November 19, 2009
Faster MatrixMult than CUBLAS! CUDA Programming and Performance	4	2839	September 4, 2009
How to time cublas functions? cublasSgemv V.S nested loops CUDA Programming and Performance	4	1352	June 26, 2009
Varying Execution time CUDA Programming and Performance	2	1104	June 10, 2010
cudaThreadSynchronize() with cublas figuring out the bottleneck of cublas matrix multipl. CUDA Programming and Performance	1	4418	October 7, 2009

Time measurement for CUBLAS: why is the time measured with clock() always 0 ms for CUBLAS, for any array size?

Related topics