CUBLAS Level 1 and Level 2 BLAS show zero computation time. Is this correct? Assessment of the CUBLAS Level 1, 2 and 3 BLAS

I am assessing the CUBLAS Level 1, 2 and 3 BLAS routines. However, I found that the CUBLAS Level 1 and Level 2 routines take nearly 0 milliseconds of computation time (excluding communication time), irrespective of matrix and vector size. Can this be right? I am running the programs on an x64 platform. Please let me know if I am making a mistake in assessing their computation time.

Following is my source code (the main CUBLAS call is the cublasSgemm() line):

	/* Includes, system */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	/* Includes, cuda */
	#include <cuda_runtime.h>   /* for cudaThreadSynchronize() */
	#include "cublas.h"

	/* Main */
	int main(int argc, char** argv)
	{
		cublasStatus status;
		float* h_A;
		float* h_B;
		float* h_C;
		float* d_A = 0;
		float* d_B = 0;
		float* d_C = 0;
		float alpha = 1.0f;
		float beta = 0.0f;
		int n2;
		int i, size; /* Matrix size */
		clock_t time_start, time_end;
		double host_device_time, blas_time, device_host_time, total_time;

		//printf("CUBLAS SGEMM()\n");
		size = 320;
		while (size <= 4800) {
			n2 = size * size;

			/* Initialize CUBLAS */
			status = cublasInit();
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! CUBLAS initialization error\n");
				return EXIT_FAILURE;
			}

			/* Allocate host memory for the matrices */
			h_A = (float*)malloc(n2 * sizeof(h_A[0]));
			if (h_A == 0) {
				fprintf(stderr, "!!!! host memory allocation error (A)\n");
				return EXIT_FAILURE;
			}
			h_B = (float*)malloc(n2 * sizeof(h_B[0]));
			if (h_B == 0) {
				fprintf(stderr, "!!!! host memory allocation error (B)\n");
				return EXIT_FAILURE;
			}
			h_C = (float*)malloc(n2 * sizeof(h_C[0]));
			if (h_C == 0) {
				fprintf(stderr, "!!!! host memory allocation error (C)\n");
				return EXIT_FAILURE;
			}

			/* Fill the matrices with test data */
			for (i = 0; i < n2; i++) {
				h_A[i] = 1.0f;
				h_B[i] = 1.0f;
				h_C[i] = 0.0f;
			}

			/* Allocate device memory for the matrices */
			status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device memory allocation error (A)\n");
				return EXIT_FAILURE;
			}
			status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device memory allocation error (B)\n");
				return EXIT_FAILURE;
			}
			status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device memory allocation error (C)\n");
				return EXIT_FAILURE;
			}

			time_start = clock(); /* begin timing host-to-device transfers */
			/* Initialize the device matrices with the host matrices */
			status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device access error (write A)\n");
				return EXIT_FAILURE;
			}
			status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device access error (write B)\n");
				return EXIT_FAILURE;
			}
			status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device access error (write C)\n");
				return EXIT_FAILURE;
			}
			time_end = clock(); /* end timing */
			host_device_time = (double)(time_end - time_start); /* raw clock ticks */

			/* Clear last error */
			cublasGetError();

			/* Perform the operation using CUBLAS */
			time_start = clock(); /* begin timing */
			cublasSgemm('n', 'n', size, size, size, alpha, d_A, size, d_B, size, beta, d_C, size);
			cudaThreadSynchronize(); /* the launch is asynchronous; wait for the kernel to finish */
			time_end = clock(); /* end timing */
			blas_time = (double)(time_end - time_start);

			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! kernel execution error.\n");
				return EXIT_FAILURE;
			}

			time_start = clock(); /* begin timing device-to-host transfer */
			/* Read the result back into the existing host buffer */
			status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! device access error (read C)\n");
				return EXIT_FAILURE;
			}
			time_end = clock(); /* end timing */
			device_host_time = (double)(time_end - time_start);

			total_time = host_device_time + blas_time + device_host_time;

			/* Print the results (raw clock ticks; milliseconds only if CLOCKS_PER_SEC == 1000) */
			printf("\t %d, %f, %f, %f, %f \n", size, host_device_time, blas_time, device_host_time, total_time);

			/* Memory clean up */
			free(h_A);
			free(h_B);
			free(h_C);
			status = cublasFree(d_A);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! memory free error (A)\n");
				return EXIT_FAILURE;
			}
			status = cublasFree(d_B);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! memory free error (B)\n");
				return EXIT_FAILURE;
			}
			status = cublasFree(d_C);
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! memory free error (C)\n");
				return EXIT_FAILURE;
			}

			/* Shutdown */
			status = cublasShutdown();
			if (status != CUBLAS_STATUS_SUCCESS) {
				fprintf(stderr, "!!!! shutdown error\n");
				return EXIT_FAILURE;
			}

			size = size + 320;
		}

		if (argc > 1) {
			if (!strcmp(argv[1], "-noprompt") || !strcmp(argv[1], "-qatest")) {
				return EXIT_SUCCESS;
			}
		} else {
			printf("\nPress ENTER to exit...\n");
			getchar();
		}
		return EXIT_SUCCESS;
	}

Regards

Satakarni Bommuluri

You can’t use clock() for timing in the way you are trying to. Take the difference of successive gettimeofday() results to measure elapsed wall-clock time instead.
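A small helper makes that easy. This is just a sketch, assuming a POSIX system (sys/time.h), with error checking omitted:

[codebox]#include <sys/time.h>

/* Return the current wall-clock time in seconds */
double wtime(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + 1.0e-6 * (double)tv.tv_usec;
}
[/codebox]

Bracket the section you want to time with two wtime() calls (with a cudaThreadSynchronize() before the second one) and subtract.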

And isn’t “your source” just the simpleCUBLAS example from the SDK wrapped in a loop?

Hi

I think I have corrected my mistake: I wasn’t calling cudaThreadSynchronize() after the CUBLAS Level 1 and Level 2 calls.
Following is the code I am using now (which, as you point out, is not really my code! :D):

	time_start = clock(); // begin timing
	cublasSgemv('n', size, size, alpha, d_A, size, d_B, incx, beta, d_C, incy);
	cudaThreadSynchronize(); // the launch is asynchronous; wait for it to finish
	time_end = clock(); // end timing
	blas_time = (double)(time_end - time_start);

At the same time I am assessing the Intel MKL BLAS.
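For reference, I time the equivalent MKL call in the same way. This is only a simplified sketch of the kind of loop I am using, not my exact code:

	/* Simplified sketch of my MKL Level 2 timing (same clock()-based approach) */
	#include <mkl_cblas.h>

	time_start = clock(); // begin timing
	cblas_sgemv(CblasColMajor, CblasNoTrans, size, size, alpha,
	            h_A, size, h_B, 1, beta, h_C, 1);
	time_end = clock(); // end timing
	blas_time = (double)(time_end - time_start);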

I found that CUBLAS Level 3 performance >> MKL BLAS Level 3, but CUBLAS Level 1 and 2 performance is inferior to MKL BLAS Level 1 and 2.

What could be the major reasons for this? And are there any optimization techniques for CUBLAS Level 1 and Level 2 to improve their performance?

I will repeat myself. You cannot use clock() in the way you are using it to time portions of code running on the GPU. From the man page (which is straight from the ISO C90/POSIX standards):

[codebox]NAME
       clock - Determine processor time

SYNOPSIS
       #include <time.h>

       clock_t clock(void);

DESCRIPTION
       The clock() function returns an approximation of processor time used by the program.

RETURN VALUE
       The value returned is the CPU time used so far as a clock_t; to get the number of
       seconds used, divide by CLOCKS_PER_SEC.  If the processor time used is not available
       or its value cannot be represented, the function returns the value (clock_t) -1.
[/codebox]

The only reliable, objective measure you should use is wall clock time, and that should be obtained by differencing successive calls to gettimeofday(), with appropriate use of cudaThreadSynchronize() to ensure each CUBLAS call has finished before the host code runs past the end of the timed code path.
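Applied to the Sgemv measurement you posted, the pattern looks something like this (just a sketch, with unit strides assumed for incx and incy):

[codebox]#include <sys/time.h>

struct timeval tv_start, tv_end;

cudaThreadSynchronize();          /* make sure any prior GPU work has finished */
gettimeofday(&tv_start, NULL);    /* begin wall-clock timing */

cublasSgemv('n', size, size, alpha, d_A, size, d_B, 1, beta, d_C, 1);

cudaThreadSynchronize();          /* the launch is asynchronous; wait for it to finish */
gettimeofday(&tv_end, NULL);      /* end wall-clock timing */

blas_time = (double)(tv_end.tv_sec  - tv_start.tv_sec)  * 1000.0
          + (double)(tv_end.tv_usec - tv_start.tv_usec) / 1000.0;  /* milliseconds */
[/codebox]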