I am assessing the CUBLAS level -1,2and 3 BLAS. However I found the CUBLAS level 1 and level 2 BLAS takes nearly 0 milliseconds for computation (excluding communication time) irrespective of array and vector size. Is it true? I am running programs on x64 platform. Please let me know if I am doing any mistake in assessing their compuation time.
Following is my source code (Main cublas code is highlighted.):
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Includes, cuda */
#include "cublas.h"
/* Main */
int main(int argc, char** argv)
{
cublasStatus status;
float* h_A;
float* h_B;
float* h_C;
float* d_A = 0;
float* d_B = 0;
float* d_C = 0;
float alpha = 1.0f;
float beta = 0.0f;
int n2;
int i, size; /* Matrix size */
clock_t time_start, time_end ;
double host_device_time, blas_time, device_host_time, total_time ;
//printf("CUBLAS SGEMM()\n");
size = 320;
while(size <= 4800){
n2 = size * size;
/* Initialize CUBLAS */
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
/* Allocate host memory for the matrices */
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0) {
fprintf (stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0) {
fprintf (stderr, "!!!! host memory allocation error (\n");
return EXIT_FAILURE;
}
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
fprintf (stderr, "!!!! host memory allocation error ©\n");
return EXIT_FAILURE;
}
/* Fill the matrices with test data */
for (i = 0; i < n2; i++) {
h_A[i] = 1.0 ;
h_B[i] = 1.0 ;
h_C[i] = 0.0 ;
}
/* Allocate device memory for the matrices */
status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (A)\n");
return EXIT_FAILURE;
}
status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error (\n");
return EXIT_FAILURE;
}
status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device memory allocation error ©\n");
return EXIT_FAILURE;
}
time_start = clock(); // begin timing
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write \n");
return EXIT_FAILURE;
}
status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
time_end = clock(); // end timing
host_device_time = (double)(time_end - time_start);
/* Clear last error */
cublasGetError();
/* Performs operation using cublas */
time_start = clock(); // begin timing
cublasSgemm('n', 'n', size, size, size, alpha, d_A, size, d_B, size, beta, d_C, size);
cudaThreadSynchronize();
time_end = clock(); // end timing
blas_time = (double)(time_end - time_start);
status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Allocate host memory for reading back the result from device memory */
h_C = (float*)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0) {
fprintf (stderr, "!!!! host memory allocation error ©\n");
return EXIT_FAILURE;
}
time_start = clock();
/* Read the result back */
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
time_end = clock(); // end timing
device_host_time = (double)(time_end - time_start);
total_time = host_device_time + blas_time + device_host_time;
/*Printing the result*/
printf("\t %d, %f, %f, %f, %f \n", size, host_device_time, blas_time, device_host_time, total_time );
/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
status = cublasFree(d_A);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
status = cublasFree(d_B);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! memory free error (\n");
return EXIT_FAILURE;
}
status = cublasFree(d_C);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! memory free error ©\n");
return EXIT_FAILURE;
}
/* Shutdown */
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
size = size + 320;
}
if (argc > 1) {
if (!strcmp(argv[1], "-noprompt") ||
!strcmp(argv[1], "-qatest") )
{
return EXIT_SUCCESS;
}
}
else
{
printf("\nPress ENTER to exit...\n");
getchar();
}
return EXIT_SUCCESS;
}
Regards
Satakarni Bommuluri