Help Matrix Multiplication using cuBLAS

Hi,

I am using cuBLAS for the first time. I am trying to compile the SDK example (given at the end of this post), which I have saved in a file named cublasExample.c. I am having trouble compiling the code successfully.

1- How do I compile this? Shall I use gcc cublasExample.c -o output ?

2- When I used gcc, it gave lots of errors. Using NVCC gave me the following output:

/tmp/tmpxft_0000163a_00000000-1_cublasExample.o: In function `main':

cublasExample.c:(.text+0x10d): undefined reference to `cublasInit'

cublasExample.c:(.text+0x2b4): undefined reference to `cublasAlloc'

cublasExample.c:(.text+0x308): undefined reference to `cublasAlloc'

cublasExample.c:(.text+0x35c): undefined reference to `cublasAlloc'

cublasExample.c:(.text+0x3c7): undefined reference to `cublasSetVector'

cublasExample.c:(.text+0x432): undefined reference to `cublasSetVector'

cublasExample.c:(.text+0x49d): undefined reference to `cublasSetVector'

cublasExample.c:(.text+0x511): undefined reference to `cublasGetError'

cublasExample.c:(.text+0x578): undefined reference to `cublasSgemm'

cublasExample.c:(.text+0x57d): undefined reference to `cublasGetError'

cublasExample.c:(.text+0x630): undefined reference to `cublasGetVector'

cublasExample.c:(.text+0x7c4): undefined reference to `cublasFree'

cublasExample.c:(.text+0x809): undefined reference to `cublasFree'

cublasExample.c:(.text+0x84e): undefined reference to `cublasFree'

cublasExample.c:(.text+0x88d): undefined reference to `cublasShutdown'

collect2: ld returned 1 exit status

Please help me in solving this issue.

cublasExample.c which I am trying to compile

/* Includes, system */

#include <math.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Includes, cuda */

#include "cublas.h"

/* Matrix size */

#define N  (275)

/* Host implementation of a simple version of sgemm */

/*
 * Reference (host) SGEMM on square n x n matrices.
 *
 * For every pair (row, col) this stores
 *     C[col * n + row] = alpha * sum_k(A[k * n + row] * B[col * n + k])
 *                        + beta * C[col * n + row]
 * i.e. C = alpha * A * B + beta * C when the three arrays are read as
 * column-major matrices.
 *
 * n      matrix dimension; nothing is touched when n <= 0
 * alpha  scale factor applied to the product
 * A, B   input matrices, n * n floats each
 * beta   scale factor applied to the previous contents of C
 * C      input/output matrix, n * n floats, updated in place
 */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
						 float beta, float *C)
{
	int row;
	int col;
	int inner;

	for (row = 0; row < n; ++row) {
		for (col = 0; col < n; ++col) {
			const float *b_column = B + col * n;	/* start of column `col` of B */
			float *out = C + col * n + row;	/* destination element (row, col) */
			float dot = 0;

			/* Dot product of row `row` of A with column `col` of B. */
			for (inner = 0; inner < n; ++inner) {
				dot += A[inner * n + row] * b_column[inner];
			}

			*out = alpha * dot + beta * *out;
		}
	}
}

/* Main */

/*
 * Multiplies two random N x N matrices once on the host (simple_sgemm) and
 * once through cuBLAS (cublasSgemm), then compares the two results.
 *
 * The comparison is a relative error: the Euclidean norm of the element-wise
 * difference divided by the Euclidean norm of the host reference.  "PASSED"
 * is printed when that ratio is below 1e-6, "FAILED" otherwise.
 *
 * Returns EXIT_FAILURE if any allocation, transfer, cuBLAS call or release
 * reports an error (or the reference norm is ~0), EXIT_SUCCESS otherwise.
 * Note that a "FAILED" comparison still returns EXIT_SUCCESS.
 *
 * Every exit goes through the `cleanup` label so that host buffers, device
 * buffers and the cuBLAS context are released on error paths as well.
 *
 * When run without arguments the program waits for ENTER before exiting;
 * passing any argument (e.g. -noprompt or -qatest) suppresses the prompt.
 */
int main(int argc, char** argv)
{
	cublasStatus status;
	float* h_A = NULL;
	float* h_B = NULL;
	float* h_C = NULL;
	float* h_C_ref = NULL;
	float* d_A = 0;
	float* d_B = 0;
	float* d_C = 0;
	float alpha = 1.0f;
	float beta = 0.0f;
	int n2 = N * N;
	int i;
	float error_norm;
	float ref_norm;
	float diff;
	int cublas_ready = 0;		/* set once cublasInit() has succeeded */
	int exit_code = EXIT_FAILURE;	/* flipped to success only at the very end */

	(void)argv;	/* only the argument count matters, see prompt logic below */

	/* Initialize CUBLAS */
	printf("simpleCUBLAS test running..\n");
	status = cublasInit();
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! CUBLAS initialization error\n");
		goto cleanup;
	}
	cublas_ready = 1;

	/* Allocate host memory for the matrices */
	h_A = (float*)malloc(n2 * sizeof(h_A[0]));
	if (h_A == 0) {
		fprintf (stderr, "!!!! host memory allocation error (A)\n");
		goto cleanup;
	}
	h_B = (float*)malloc(n2 * sizeof(h_B[0]));
	if (h_B == 0) {
		fprintf (stderr, "!!!! host memory allocation error (B)\n");
		goto cleanup;
	}
	h_C = (float*)malloc(n2 * sizeof(h_C[0]));
	if (h_C == 0) {
		fprintf (stderr, "!!!! host memory allocation error (C)\n");
		goto cleanup;
	}

	/* Fill the matrices with test data */
	for (i = 0; i < n2; i++) {
		h_A[i] = rand() / (float)RAND_MAX;
		h_B[i] = rand() / (float)RAND_MAX;
		h_C[i] = rand() / (float)RAND_MAX;
	}

	/* Allocate device memory for the matrices */
	status = cublasAlloc(n2, sizeof(d_A[0]), (void**)&d_A);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device memory allocation error (A)\n");
		goto cleanup;
	}
	status = cublasAlloc(n2, sizeof(d_B[0]), (void**)&d_B);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device memory allocation error (B)\n");
		goto cleanup;
	}
	status = cublasAlloc(n2, sizeof(d_C[0]), (void**)&d_C);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device memory allocation error (C)\n");
		goto cleanup;
	}

	/* Initialize the device matrices with the host matrices */
	status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (write A)\n");
		goto cleanup;
	}
	status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (write B)\n");
		goto cleanup;
	}
	status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (write C)\n");
		goto cleanup;
	}

	/* Performs operation using plain C code (h_C is updated in place) */
	simple_sgemm(N, alpha, h_A, h_B, beta, h_C);

	/*
	 * The host result becomes the reference.  Ownership of the buffer moves
	 * to h_C_ref; h_C is cleared so the cleanup code can never free the same
	 * buffer twice.
	 */
	h_C_ref = h_C;
	h_C = NULL;

	/* Clear last error */
	cublasGetError();

	/* Performs operation using cublas */
	cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);
	status = cublasGetError();
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! kernel execution error.\n");
		goto cleanup;
	}

	/* Allocate host memory for reading back the result from device memory */
	h_C = (float*)malloc(n2 * sizeof(h_C[0]));
	if (h_C == 0) {
		fprintf (stderr, "!!!! host memory allocation error (C)\n");
		goto cleanup;
	}

	/* Read the result back */
	status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (read C)\n");
		goto cleanup;
	}

	/* Check result against reference: ||ref - gpu|| / ||ref|| */
	error_norm = 0;
	ref_norm = 0;
	for (i = 0; i < n2; ++i) {
		diff = h_C_ref[i] - h_C[i];
		error_norm += diff * diff;
		ref_norm += h_C_ref[i] * h_C_ref[i];
	}
	error_norm = sqrtf(error_norm);
	ref_norm = sqrtf(ref_norm);
	/* Guard the division below against a (near) zero reference norm. */
	if (fabsf(ref_norm) < 1e-7) {
		fprintf (stderr, "!!!! reference norm is 0\n");
		goto cleanup;
	}
	printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");

	exit_code = EXIT_SUCCESS;

cleanup:
	/* Host memory clean up (free(NULL) is a no-op) */
	free(h_A);
	free(h_B);
	free(h_C);
	free(h_C_ref);

	/* Device memory clean up; a failed release turns the run into a failure */
	if (d_A != 0) {
		status = cublasFree(d_A);
		if (status != CUBLAS_STATUS_SUCCESS) {
			fprintf (stderr, "!!!! memory free error (A)\n");
			exit_code = EXIT_FAILURE;
		}
	}
	if (d_B != 0) {
		status = cublasFree(d_B);
		if (status != CUBLAS_STATUS_SUCCESS) {
			fprintf (stderr, "!!!! memory free error (B)\n");
			exit_code = EXIT_FAILURE;
		}
	}
	if (d_C != 0) {
		status = cublasFree(d_C);
		if (status != CUBLAS_STATUS_SUCCESS) {
			fprintf (stderr, "!!!! memory free error (C)\n");
			exit_code = EXIT_FAILURE;
		}
	}

	/* Shutdown (only if initialization succeeded) */
	if (cublas_ready) {
		status = cublasShutdown();
		if (status != CUBLAS_STATUS_SUCCESS) {
			fprintf (stderr, "!!!! shutdown error (A)\n");
			exit_code = EXIT_FAILURE;
		}
	}

	/* Interactive prompt: only after a clean run started without arguments. */
	if (exit_code == EXIT_SUCCESS && argc <= 1) {
		printf("\nPress ENTER to exit...\n");
		getchar();
	}

	return exit_code;
}

Those errors are from the linker, not the compiler. You aren’t linking the cublas library. Add -lcublas as an argument to gcc.

Many thanks for your reply. I did this:

[root@Kiran ~]# gcc -lcublas cublasExample.c

And the output is:

cublasExample.c:6:20: error: cublas.h: No such file or directory

cublasExample.c: In function ‘main’:

cublasExample.c:32: error: ‘cublasStatus’ undeclared (first use in this function)

cublasExample.c:32: error: (Each undeclared identifier is reported only once

cublasExample.c:32: error: for each function it appears in.)

cublasExample.c:32: error: expected ‘;’ before ‘status’

cublasExample.c:51: error: ‘status’ undeclared (first use in this function)

cublasExample.c:52: error: ‘CUBLAS_STATUS_SUCCESS’ undeclared (first use in this function)

cublasExample.c:152: warning: incompatible implicit declaration of built-in function ‘sqrt’

cublasExample.c:154: warning: incompatible implicit declaration of built-in function ‘fabs’

External Media Still errors!

Using gcc like that won’t ever work. If you want to be that simplistic, use nvcc and add -lcublas.

Thanks once again. I successfully executed the program! :-)

OK, so now go back and make it work for gcc. Ask yourself the following three questions:

  1. How would I tell the compiler where to find cublas.h?
  2. How would I tell the linker to link in libcublas.so?
  3. How would I tell the linker where to find libcublas.so?

ya definitely I will try it. I was on vacations for the last two days :-)

By the way, right now I am having difficulty understanding the following section of the above code:

/* Check result against reference */

	error_norm = 0;

	ref_norm = 0;

	for (i = 0; i < n2; ++i) {

		diff = h_C_ref[i] - h_C[i];

		error_norm += diff * diff;

		ref_norm += h_C_ref[i] * h_C_ref[i];

	}

	error_norm = (float)sqrt((double)error_norm);

	ref_norm = (float)sqrt((double)ref_norm);

	if (fabs(ref_norm) < 1e-7) {

		fprintf (stderr, "!!!! reference norm is 0\n");

		return EXIT_FAILURE;

	}

	printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");

I am sure it's some kind of test to check how accurate my result is, but unfortunately nothing more than that is clear to me.

Any pointers here please…?

h_C_ref is an array that stores the values of your output matrix from the CPU. On the other hand, h_C is an array that stores the values of your output matrix from the GPU calculation (specifically, copied back from your device pointer). The routine calculates the relative error between the two (the Euclidean norm of their element-wise difference divided by the Euclidean norm of the CPU reference) and prints PASSED when that ratio is below 1e-6, to ensure that the two output matrices are close to one another.

I am going to find the answers to the three questions above. But before that, I would like to know what the function of “-lcublas” is. (I was able to compile and run using nvcc -lcublas name.cpp -o outputfile)

It is the answer to question 2.

The code over here just checks whether the result returned by the cublas is accurate or not.

The thing is that in their code they must have used some special functions for multiplications or other tasks like __mul24()

That is why they are performing a check of how correct their result is…