cublasAlloc fails even though there is enough memory

Hi everyone,

I’ve just started working with CUBLAS, so there might be something simple that I am missing.

The app I have written allocates 3 matrices (A, B and C) and uses them with the cublasSgemm function to multiply A and B and store the result in C. My matrix dimensions are 2048*2048.

cublasAlloc succeeds in allocating A and B, but fails with error code 3 (CUBLAS_STATUS_ALLOC_FAILED) when trying to allocate C, indicating that there’s insufficient memory. But that can’t be right because I have 256MB on my GPU (GeForce 8600M GT). These three matrices should add up to only 48MB (3 × 2048 × 2048 elements × 4 bytes per single-precision float). Furthermore, another application I wrote using the CUDA runtime functions successfully allocates three 2048*2048 arrays (my own attempt at matrix multiplication, before I found CUBLAS).

Can someone please point out why cublasAlloc is failing?

My laptop’s specs:

Windows Vista Business 32 bit.

GeForce 8600M GT with 256MB memory.

CUDA toolkit and SDK version 2.3.

CUDA notebook driver version 195.62.

Below is the code of my CUBLAS test application.

Thanks in advance for any help.

// CUBLASTest_MatrixMatrixMultiply.cpp : Defines the entry point for the console application.

//

#include "stdafx.h"

#include <stdlib.h>

#include <string.h>

#include <iostream>

#include "cublas.h"

#include "cuda.h"

#include "cutil.h"

using namespace std;

#define N 2048 //NxN element matrices.

//#define TEST_DOUBLE

// Keeps the console window open until the user presses a key, then yields the
// failure exit code so callers can write `return pauseAndFail();`.
static int pauseAndFail()
{
	getchar();
	return EXIT_FAILURE;
}

// Timing test for a CUBLAS N x N matrix multiplication (C = A * B).
//
// Steps: create a CUDA context on device 0, initialise CUBLAS, fill A and B
// with ones, upload A, B and C, run GEMM 100 times, read C back and print the
// average time per multiplication together with C[0].
//
// Define TEST_DOUBLE to run the double-precision variant (cublasDgemm)
// instead of the single-precision one (cublasSgemm).
//
// Returns EXIT_SUCCESS when every step succeeds and EXIT_FAILURE otherwise.
// Failure paths report the error, wait for a key press and return immediately
// without releasing what has been acquired so far.
int _tmain(int argc, char** argv)
{
	// One element type for both host and device buffers, selected at compile time.
#ifdef TEST_DOUBLE
	typedef double real;
#else
	typedef float real;
#endif

	real* A = 0;
	real* B = 0;
	real* C = 0;
	real* d_A = 0;
	real* d_B = 0;
	real* d_C = 0;

	CUdevice device;
	CUcontext context;
	const int n2 = N*N;	// number of elements per matrix
	cublasStatus status;

	cout << "CUBLAS large matrix multiplication timing test." << endl;
	cout << "Binding to first available CUDA device." << endl;

	if (cuInit(0) != CUDA_SUCCESS)
	{
		cout << "CUDA initialization failed." << endl;
		return pauseAndFail();
	}

	if (cuDeviceGet(&device, 0) != CUDA_SUCCESS)
	{
		cout << "Unable to get CUDA device." << endl;
		return pauseAndFail();
	}

	// The name is informational only, so a failed query is not fatal; it just
	// must not lead to printing an uninitialised buffer.
	char device_name[1024];
	const bool have_name =
		(cuDeviceGetName(device_name, sizeof(device_name), device) == CUDA_SUCCESS);
	cout << "CUDA device name: " << (have_name ? device_name : "<unknown>") << endl;

	if (cuCtxCreate(&context, CU_CTX_SCHED_YIELD, device) != CUDA_SUCCESS)
	{
		cout << "Unable to create CUDA context." << endl;
		return pauseAndFail();
	}

	cout << "Initializing CUBLAS." << endl;
	status = cublasInit();
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! CUBLAS initialization error\n");
		return pauseAndFail();
	}

	A = new real[n2];
	B = new real[n2];
	C = new real[n2];

	// A and B are all ones; C is zeroed so that the upload below never reads
	// uninitialised host memory.
	for (int i = 0; i < n2; i++)
	{
		A[i] = B[i] = real(1);
		C[i] = real(0);
	}

	status = cublasAlloc(n2, sizeof(real), (void**)&d_A);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device memory allocation error (A)\n");
		return pauseAndFail();
	}

	status = cublasAlloc(n2, sizeof(real), (void**)&d_B);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device memory allocation error (B)\n");
		return pauseAndFail();
	}

	status = cublasAlloc(n2, sizeof(real), (void**)&d_C);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device memory allocation error (C)\n");
		return pauseAndFail();
	}

	status = cublasSetVector(n2, sizeof(real), A, 1, d_A, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (write A)\n");
		return pauseAndFail();
	}

	status = cublasSetVector(n2, sizeof(real), B, 1, d_B, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (write B)\n");
		return pauseAndFail();
	}

	status = cublasSetVector(n2, sizeof(real), C, 1, d_C, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (write C)\n");
		return pauseAndFail();
	}

	// Discard any pending CUBLAS error so the check after the GEMM loop only
	// reflects the GEMM calls themselves.
	cublasGetError();

	cout << "Starting test." << endl;
	const double start = (double)GetTickCount64();

	for (int i = 0; i < 100; i++)
	{
#ifdef TEST_DOUBLE
		cublasDgemm('n', 'n', N, N, N, 1.0, d_A, N, d_B, N, 0.0, d_C, N);
#else
		cublasSgemm('n', 'n', N, N, N, 1.0f, d_A, N, d_B, N, 0.0f, d_C, N);
#endif
	}

	status = cublasGetError();
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! kernel execution error.\n");
		return pauseAndFail();
	}

	/* Read the result back. This copy is deliberately inside the timed region:
	   NOTE(review): the GEMM calls are presumably queued asynchronously and this
	   read-back is what waits for them, so the reported average also includes
	   one device-to-host transfer of C -- confirm before quoting the numbers. */
	status = cublasGetVector(n2, sizeof(real), d_C, 1, C, 1);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! device access error (read C)\n");
		return pauseAndFail();
	}

	const double end = (double)GetTickCount64();
	const double duration = (end - start) / 100.0;

	cout << "Matrix multiplication duration: " << duration << " milliseconds." << endl;
	// With A and B all ones, alpha = 1 and beta = 0, every element of C should equal N.
	cout << "Resulting C[0] = " << C[0] << "." << endl;

	status = cublasFree(d_A);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! memory free error (A)\n");
		return pauseAndFail();
	}

	status = cublasFree(d_B);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! memory free error (B)\n");
		return pauseAndFail();
	}

	status = cublasFree(d_C);
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! memory free error (C)\n");
		return pauseAndFail();
	}

	// The host buffers come from new[], so they must be released with delete[].
	delete[] A;
	delete[] B;
	delete[] C;

	/* Shutdown */
	status = cublasShutdown();
	if (status != CUBLAS_STATUS_SUCCESS) {
		fprintf (stderr, "!!!! shutdown error\n");
		return pauseAndFail();
	}

	cuCtxDestroy(context);

	// The "Press ENTER" prompt is only shown when the program is started
	// without arguments; "-noprompt" / "-qatest" exit immediately, and any
	// other argument also falls through to the final return without prompting.
	if (argc > 1) {
		if (!strcmp(argv[1], "-noprompt") ||
			!strcmp(argv[1], "-qatest") )
		{
			return EXIT_SUCCESS;
		}
	}
	else
	{
		printf("\nPress ENTER to exit...\n");
		getchar();
	}

	return EXIT_SUCCESS;
}

What does cuMemGetInfo report for the amount of memory on your machine? CUBLAS may allocate additional memory behind the scenes, for example.

Thanks for pointing that out tmurray. After adding a few calls to cuMemGetInfo in my code, to see how much memory is used at each step, I found that I have 104.4MB free before calling cublasInit, but only 41.4MB free after cublasInit. That explains why I had problems.

After switching off all of Vista’s display themes, I found I had 222MB free before calling cublasInit and 160MB after calling cublasInit.

I do still have some questions though. Does CUBLAS always allocate about 60MB? What does CUBLAS do with this memory?

Is the previous number before or after creating a context?

My “previous” values are taken after creating a context but before calling cublasInit. So that’s 222MB free after creating a context, 160MB free after calling cublasInit (the very next operation).