Hi everyone,
I’ve just started working with CUBLAS, so there might be something simple that I am missing.
The app I have written allocates 3 matrices (A, B and C) and uses them with the cublasSgemm function to multiply A and B and store the result in C. My matrix dimensions are 2048*2048.
cublasAlloc succeeds in allocating A and B, but fails with error code 3 (CUBLAS_STATUS_ALLOC_FAILED) when trying to allocate C, indicating that there’s insufficient memory. But that can’t be right, because I have 256MB on my GPU (GeForce 8600M GT) and these three single-precision matrices should only add up to 48MB (3 × 2048 × 2048 × 4 bytes). Furthermore, another application I wrote using the CUDA runtime functions (my own attempt at matrix multiplication, before I found CUBLAS) successfully allocates three 2048*2048 arrays.
Can someone please point out why cublasAlloc is failing?
My laptop’s specs:
Windows Vista Business 32 bit.
GeForce 8600M GT with 256MB memory.
CUDA toolkit and SDK version 2.3.
CUDA notebook driver version 195.62.
Below is the code of my CUBLAS test application.
Thanks in advance for any help.
// CUBLASTest_MatrixMatrixMultiply.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include "cublas.h"
#include "cuda.h"
#include "cutil.h"
using namespace std;
#define N 2048 //NxN element matrices.
//#define TEST_DOUBLE
// Prints `message` to stderr, waits for a key press so the console window
// stays open long enough to read it, and returns EXIT_FAILURE for the caller
// to propagate as the process exit code.
static int failWithMessage(const char* message)
{
    fprintf(stderr, "%s", message);
    getchar();
    return EXIT_FAILURE;
}

// Entry point of the CUBLAS timing test.
//
// Creates a CUDA context on device 0, initializes CUBLAS, allocates three
// N x N matrices on the host and on the device, fills A and B with ones,
// runs the GEMM C = A * B 100 times and prints the average time per call
// together with C[0].
//
// Returns EXIT_SUCCESS when every step succeeded and EXIT_FAILURE otherwise.
// Failure paths return immediately without releasing what was acquired so
// far; the process is about to exit at that point.
//
// Command line: when at least one argument is given, the final
// "Press ENTER" prompt is skipped.
int _tmain(int argc, char** argv)
{
    // Single switch between the single- and double-precision variants.
#ifdef TEST_DOUBLE
    typedef double real_t;
#else
    typedef float real_t;
#endif

    const int n2 = N * N;          // number of elements per matrix
    const int iterations = 100;    // number of timed GEMM calls

    real_t* d_A = 0;
    real_t* d_B = 0;
    real_t* d_C = 0;
    CUdevice device;
    CUcontext context;
    cublasStatus status;

    cout << "CUBLAS large matrix multiplication timing test." << endl;
    cout << "Binding to first available CUDA device." << endl;

    if (cuInit(0) != CUDA_SUCCESS)
        return failWithMessage("CUDA initialization failed.\n");

    if (cuDeviceGet(&device, 0) != CUDA_SUCCESS)
        return failWithMessage("Unable to get CUDA device.\n");

    // Zero-initialized so that printing stays well defined even if the
    // (unchecked, purely informational) name query fails.
    char device_name[1024] = "";
    cuDeviceGetName(device_name, 1024, device);
    cout << "CUDA device name: " << device_name << endl;

    // NOTE(review): this driver-API context is created in addition to
    // whatever context cublasInit() sets up for itself. On a 256MB board
    // that may cost device memory that the later cublasAlloc calls need --
    // confirm whether the explicit context is required at all.
    if (cuCtxCreate(&context, CU_CTX_SCHED_YIELD, device) != CUDA_SUCCESS)
        return failWithMessage("Unable to create CUDA context.\n");

    cout << "Initializing CUBLAS." << endl;
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! CUBLAS initialization error\n");

    // Host matrices. C is value-initialized (all zeros) because it is
    // uploaded to the device below and must not contain indeterminate data.
    real_t* A = new real_t[n2];
    real_t* B = new real_t[n2];
    real_t* C = new real_t[n2]();
    for (int i = 0; i < n2; i++)
    {
        A[i] = B[i] = 1.0;
    }

    // Device matrices.
    status = cublasAlloc(n2, sizeof(A[0]), (void**)&d_A);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device memory allocation error (A)\n");

    status = cublasAlloc(n2, sizeof(B[0]), (void**)&d_B);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device memory allocation error (B)\n");

    status = cublasAlloc(n2, sizeof(C[0]), (void**)&d_C);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device memory allocation error (C)\n");

    // Upload the host data.
    status = cublasSetVector(n2, sizeof(A[0]), A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device access error (write A)\n");

    status = cublasSetVector(n2, sizeof(B[0]), B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device access error (write B)\n");

    status = cublasSetVector(n2, sizeof(C[0]), C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device access error (write C)\n");

    // Discard any previously recorded CUBLAS error so that the check after
    // the loop only reflects the GEMM calls.
    cublasGetError();

    cout << "Starting test." << endl;
    double start = GetTickCount64();
    for (int i = 0; i < iterations; i++)
    {
        // beta is zero, so each call overwrites C with A * B.
#ifdef TEST_DOUBLE
        cublasDgemm('n', 'n', N, N, N, 1.0, d_A, N, d_B, N, 0.0, d_C, N);
#else
        cublasSgemm('n', 'n', N, N, N, 1.0f, d_A, N, d_B, N, 0.0f, d_C, N);
#endif
    }
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! kernel execution error.\n");

    /* Read the result back. The end timestamp is taken after this copy, so
       the reported average also includes the device-to-host transfer. */
    status = cublasGetVector(n2, sizeof(C[0]), d_C, 1, C, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! device access error (read C)\n");

    double end = GetTickCount64();
    double duration = (end - start) / iterations;
    cout << "Matrix multiplication duration: " << duration << " milliseconds." << endl;
    cout << "Resulting C[0] = " << C[0] << "." << endl;

    // Release device memory.
    status = cublasFree(d_A);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! memory free error (A)\n");

    status = cublasFree(d_B);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! memory free error (B)\n");

    status = cublasFree(d_C);
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! memory free error (C)\n");

    // The host matrices were allocated with new[], so they must be released
    // with delete[].
    delete[] A;
    delete[] B;
    delete[] C;

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS)
        return failWithMessage("!!!! shutdown error (A)\n");

    cuCtxDestroy(context);

    if (argc > 1) {
        if (!strcmp(argv[1], "-noprompt") ||
            !strcmp(argv[1], "-qatest") )
        {
            return EXIT_SUCCESS;
        }
    }
    else
    {
        printf("\nPress ENTER to exit...\n");
        getchar();
    }
    return EXIT_SUCCESS;
}