I suspect that I am making an incorrect assumption somewhere, but I cannot get the cublasAlloc command to work in a simple C code. The manual says that CUBLAS is self-contained at the API level, so I presume that I don’t have to use nvcc to compile. Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include "cublas.h"
int main () {
int order = 256;
int i,j;
float* cpuA;
float* cpuB;
float* cpuC;
const float* gpuA;
const float* gpuB;
float* gpuC;
cublasStatus retStatus;
printf("order is %d\n",order);
// set up matrix values
cpuA = (float*) malloc (order*order*sizeof(float));
for (i=1; i<=order; i++) {
for (j=1; j<=order; j++) {
cpuA[(i-1)*order+j-1] = (float)i*order + j;
}
}
// ...same for cpuB and cpuC ...
retStatus = cublasInit ();
// test for error
retStatus = cublasGetError ();
if (retStatus != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS: an error occured in cublasInit\n");
} else {
printf("CUBLAS: cublasInit worked\n");
}
// set up gpu matrix --- DIES HERE ---
cublasAlloc (order*order, sizeof(float), (void**)gpuA);
// test for error
retStatus = cublasGetError ();
if (retStatus != CUBLAS_STATUS_SUCCESS) {
printf("CUBLAS: an error occured in cublasAlloc\n");
} else {
printf("CUBLAS: cublasAlloc worked\n");
}
retStatus = cublasSetMatrix (order, order, sizeof(float),
cpuA, order, (void*)gpuA, order);
// ...same for gpuB, gpuC...
// call BLAS routine
(void) cublasSgemm ('N','T',order,order,order,0.5,
gpuA,order,gpuB,order,0.5,gpuC,order);
retStatus = cublasGetMatrix (order, order, sizeof(float),
gpuC, order, cpuC, order);
return(0);
}
It compiles with
gcc -pg -ggdb -o aloneCUBLAS aloneCUBLAS.c -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcublas
It segfaults at the cublasAlloc command when I run it, and gdb doesn’t seem to be helping:
(gdb) run
Starting program: /home/mstock/testBLAS/aloneCUBLAS
[Thread debugging using libthread_db enabled]
[New Thread -1208863040 (LWP 13234)]
order is 256
CUBLAS: cublasInit worked
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread -1208863040 (LWP 13234)]
0x007ec3f2 in cublasAlloc () from /usr/local/cuda/lib/libcublas.so
(gdb) bt
#0 0x007ec3f2 in cublasAlloc () from /usr/local/cuda/lib/libcublas.so
#1 0x08048923 in main () at aloneCUBLAS.c:53
Does anybody have an idea what I doing wrong?
Other CUDA codes run, but I have not tried other CUBLAS calls. This is on a RHEL4u3 system with an 8800GTX.