Just wondering what kind of memory limits you guys are running into when using CUBLAS. We’re using cublasSgemm on a Tesla C1060 (4GB GDDR3). We are trying to allocate a ~1.97GB matrix, but we are getting a CUBLAS error on the cublasAlloc() calls. In the code, the variables to look at are cubeArray and xptr :unsure:
[codebox]void cublasTestData(string headerFile, string dataFile, double *runStats)
{
    // Reads a data cube from disk, uploads it to the GPU and runs three
    // cublasSgemm calls that build a numBands x numBands covariance-style
    // matrix:
    //     sig = X * ones
    //     Y   = (1/N) * X * X^T  -  (1/N^2) * sig * sig^T      (N = numPixels)
    // The result is copied back into a host buffer and then discarded; only
    // the timing/size statistics are reported to the caller.
    //
    // Parameters:
    //   headerFile, dataFile - forwarded unchanged to readData().
    //   runStats             - output array, at least 3 entries:
    //       [0] numBands*numRows*numCols / 1,000,000 (integer division),
    //           multiplied by 2 or 4 when dataType is 2 or 4
    //       [1] seconds spent inside readData()
    //       [2] seconds from function entry until the GPU work finished
    //           (or was abandoned after an error)
    //
    // Any failure is reported on stdout and stops all further GPU work, so
    // that invalid device pointers are never handed to later CUBLAS calls.
    // runStats is still filled in on the error path.

    //Read data cube from disk
    std::clock_t start = std::clock();
    float *cubeArray = readData(headerFile, dataFile); //~2GB
    runStats[1] = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;

    //Initialize variables for Covariance
    const float scalar = 1.0f / (float)numPixels;

    //Solution Matrix (numBands x numBands), zero-initialised
    float *secondTerm = (float*)calloc((size_t)numBands * numBands, sizeof(float));
    //Unit Vector (numPixels x 1)
    float *unitVector = (float*)malloc(sizeof(float) * numPixels);
    //Signature Sums (numBands x 1), zero-initialised
    float *signatureSums = (float*)calloc(numBands, sizeof(float));

    //Device buffers; kept NULL until allocated so cleanup knows what to free
    float *xptr = NULL;   //Cube Array Device Memory
    float *yptr = NULL;   //Solution Matrix Device Memory
    float *zptr = NULL;   //Unit vector Device Memory
    float *sigptr = NULL; //Signature Sums Device Memory

    //CUBLAS State (error handling)
    cublasStatus state;

    // Single-pass block: 'break' jumps to the common cleanup code below.
    do {
        if(cubeArray == NULL || secondTerm == NULL || unitVector == NULL || signatureSums == NULL) {
            printf("Error allocating host memory.\n");
            break;
        }

        for(int i = 0; i < numPixels; i++) {
            unitVector[i] = 1.0f;
        }

        // Anything other than SUCCESS means CUBLAS is unusable.
        if(cublasInit() != CUBLAS_STATUS_SUCCESS) {
            printf("CUBLAS init error.\n");
            break;
        }

        //Allocate device memory for data cube
        // NOTE(review): the element count is computed in the type of
        // numBands/numPixels and the legacy cublasAlloc takes int arguments;
        // a request close to 2 GB may exceed what a single allocation can
        // provide - confirm against the CUBLAS/driver version in use.
        state = cublasAlloc(numBands*numPixels, sizeof(*cubeArray), (void**)&xptr);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation video memory.\n");
            break;
        }
        //Allocate device memory for solution
        state = cublasAlloc(numBands*numBands, sizeof(*secondTerm), (void**)&yptr);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation video memory.\n");
            break;
        }
        //Allocate device memory for unit vector
        state = cublasAlloc(numPixels, sizeof(*unitVector), (void**)&zptr);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation video memory.\n");
            break;
        }
        //Allocate device memory for signature sums
        state = cublasAlloc(numBands, sizeof(*signatureSums), (void**)&sigptr);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation video memory.\n");
            break;
        }

        //Copy data cube from Host to Device
        // Rows and both leading dimensions are numPixels, so this is one
        // dense copy of numPixels*numBands floats; the gemm calls below read
        // the same buffer with leading dimension numBands.
        state = cublasSetMatrix(numPixels, numBands, sizeof(*cubeArray), cubeArray, numPixels, xptr, numPixels);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation matrix.\n");
            break;
        }
        //Copy solution matrix from Host to Device
        state = cublasSetMatrix(numBands, numBands, sizeof(*secondTerm), secondTerm, numBands, yptr, numBands);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation matrix.\n");
            break;
        }
        //Copy unit vector from Host to Device
        state = cublasSetMatrix(numPixels, 1, sizeof(*unitVector), unitVector, numPixels, zptr, numPixels);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation matrix.\n");
            break;
        }
        //Copy signature vector from Host to Device
        state = cublasSetMatrix(numBands, 1, sizeof(*signatureSums), signatureSums, numBands, sigptr, numBands);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error allocation matrix.\n");
            break;
        }

        // cublasSgemm returns void in the legacy API; its status has to be
        // fetched with cublasGetError() after each call.

        // sig = X * ones + sig
        cublasSgemm('n', 'n', numBands, 1, numPixels, 1.0f, xptr, numBands, zptr, numPixels, 1.0f, sigptr, numBands);
        state = cublasGetError();
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("CUBLAS execution error.\n");
            break;
        }
        // Y = scalar^2 * sig * sig^T + Y
        cublasSgemm('n', 't', numBands, numBands, 1, scalar*scalar, sigptr, numBands, sigptr, numBands, 1.0f, yptr, numBands);
        state = cublasGetError();
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("CUBLAS execution error.\n");
            break;
        }
        // Y = scalar * X * X^T - Y
        cublasSgemm('n', 't', numBands, numBands, numPixels, scalar, xptr, numBands, xptr, numBands, -1.0f, yptr, numBands);
        state = cublasGetError();
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("CUBLAS execution error.\n");
            break;
        }

        //Copy solution matrix from Device to Host
        state = cublasGetMatrix(numBands, numBands, sizeof(*yptr), yptr, numBands, secondTerm, numBands);
        if(state != CUBLAS_STATUS_SUCCESS) {
            printf("Error reading result matrix.\n");
            break;
        }
    } while(0);

    // free(NULL) is a no-op, so this is safe on every path.
    free(signatureSums);
    free(unitVector);
    free(secondTerm);
    // NOTE(review): cubeArray is not released here because the allocator used
    // by readData() is not visible; confirm who owns that buffer.

    // 64-bit product so a large cube cannot overflow int; the integer
    // division by 1,000,000 is kept as in the original.
    runStats[0] = (double)((long long)numBands * numRows * numCols / 1000000);
    if(dataType == 2){
        runStats[0] *= 2.0;
    }else if(dataType == 4){
        runStats[0] *= 4.0;
    }
    runStats[2] = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;

    if(xptr != NULL) cublasFree(xptr);
    if(yptr != NULL) cublasFree(yptr);
    if(zptr != NULL) cublasFree(zptr);
    if(sigptr != NULL) cublasFree(sigptr);
    // NOTE(review): cublasShutdown() is never called; left out in case the
    // caller keeps using CUBLAS after this function returns - confirm.
}
[/codebox]