Hello:
I’m trying to use cublasDgemm() with large matrices and I get a CUBLAS_STATUS_MAPPING_ERROR with strange behavior: at first cublasDgemm() and cublasGetMatrix() apparently run fine, but after several calls I get the error when I try to copy the matrix back to main memory.
The hardware and drivers:
GPU: GeForce GTX 550 Ti
GPU Memory: 1535.2 MB
Driver: NVIDIA-Linux-x86_64-310.32.run (downloaded from www.nvidia.com)
CUDA version: 5.0 (also with 4.2)
Operating system: Debian GNU/Linux 64 bits (gcc 4.7.2)
I’m writing some code to measure the performance of cublasDgemm() on my GPU, so I execute the function several times for several matrix dimensions (all matrices are square). The dimensions go from 500 to 7500 in steps of 1000.
As cublasDgemm() computes $C = \alpha A B + \beta C$ and therefore uses 3 matrices, for dimension 7500 the total amount of memory used in double precision is $3 \cdot 7500 \cdot 7500 \cdot 8 / 1024 / 1024 \approx 1287.5$ MB, which is less than the total memory of the GPU.
The computations run OK for dimensions 500 … 6500. The problem appears when the dimension is 7500. But it is a bit strange: in my case, with dimension 7500 the computations succeed the first three times, but at the fourth repetition, when I try to get the matrix C from the GPU, I obtain the error CUBLAS_STATUS_MAPPING_ERROR. Then the computer freezes completely and the only solution is to reboot the machine.
I think I correctly free all the memory used, so I don’t know where the problem could be.
Below is the code pasted. Can anyone with a similar card try to execute it and post the results?
Thanks
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>
//upper limit of the square matrix dimension (M=N=K); the host buffers in
//main() are allocated with DLIM*DLIM elements each
#define DLIM 7500
//number of times the computation is repeated for each dimension
#define R 10
//function that calls cublasDgemm on N x N matrices held in host memory
void gpudgemm(int N,double* A,double* B,double* C);
//function that prints the name of a CUDA runtime error code to stderr
void cudaerrorcode(cudaError_t status);
//function that prints the name of a CUBLAS status code to stderr
void cublaserrorcode(cublasStatus_t status);
/*
 * Benchmark driver: allocates three DLIM x DLIM host matrices filled with
 * ones, then calls gpudgemm() R times for every dimension 500, 1500, ...
 * up to DLIM, reporting progress on stderr.
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation fails.
 */
int main()
{
    //element count of each DLIM x DLIM host matrix
    const int total = DLIM*DLIM;
    int dim;
    int rep;
    int k = 0;
    //host buffers sized for the largest dimension; smaller runs reuse them
    double* A = (double*)malloc(total*sizeof(double));
    double* B = (double*)malloc(total*sizeof(double));
    double* C = (double*)malloc(total*sizeof(double));

    if(!A || !B || !C)
    {
        fprintf(stderr,"Error in malloc()\n");
        exit(EXIT_FAILURE);
    }

    //fill every matrix with ones
    while(k < total)
    {
        A[k] = 1.0;
        B[k] = 1.0;
        C[k] = 1.0;
        ++k;
    }

    //sweep over the matrix dimensions, repeating each one R times
    for(dim = 500; dim <= DLIM; dim += 1000)
    {
        fprintf(stderr,"M=N=K= %d\n",dim);
        rep = 0;
        while(rep < R)
        {
            fprintf(stderr,"Computation %d ",rep);
            //flush so the progress text is visible before the GPU call
            fflush(stderr);
            gpudgemm(dim,A,B,C);
            fprintf(stderr,"...done\n");
            ++rep;
        }
        fprintf(stderr,"\n");
    }

    //release the host buffers
    free(A);
    free(B);
    free(C);

    return 0;
}
/*
 * Computes C = A*B + C on the GPU with cublasDgemm() (alpha = beta = 1.0,
 * no transposition) for N x N double-precision matrices.
 *
 * N : dimension of the square matrices (also used as leading dimension)
 * A : host pointer to the first operand  (at least N*N elements)
 * B : host pointer to the second operand (at least N*N elements)
 * C : host pointer to the accumulator; overwritten with the result
 *
 * The three matrices are copied to freshly allocated device memory, the
 * product is computed, C is copied back and every device resource is
 * released. Any CUDA or CUBLAS failure prints a diagnostic to stderr and
 * terminates the process with EXIT_FAILURE.
 */
void gpudgemm(int N,double* A,double* B,double* C)
{
    //variable declaration
    const double ab=1.0; //used both as alpha and as beta
    //byte size of one matrix; widen BEFORE multiplying so that N*N cannot
    //overflow the int range
    const size_t bytes=(size_t)N*(size_t)N*sizeof(double);
    double* cA=NULL;
    double* cB=NULL;
    double* cC=NULL;
    cublasHandle_t handle;
    cublasOperation_t TRANSA=CUBLAS_OP_N;
    cublasOperation_t TRANSB=CUBLAS_OP_N;
    cublasStatus_t status1=CUBLAS_STATUS_SUCCESS;
    cublasStatus_t status2=CUBLAS_STATUS_SUCCESS;
    cublasStatus_t status3=CUBLAS_STATUS_SUCCESS;
    cudaError_t Status1=cudaSuccess;
    cudaError_t Status2=cudaSuccess;
    cudaError_t Status3=cudaSuccess;
    //create the cublas context
    status1 = cublasCreate(&handle);
    if(status1!=CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr,"Error in cublasCreate()\n");
        fprintf(stderr,"CUBLAS error code: ");
        cublaserrorcode(status1);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //memory allocation on the GPU
    Status1 = cudaMalloc((void**)&cA,bytes);
    Status2 = cudaMalloc((void**)&cB,bytes);
    Status3 = cudaMalloc((void**)&cC,bytes);
    if((Status1!=cudaSuccess)||
       (Status2!=cudaSuccess)||
       (Status3!=cudaSuccess))
    {
        fprintf(stderr,"Error in cudaMalloc()\n");
        fprintf(stderr,"CUDA error code cA: ");
        cudaerrorcode(Status1);
        fprintf(stderr,"\n");
        fprintf(stderr,"CUDA error code cB: ");
        cudaerrorcode(Status2);
        fprintf(stderr,"\n");
        fprintf(stderr,"CUDA error code cC: ");
        cudaerrorcode(Status3);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //data copy host -> GPU
    status1 = cublasSetMatrix(N,N,(int)sizeof(*A),(const void*)A,N,(void*)cA,N);
    status2 = cublasSetMatrix(N,N,(int)sizeof(*B),(const void*)B,N,(void*)cB,N);
    status3 = cublasSetMatrix(N,N,(int)sizeof(*C),(const void*)C,N,(void*)cC,N);
    if((status1!=CUBLAS_STATUS_SUCCESS)||
       (status2!=CUBLAS_STATUS_SUCCESS)||
       (status3!=CUBLAS_STATUS_SUCCESS))
    {
        fprintf(stderr,"Error in cublasSetMatrix()\n");
        fprintf(stderr,"CUBLAS error code cA: ");
        cublaserrorcode(status1);
        fprintf(stderr,"\n");
        fprintf(stderr,"CUBLAS error code cB: ");
        cublaserrorcode(status2);
        fprintf(stderr,"\n");
        fprintf(stderr,"CUBLAS error code cC: ");
        cublaserrorcode(status3);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //dgemm computation
    status1 = cublasDgemm(handle,TRANSA,TRANSB,N,N,N,&ab,cA,N,cB,N,&ab,cC,N);
    if(status1!=CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr,"Error in cublasDgemm()\n");
        fprintf(stderr,"CUBLAS error code: ");
        cublaserrorcode(status1);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //cublasDgemm() only launches the work; wait for it here so that a fault
    //during execution is reported at this point and not misattributed to the
    //following cublasGetMatrix() call
    Status1 = cudaDeviceSynchronize();
    if(Status1!=cudaSuccess)
    {
        fprintf(stderr,"Error in cublasDgemm() execution\n");
        fprintf(stderr,"CUDA error code: ");
        cudaerrorcode(Status1);
        fprintf(stderr," (%s)\n",cudaGetErrorString(Status1));
        exit(EXIT_FAILURE);
    }
    //result recovery GPU -> host
    status1 = cublasGetMatrix(N,N,(int)sizeof(*C),(const void*)cC,N,(void*)C,N);
    if(status1!=CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr,"Error in cublasGetMatrix()\n");
        fprintf(stderr,"CUBLAS error code: ");
        cublaserrorcode(status1);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //memory free: the results go into the cudaError_t variables that the
    //check below inspects
    Status1 = cudaFree((void*)cA);
    Status2 = cudaFree((void*)cB);
    Status3 = cudaFree((void*)cC);
    if((Status1!=cudaSuccess)||
       (Status2!=cudaSuccess)||
       (Status3!=cudaSuccess))
    {
        fprintf(stderr,"Error in cudaFree()\n");
        fprintf(stderr,"CUDA error code cA: ");
        cudaerrorcode(Status1);
        fprintf(stderr,"\n");
        fprintf(stderr,"CUDA error code cB: ");
        cudaerrorcode(Status2);
        fprintf(stderr,"\n");
        fprintf(stderr,"CUDA error code cC: ");
        cudaerrorcode(Status3);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //destroy cublas instance
    status1 = cublasDestroy(handle);
    if(status1!=CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr,"Error in cublasDestroy()\n");
        fprintf(stderr,"CUBLAS error code: ");
        cublaserrorcode(status1);
        fprintf(stderr,"\n");
        exit(EXIT_FAILURE);
    }
    //end of function
    return;
}
/*
 * Writes a short textual name for a CUDA runtime status to stderr, without
 * a trailing newline. Only a handful of codes are recognised; every other
 * value is reported as an unknown error.
 */
void cudaerrorcode(cudaError_t status)
{
    const char* label;

    //map the status to the text to be printed
    switch(status)
    {
        case cudaSuccess:
            label = "CUDA SUCCESS";
            break;
        case cudaErrorMemoryAllocation:
            label = "'cudaErrorMemoryAllocation'";
            break;
        case cudaErrorInvalidDevicePointer:
            label = "'cudaErrorInvalidDevicePointer'";
            break;
        case cudaErrorInitializationError:
            label = "'cudaErrorInitializationError'";
            break;
        default:
            label = "UNKNOWN CUDA ERROR";
            break;
    }

    fputs(label,stderr);
}
/*
 * Writes a short textual name for a CUBLAS status to stderr, without a
 * trailing newline. Status values not listed here are reported as an
 * unknown error.
 */
void cublaserrorcode(cublasStatus_t status)
{
    const char* label;

    //map the status to the text to be printed
    switch(status)
    {
        case CUBLAS_STATUS_SUCCESS:
            label = "CUBLAS SUCCESS";
            break;
        case CUBLAS_STATUS_NOT_INITIALIZED:
            label = "'CUBLAS_STATUS_NOT_INITIALIZED'";
            break;
        case CUBLAS_STATUS_ALLOC_FAILED:
            label = "'CUBLAS_STATUS_ALLOC_FAILED'";
            break;
        case CUBLAS_STATUS_INVALID_VALUE:
            label = "'CUBLAS_STATUS_INVALID_VALUE'";
            break;
        case CUBLAS_STATUS_ARCH_MISMATCH:
            label = "'CUBLAS_STATUS_ARCH_MISMATCH'";
            break;
        case CUBLAS_STATUS_MAPPING_ERROR:
            label = "'CUBLAS_STATUS_MAPPING_ERROR'";
            break;
        case CUBLAS_STATUS_EXECUTION_FAILED:
            label = "'CUBLAS_STATUS_EXECUTION_FAILED'";
            break;
        case CUBLAS_STATUS_INTERNAL_ERROR:
            label = "'CUBLAS_STATUS_INTERNAL_ERROR'";
            break;
        default:
            label = "UNKNOWN CUBLAS ERROR";
            break;
    }

    fputs(label,stderr);
}