Hello,
we are having some peculiar problems with CUBLAS’ CGEMM (cuda 3.1 on a Tesla C1060).
The first thing to note is that for highly rectangular matrices (half a dozen x millions), the CUBLAS implementation of CGEMM is slower than, say, MKL’s CBLAS. For instance, if A and B are 3x30,000,000 single-precision complex matrices, then CUBLAS takes about 16 times longer than MKL to compute A^H.B. It gets worse if you’re doing 1x30,000,000 - for which MKL is about 150 times faster!
The second problem is that for certain matrix sizes, CUBLAS spits out an ‘execution failed’ error. For example, 3x33,333,333 gives this error, whereas 5x33,333,333 does not. Likewise, 30x3,333,333 fails, but 50x3,333,333 does not. Below I’ve attached my code for testing this - I’d appreciate it if someone could see if they can reproduce it*. Strangely, this problem does not occur for real matrices (using SGEMM).
Thanks!
* You will probably require at least 1.5 GB of RAM free on both the host and your device. To compile, the following should do it:
nvcc main.cu -lcublas
Then when you run it, you will be prompted for M and then N. Please try M=3 and N=33333333 first and let me know what output you get…
#include <stdio.h>
#include <stdlib.h>
#include <cublas.h>
#include <cuda.h>
#define RE 1.0
#define IM 2.0
void checkCudaError();
void checkCublasError();
// Test harness: computes Y = X^H * X with CUBLAS CGEMM, where X is an
// N x M single-precision complex matrix (column-major, leading dim N).
// Prompts for M and N on stdin; prints "Passed!" on success, otherwise
// an error message is emitted and the process exits non-zero.
int main(void)
{
cuComplex *hX, *hY;
cuComplex *dX, *dY;
cuComplex one, zero;
size_t i, nElemX, nElemY;
int M, N;

one.x = 1.0f;  one.y = 0.0f;
zero.x = 0.0f; zero.y = 0.0f;

// Get problem size (reject unparsable or non-positive input)
printf("M = ");
if (scanf("%i", &M) != 1 || M <= 0) {
    printf("Invalid value for M\n");
    return -1;
}
printf("N = ");
if (scanf("%i", &N) != 1 || N <= 0) {
    printf("Invalid value for N\n");
    return -1;
}

// Compute element counts in size_t: the int product M*N overflows
// 32 bits for sizes like 100 x 33,333,333.
nElemX = (size_t)M * (size_t)N;
nElemY = (size_t)M * (size_t)M;

// The legacy CUBLAS API must be initialised before any BLAS call.
if (cublasInit() != CUBLAS_STATUS_SUCCESS) {
    printf("CUBLAS ERROR: initialisation failed.\n");
    return -2;
}

// Allocate device memory
cudaMalloc((void**)&dX, nElemX * sizeof(cuComplex));
cudaMalloc((void**)&dY, nElemY * sizeof(cuComplex));
checkCudaError();

// Allocate host memory (check for NULL: ~1.5 GB requests can fail)
hX = (cuComplex*)malloc(nElemX * sizeof(cuComplex));
hY = (cuComplex*)malloc(nElemY * sizeof(cuComplex));
if (hX == NULL || hY == NULL) {
    printf("Host allocation failed.\n");
    return -1;
}

// Fill host memory with the constant test value RE + IM*i
for (i = 0; i < nElemX; i++) {
    hX[i].x = RE;
    hX[i].y = IM;
}

// Copy host matrix to device
cudaMemcpy(dX, hX, nElemX * sizeof(cuComplex), cudaMemcpyHostToDevice);
checkCudaError();

// Y = op(A) * B with A = B = X: 'C' conjugate-transposes A to M x N,
// B stays N x M, so Y is M x M with leading dimension M.
cublasCgemm('C', 'N', M, M, N, one, dX, N, dX, N, zero, dY, M);
checkCublasError();

// Copy the result back so the GEMM output is actually materialised
// on the host (hY was previously allocated but never used).
cudaMemcpy(hY, dY, nElemY * sizeof(cuComplex), cudaMemcpyDeviceToHost);
checkCudaError();

// Cleanup
free(hX);
free(hY);
cudaFree(dX);
cudaFree(dY);
checkCudaError();
cublasShutdown();

printf("Passed!\n");
return 0;
}
// Checks for a pending CUDA runtime error; on failure prints a
// human-readable message and terminates the process with exit(-1).
// Note: cudaGetLastError() also clears the sticky error state.
void checkCudaError()
{
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
    // Diagnostics belong on stderr; stdout carries prompts/results.
    fprintf(stderr, "CUDA ERROR: %s\n", cudaGetErrorString(error));
    exit(-1);
}
}
// Checks the sticky CUBLAS (legacy API) error state; on failure prints
// a message describing the status and terminates with exit(-2).
// Returns normally only when the status is CUBLAS_STATUS_SUCCESS.
void checkCublasError()
{
cublasStatus cs = cublasGetError();
if (cs == CUBLAS_STATUS_SUCCESS) return;
switch (cs)
{
case CUBLAS_STATUS_NOT_INITIALIZED:
    fprintf(stderr, "CUBLAS ERROR: Not initialised.\n");
    break;
case CUBLAS_STATUS_ALLOC_FAILED:
    fprintf(stderr, "CUBLAS ERROR: Alloc failed.\n");
    break;
case CUBLAS_STATUS_INVALID_VALUE:
    fprintf(stderr, "CUBLAS ERROR: Invalid value.\n");
    break;
case CUBLAS_STATUS_ARCH_MISMATCH:
    fprintf(stderr, "CUBLAS ERROR: Arch mismatch.\n");
    break;
case CUBLAS_STATUS_MAPPING_ERROR:
    fprintf(stderr, "CUBLAS ERROR: Mapping error.\n");
    break;
case CUBLAS_STATUS_EXECUTION_FAILED:
    fprintf(stderr, "CUBLAS ERROR: Execution failed.\n");
    break;
case CUBLAS_STATUS_INTERNAL_ERROR:
    fprintf(stderr, "CUBLAS ERROR: Internal error.\n");
    break;
default:
    // Previously an unrecognised status exited with no message at all.
    fprintf(stderr, "CUBLAS ERROR: Unknown status code %d.\n", (int)cs);
    break;
}
exit(-2);
}