Hi all,
I experienced a slow down of CUBLAS gemm in my original code.
When I started to investigate the cause, I found that the slowdown happens after a CUFFT call to cufftExecR2C (or D2Z). However, I did not see this effect after calling a complex-to-complex or a complex-to-real transform. Calling R2C and then a subsequent C2C removes the slowdown in CUBLAS.
It also seems that other CUBLAS routines are affected after the R2C call.
Does anyone have some idea about why this happens?
I have experienced this in both CUDA 4.2 and 5.0.
I attached some example code below:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cufft.h>
/* Return the current wall-clock time in milliseconds, as reported by
 * gettimeofday(). The value is the seconds field scaled to milliseconds
 * plus the microseconds field scaled to milliseconds. */
double getTime()
{
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_ms = (double) now.tv_sec * 1000;
    const double frac_ms  = (double) now.tv_usec / 1000.0;
    return whole_ms + frac_ms;
}
void
runTest( int argc, char** argv)
{
int device = 0;
int N = 12800;
int M = 4096;
if(argc >= 2)
{
device = atoi(argv[1]);
}
cudaSetDevice(device);
cudaError_t cudastatus;
float* d_A = NULL;
cudastatus = cudaMalloc( (void**)&d_A, sizeof(float) * N*M);
float* d_C = NULL;
cudastatus = cudaMalloc( (void**)&d_C, sizeof(float) * N * N);
double begin_time;
double end_time;
cublasHandle_t handle;
cublasCreate(&handle);
begin_time = getTime();
float alpha = 1.0;
float beta = 1.0;
cublasSgemm(handle, CUBLAS_OP_N,CUBLAS_OP_T,N,N,M,&(alpha),d_A, N, d_A, N, &(beta), d_C, N);
cudaDeviceSynchronize();
end_time = getTime();
printf("cublas original time: %f\n", end_time - begin_time);
cufftHandle plan;
cufftComplex *data ;
cudaMalloc ( ( void **)&data , sizeof ( cufftComplex ) *( 100) );
cufftComplex* idata;
cudaMalloc( (void**)&idata, sizeof(cufftComplex)*100);
cufftPlan1d(&plan, 100, CUFFT_C2C, 1);
cufftExecC2C(plan, idata, data, CUFFT_FORWARD);
cufftDestroy(plan);
begin_time = getTime();
cublasSgemm(handle, CUBLAS_OP_N,CUBLAS_OP_T,N,N,M,&(alpha),d_A, N, d_A, N, &(beta), d_C, N);
cudaDeviceSynchronize();
end_time = getTime();
printf("cublas after calling C2C: %f\n", end_time - begin_time);
cufftPlan1d(&plan, 100, CUFFT_R2C, 1);
cufftExecR2C(plan, (cufftReal*)idata, data);
cufftDestroy(plan);
cudaFree(idata);
cudaFree(data);
begin_time = getTime();
cublasSgemm(handle, CUBLAS_OP_N,CUBLAS_OP_T,N,N,M,&(alpha),d_A, N, d_A, N, &(beta), d_C, N);
cudaDeviceSynchronize();
end_time = getTime();
printf("cublas after calling R2C: %f\n", end_time - begin_time);
cudaFree(d_A);
cudaFree(d_C);
cublasDestroy(handle);
return;
}
/* Program entry point: forwards the command-line arguments to runTest()
 * and exits with status 0, the conventional "success" value, once it
 * returns. */
int
main( int argc, char** argv)
{
    runTest( argc, argv);
    return 0;
}
Here is what I got, time in milliseconds:
cublas original time: 1888.888916
cublas after calling C2C: 1889.112061
cublas after calling R2C: 4290.118896
Thanks.