Hello,
I am trying to achieve stream concurrency when running cusolverDnDsyevj.
However, profiling the code below with nvvp (NVIDIA Visual Profiler) shows that the work on all the streams runs sequentially.
Stream concurrency works fine with other functions such as cusolverDnDgetrf or cublasDgemm, but not with cusolverDnDsyevj or cusolverDnDsyevd.
I can’t find out what is wrong.
Do you have any ideas?
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#define NBSTREAM 1
#define M 500
/* Abort with a readable message when a CUDA runtime call fails. */
static void checkCuda(cudaError_t err, const char *what, int line)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error at line %d (%s): %s\n",
                line, what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/* Abort with a readable message when a cuSOLVER call fails. */
static void checkCusolver(cusolverStatus_t st, const char *what, int line)
{
    if (st != CUSOLVER_STATUS_SUCCESS) {
        fprintf(stderr, "cuSOLVER error at line %d (%s): status %d\n",
                line, what, (int)st);
        exit(EXIT_FAILURE);
    }
}

#define CUDA_CHECK(call)     checkCuda((call), #call, __LINE__)
#define CUSOLVER_CHECK(call) checkCusolver((call), #call, __LINE__)

/*
 * Benchmark: run cusolverDnDsyevj on NBSTREAM independent copies of the same
 * M x M matrix, one copy per (stream, cuSOLVER handle) pair, and print the
 * wall-clock time between issuing the solves and the device becoming idle.
 *
 * Every stream owns its own device buffers, workspace, syevj parameter
 * object and pinned host result buffers, so no state is shared between the
 * solves that are meant to overlap.
 *
 * NOTE(review): if the streams still execute back-to-back in the profiler,
 * the solver itself may be synchronizing with the host between its internal
 * steps; a single host thread could then not overlap several calls no matter
 * how the streams are set up. Confirm against the cuSOLVER documentation
 * (cusolverDnDsyevjBatched exists for solving many problems in one call).
 *
 * Returns EXIT_SUCCESS when every solve reports info == 0, EXIT_FAILURE
 * otherwise. Any CUDA / cuSOLVER API error aborts the program.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    struct timeval tv;
    int i, j;
    int failures = 0;
    const int m = M;
    const size_t mat_bytes = (size_t)m * (size_t)m * sizeof(double);
    const size_t vec_bytes = (size_t)m * sizeof(double);
    double start, end, timeg;

    double *cg = NULL;            /* pinned host input matrix            */
    double *rcvcg[NBSTREAM];      /* pinned host eigenvectors, per stream */
    double *sol[NBSTREAM];        /* pinned host eigenvalues, per stream  */
    int h_info[NBSTREAM];         /* host copy of the solver info codes   */

    cusolverDnHandle_t cusolverH[NBSTREAM];
    cudaStream_t stream[NBSTREAM];
    syevjInfo_t syevj_params[NBSTREAM];
    int l_work[NBSTREAM];
    double *d_a[NBSTREAM];
    double *d_sol[NBSTREAM];
    double *d_work[NBSTREAM];
    int *d_info[NBSTREAM];

    CUDA_CHECK(cudaSetDevice(0));

    /* Random input; with CUBLAS_FILL_MODE_LOWER the solver is told that the
     * lower triangle holds the symmetric matrix. */
    CUDA_CHECK(cudaMallocHost((void **)&cg, mat_bytes));
    for (i = 0; i < m; i++)
        for (j = 0; j < m; j++)
            cg[i * m + j] = (((double) rand() / (double) RAND_MAX));

    /* Per-stream resources. */
    for (i = 0; i < NBSTREAM; i++) {
        CUDA_CHECK(cudaMallocHost((void **)&sol[i], vec_bytes));
        CUDA_CHECK(cudaMallocHost((void **)&rcvcg[i], mat_bytes));
        CUDA_CHECK(cudaMalloc((void **)&d_a[i], mat_bytes));
        CUDA_CHECK(cudaMalloc((void **)&d_sol[i], vec_bytes));
        CUDA_CHECK(cudaMalloc((void **)&d_info[i], sizeof(int)));

        CUDA_CHECK(cudaStreamCreateWithFlags(&stream[i], cudaStreamNonBlocking));
        CUSOLVER_CHECK(cusolverDnCreate(&cusolverH[i]));
        CUSOLVER_CHECK(cusolverDnSetStream(cusolverH[i], stream[i]));

        /* One parameter object per solve: the solver also reports sweeps and
         * residual through it, so it must not be shared between solves that
         * may run at the same time. */
        syevj_params[i] = NULL;
        CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params[i]));
    }

    /* Query and allocate the workspaces before the timed region: cudaMalloc
     * is a synchronous call and would otherwise be part of the measurement. */
    for (i = 0; i < NBSTREAM; i++) {
        CUSOLVER_CHECK(cusolverDnDsyevj_bufferSize(cusolverH[i],
                                                   CUSOLVER_EIG_MODE_VECTOR,
                                                   CUBLAS_FILL_MODE_LOWER,
                                                   m,
                                                   d_a[i],
                                                   m,
                                                   d_sol[i],
                                                   &l_work[i],
                                                   syevj_params[i]));
        CUDA_CHECK(cudaMalloc((void **)&d_work[i], sizeof(double) * l_work[i]));
    }

    /* Upload the input matrix once per stream. */
    for (i = 0; i < NBSTREAM; i++) {
        CUDA_CHECK(cudaMemcpyAsync(d_a[i], cg, mat_bytes,
                                   cudaMemcpyHostToDevice, stream[i]));
    }
    CUDA_CHECK(cudaDeviceSynchronize());

    /* ---- timed region: issue all solves, then wait for the device ---- */
    gettimeofday(&tv, NULL);
    start = (double) tv.tv_sec + (double) tv.tv_usec * 1.e-6;

    for (i = 0; i < NBSTREAM; i++) {
        CUSOLVER_CHECK(cusolverDnDsyevj(cusolverH[i],
                                        CUSOLVER_EIG_MODE_VECTOR,
                                        CUBLAS_FILL_MODE_LOWER,
                                        m,
                                        d_a[i],
                                        m,
                                        d_sol[i],
                                        d_work[i],
                                        l_work[i],
                                        d_info[i],
                                        syevj_params[i]));
    }
    CUDA_CHECK(cudaDeviceSynchronize());

    gettimeofday(&tv, NULL);
    end = (double) tv.tv_sec + (double) tv.tv_usec * 1.e-6;
    timeg = end - start;
    /* ---- end of timed region ---- */

    /* Fetch results; each stream writes to its own host buffers. */
    for (i = 0; i < NBSTREAM; i++) {
        CUDA_CHECK(cudaMemcpyAsync(sol[i], d_sol[i], vec_bytes,
                                   cudaMemcpyDeviceToHost, stream[i]));
        CUDA_CHECK(cudaMemcpyAsync(rcvcg[i], d_a[i], mat_bytes,
                                   cudaMemcpyDeviceToHost, stream[i]));
    }
    CUDA_CHECK(cudaDeviceSynchronize());

    /* The solver reports its outcome through the device-side info word;
     * anything other than 0 means the result should not be trusted. */
    for (i = 0; i < NBSTREAM; i++) {
        CUDA_CHECK(cudaMemcpy(&h_info[i], d_info[i], sizeof(int),
                              cudaMemcpyDeviceToHost));
        if (h_info[i] != 0) {
            fprintf(stderr, "syevj on stream %d returned info = %d\n",
                    i, h_info[i]);
            failures++;
        }
    }

    printf("Time to do syevj on %d streams: %f\n", NBSTREAM, timeg);

    /* Release everything that was created above. */
    for (i = 0; i < NBSTREAM; i++) {
        CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params[i]));
        CUSOLVER_CHECK(cusolverDnDestroy(cusolverH[i]));
        CUDA_CHECK(cudaStreamDestroy(stream[i]));
        CUDA_CHECK(cudaFree(d_work[i]));
        CUDA_CHECK(cudaFree(d_info[i]));
        CUDA_CHECK(cudaFree(d_sol[i]));
        CUDA_CHECK(cudaFree(d_a[i]));
        CUDA_CHECK(cudaFreeHost(rcvcg[i]));
        CUDA_CHECK(cudaFreeHost(sol[i]));
    }
    CUDA_CHECK(cudaFreeHost(cg));

    return failures ? EXIT_FAILURE : EXIT_SUCCESS;
}