Hundreds of parallel matrix-vector multiplications with cuBLAS

To my understanding, a cuBLAS handle (cublasHandle_t) is associated with a CUDA stream (cudaStream_t). That said, if we attempt to launch hundreds of parallel matrix-vector multiplications using cuBLAS, constructing a handle for each, they will be serialized in groups of 16, i.e., only 16 cuBLAS kernels will actually execute concurrently. If this is true, is there a way (possibly using some other library) to perform lots of (actually) parallel matrix-vector multiplications?

Note: If the multiplications we need to perform in parallel are of the form Ax1, Ax2, …, AxN, then we can simply pack x1, x2, …, xN into a matrix X = [x1, x2, …, xN] and perform the single multiplication A * X. If we need to do A1x1, A2x2, …, ANxN, then the only solution I can think of is A * X with A = blockdiag(A1, A2, …, AN), but that would occupy a tremendous amount of memory!

I'm also quoting the following from the CUDA C Programming Guide:

I think there are the following ways to do lots of matrix-vector multiplications in parallel:

  1. Use cuBLAS with different streams (or in different CUDA contexts)
  2. Write a custom kernel - but achieving high efficiency is non-trivial
  3. Use ArrayFire Pro (and pay for it)
  4. Maybe Thrust could be useful? The zip iterator, perhaps?

I would appreciate some suggestions…

You don't need a cuBLAS handle for each stream, and a handle is not a wrapper for a CUDA stream. I'm not even sure what you mean by that.

A matrix multiplication of any reasonable size can fully occupy the GPU. Even if one did not, several running in parallel should fully occupy it. There is no benefit to trying to run more than 16 cuBLAS kernels in parallel: once the machine is fully utilized, creating additional concurrency gains nothing.

I believe one can write a custom kernel to perform multiple matrix-vector multiplications (on a single stream), and that's what I'm currently working on. My question is whether there is some nice way to perform hundreds of parallel matrix-vector multiplications using cuBLAS, given of course that they don't occupy the whole device. I guess cublasSetStream is not the right way to go…

If all your matrices Ai, Bi, Ci have the same dimensions for all i, then you can use cublas<t>gemmBatched.
On Kepler, in double precision, if the sizes are > 128 it is usually better to use cublasDgemm in multiple streams.

Hello, I am trying to use streams to run cublasSdgmm and SVD decompositions (cuSOLVER) in parallel, but I found there is no benefit to using more streams. Could you help me with this?
The code looks like this:

void cudamalloc_pointer_of_pointer(float **&device_data, float **&host_data,
                                   float **src_data, int batchSize,
                                   size_t arraySize) {
  float **tem_data = (float **)malloc(batchSize * sizeof(float *));
  for (int i = 0; i < batchSize; i++) {
    CUDA_CHECK(cudaMalloc((void **)&tem_data[i], arraySize));
    CUDA_CHECK(cudaMemcpy(tem_data[i], src_data[i], arraySize,
                          cudaMemcpyHostToDevice));
  }
  CUDA_CHECK(cudaMalloc((void **)&device_data, batchSize * sizeof(float *)));
  CUDA_CHECK(cudaMemcpy(device_data, tem_data, batchSize * sizeof(float *),
                        cudaMemcpyHostToDevice));
  host_data = tem_data;
}

void cudamalloc_pointer_of_pointer(float **&device_data, float **&host_data,
                                   int batchSize, size_t arraySize) {
  host_data = (float **)malloc(batchSize * sizeof(float *));
  for (int i = 0; i < batchSize; i++) {
    CUDA_CHECK(cudaMalloc((void **)&(host_data[i]), arraySize));
    CUDA_CHECK(cudaMemset(host_data[i], 0, arraySize));
  }
  CUDA_CHECK(cudaMalloc((void **)&device_data, batchSize * sizeof(float *)));
  CUDA_CHECK(cudaMemcpy(device_data, host_data, batchSize * sizeof(float *),
                        cudaMemcpyHostToDevice));
}

void batched_dgmm(cublasSideMode_t mode, int batchSize, int m, int n,
                  float **hostA, float **hostx, float **hostC) {
  cudaStream_t *streamArray =
      (cudaStream_t *)malloc(batchSize * sizeof(cudaStream_t));
  for (int i = 0; i < batchSize; i++)
    CUDA_CHECK(cudaStreamCreateWithFlags(&streamArray[i], cudaStreamNonBlocking));
  cublasHandle_t handle;
  cublasCreate(&handle);
  for (int i = 0; i < batchSize; i++) {
    cublasSetStream(handle, streamArray[i]);
    cublasSdgmm(handle, mode, m, n, hostA[i], m, hostx[i], n, hostC[i], m);
  }
  for (int i = 0; i < batchSize; i++)
    cudaStreamDestroy(streamArray[i]);
  cublasDestroy(handle);
  free(streamArray);
}

void test_batched_dgmm() {
  clock_t start_time, end_time;
  float **a = NULL, **b = NULL;
  float **hosta = NULL, **hostb = NULL;
  float **c = NULL, **hostc = NULL;
  int batchSize = 1000;
  int m = 58;
  int eigenSize = 3;
  cudamalloc_pointer_of_pointer(a, hosta, batchSize,
                                m * eigenSize * sizeof(float));
  cudamalloc_pointer_of_pointer(b, hostb, batchSize,
                                eigenSize * eigenSize * sizeof(float));
  cudamalloc_pointer_of_pointer(c, hostc, batchSize,
                                m * eigenSize * sizeof(float));
  int count = 0;
  while (count < 10) {
    start_time = clock();
    batched_dgmm(CUBLAS_SIDE_RIGHT, batchSize, m, eigenSize, hosta, hostb,
                 hostc);
    // Kernel launches are asynchronous: synchronize before reading the
    // clock, otherwise only the launch overhead is measured.
    cudaDeviceSynchronize();
    end_time = clock();
    printf("%d th: time:%f\n", count,
           (double)(end_time - start_time) / CLOCKS_PER_SEC);
    count++;
  }
}

and I found that whether or not I remove the line “cublasSetStream(handle, streamArray[i]);”, there is no difference in the time taken.
I use Visual Studio 2019 to build the code, and my GPU is an RTX 3090.

Could you help me with that too, if convenient? I think I followed the approach described in 7_CUDALibraries\batchCUBLAS\batchCUBLAS.cpp, but there seems to be no improvement. The code and details are in the post above.

You will not see any performance benefits from streams if you already have a single stream utilizing all available resources.

But while I run batched_dgmm, I am not doing any other computation. So you mean a single cublasSdgmm call will consume all of the compute resources?

You’ll need to confirm that with Nsight Systems on your particular use case.