Streaming CuSolver SVD

Hello,

I would like to test streaming SVD operations on a GeForce 930MX.
I am wondering why in the following code, the streams are working concurrently even for the smallest matrices. In the profiler, the streams execute the SVD one after the other and there is a significant gap between each execution (200ms, in the 500x500 case), which I also don’t understand.

Anything wrong with this code?
Thanks

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <time.h>
#include "helper_cuda.h"


void benchmarks(int m, int n, int streamSize, int timeSize) {

    assert(m >= n);
    cusolverDnHandle_t cusolverHandle;
    cudaStream_t stream[streamSize];

    const int lda = m; /* lda >= m */
    const int ldu = m; /* ldu >= m */
    const int ldv = n; /* ldv >= n */

    float *A = NULL;
    float *U = NULL;
    float *V = NULL;
    float *S = NULL;
    float *d_A = NULL; /* lda-by-n-by-streamSize */
    float *d_U = NULL; /* ldu-by-m-by-streamSize */
//  float *d_V = NULL; /* ldv-by-n-by-streamSize */
    float *d_S = NULL; /* minmn-by-streamSizee */
    int *d_info = NULL; /* streamSize */
    int lwork = 0;       /* size of workspace */
    float *d_work = NULL; /* device workspace for gesvd */
    float *d_rwork = NULL; /* device workspace for gesvd */
    int info[streamSize];       /* info = [info0 ; info1 ; ...] */

    int iStream;
    struct timespec start, stop;

    printf("GESVD: size A: %f MB\n", lda * n * streamSize * sizeof(float) / 1e6);
    checkCudaErrors(cudaMallocHost((void**)&A, lda * n * streamSize * sizeof(float)));
    checkCudaErrors(cudaMallocHost((void**)&U, ldu * n * streamSize * sizeof(float)));
//  checkCudaErrors(cudaMallocHost((void**)&V, ldv * n * streamSize * sizeof(float)));
    checkCudaErrors(cudaMallocHost((void**)&S, n * streamSize * sizeof(float)));

    for (int i = 0; i < lda * n * streamSize; i++)
        A[i] = rand() / (float)RAND_MAX;

    /* step 1: create cusolver handle, bind a stream  */
    checkCudaErrors(cusolverDnCreate(&cusolverHandle));
    for (iStream = 0; iStream < streamSize; iStream++) {
        checkCudaErrors(cudaStreamCreateWithFlags(&stream[iStream], cudaStreamNonBlocking));
    }

    /* step 3: Allocate device arrays */
    checkCudaErrors(cudaMalloc((void**)&d_A   , sizeof(float) * lda * n * streamSize));
    checkCudaErrors(cudaMalloc((void**)&d_U   , sizeof(float) * ldu * n * streamSize));
//  checkCudaErrors(cudaMalloc((void**)&d_V   , sizeof(float) * ldv * n * streamSize));
    checkCudaErrors(cudaMalloc((void**)&d_S   , sizeof(float) * n * streamSize));
    checkCudaErrors(cudaMalloc((void**)&d_info, sizeof(int  ) * streamSize));

    /* step 4: query working space */
    checkCudaErrors(cusolverDnSetStream(cusolverHandle, stream[0]));
    checkCudaErrors(cusolverDnSgesvd_bufferSize(
            cusolverHandle,
            m,
            n,
            &lwork));
    checkCudaErrors(cudaMalloc((void**)&d_work, sizeof(float) * lwork * streamSize));
    checkCudaErrors(cudaMalloc((void**)&d_rwork, sizeof(float) * (n - 1) * streamSize));

    int nfailure = 0;
    float min_elapsed = 1e300;
    float elapsed;

    /* step 5: compute singular values of A0 and A1 */
    for (int iTime = 0; iTime < timeSize; iTime++) {

        clock_gettime(CLOCK_REALTIME, &start);

        for (iStream = 0; iStream < streamSize; iStream++)
            checkCudaErrors(cudaMemcpyAsync(&d_A[lda * n * iStream],
                    &A[lda * n * iStream],
                    sizeof(float)*lda*n,
                    cudaMemcpyHostToDevice,
                    stream[iStream]));

        for (iStream = 0; iStream < streamSize; iStream++) {
            checkCudaErrors(cusolverDnSetStream(cusolverHandle, stream[iStream]));
            checkCudaErrors(cusolverDnSgesvd(
                    cusolverHandle,
                    'S',
                    'N',
                    m,
                    n,
                    &d_A[lda * n * iStream],
                    lda,
                    &d_S[n * iStream],
                    &d_U[ldu * n * iStream],
                    ldu,
                    NULL,
                    ldv,
                    &d_work[lwork * iStream],
                    lwork,
                    &d_rwork[(n - 1) * iStream],
                    &d_info[iStream]));
        }

        for (iStream=0; iStream < streamSize; iStream++){
            checkCudaErrors(cudaStreamSynchronize(stream[iStream]));
        }
        checkCudaErrors(cudaMemcpy(U   , d_U   , sizeof(float)*ldu*n*streamSize, cudaMemcpyDeviceToHost));
//      checkCudaErrors(cudaMemcpy(V   , d_V   , sizeof(float)*ldv*n*streamSize, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy(S   , d_S   , sizeof(float)*n*streamSize, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaMemcpy(info, d_info, sizeof(int) * streamSize, cudaMemcpyDeviceToHost));

        for (iStream = 0; iStream < streamSize; iStream++)
            if (info[iStream] > 0)
                nfailure++;

        clock_gettime(CLOCK_REALTIME, &stop);

        elapsed = (stop.tv_sec - start.tv_sec) * 1e3 + (stop.tv_nsec - start.tv_nsec) / 1e6;

        if (elapsed < min_elapsed)
            min_elapsed = elapsed;

        printf("Try %i: %f ms\n", iTime, elapsed / streamSize);

    }

    printf("Elapsed time %ix%i: %10.1fms\n", m, n, min_elapsed / streamSize);

    /* Check for convergence */
    if (nfailure > 0)
        printf("The algorithm computing SVD failed to converge %i times.\n", nfailure );

    /* free resources */
    checkCudaErrors(cudaFreeHost(A));
    checkCudaErrors(cudaFreeHost(U));
    checkCudaErrors(cudaFreeHost(V));
    checkCudaErrors(cudaFreeHost(S));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_U));
//  checkCudaErrors(cudaFree(d_V));
    checkCudaErrors(cudaFree(d_S));
    checkCudaErrors(cudaFree(d_info));
    checkCudaErrors(cudaFree(d_work));
    checkCudaErrors(cudaFree(d_rwork));

    checkCudaErrors(cusolverDnDestroy(cusolverHandle));
    for (iStream = 0; iStream < streamSize; iStream++) {
        checkCudaErrors(cudaStreamDestroy(stream[iStream]));
    }
    checkCudaErrors(cudaDeviceReset());
}

int main() {
//    benchmarks(   10,   10, 3, 5);
//    benchmarks(  100,  100, 3, 5);
//    benchmarks(  200,  100, 3, 5);
//    benchmarks(  200,  200, 3, 5);
//    benchmarks(  500,  100, 3, 5);
//    benchmarks(  500,  200, 3, 5);
      benchmarks(  500,  500, 3, 5);
//    benchmarks( 1000,  100, 3, 5);
//    benchmarks( 1024,  512, 3, 5);
//    benchmarks( 1000, 1000, 3, 5);
//    benchmarks( 2000,  100, 3, 5);
//    benchmarks( 2000,  500, 3, 5);
//    benchmarks( 6000,  100, 3, 5);
//    benchmarks( 6000,  500, 3, 5);
//    benchmarks( 8000,  100, 3, 5);
//    benchmarks( 8000,  500, 3, 5);
//    benchmarks(10000,  100, 3, 5);
//    benchmarks(10000,  500, 3, 5);
}