Hello,
I tried to use batched svd based on this example https://docs.nvidia.com/cuda/cusolver/index.html#batchgesvdj-example1 using cuda 9.1 on a GeForce GTX 1070.
I have many (> 2,000,000) small 4x4 matrices to which I want to apply SVD.
The problem is that a single call to cusolverDnSgesvdjBatched in my CUDA implementation takes longer (> 4 seconds) than consecutive multi-threaded calls to the analogous OpenCV function (< 2 seconds), which I found rather counter-intuitive.
Here is an example code:
#include <stdio.h>
#include <iostream>
#include <assert.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>
#include <chrono>
// cudaErr: wrap a CUDA runtime call, reporting file/line on failure.
// do { } while (0) makes the macro a single statement, so it stays safe
// inside an unbraced if/else (the original bare-brace form would not be).
#define cudaErr(err) do { cudaCheckErr((err), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string with the call site; exits the process
// when abort is true (the default), so later calls do not fail mysteriously
// on a sticky error.
inline void cudaCheckErr(cudaError_t errCode, const char *file, int line, bool abort=true){
if (errCode != cudaSuccess){
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(errCode), file, line);
if (abort) exit(errCode);
}
}
// Fills d_A with N identical 4x4 test matrices, one per batch entry.
// Layout: matrix i occupies d_A[16*i .. 16*i + 15], column-major (lda = 4),
// matching what cusolverDnSgesvdjBatched expects.
// Grid-stride loop: correct for any launch configuration, and size_t
// indexing avoids 32-bit overflow of 16*i for very large batch counts.
__global__
void prepareSVD(
int N,
float* d_A
){
const size_t a_stride = 16; // elements per 4x4 matrix
const size_t grid_stride = (size_t)gridDim.x * blockDim.x;
for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < (size_t)N; i += grid_stride){
float* A = d_A + a_stride * i;
// column 0
A[0]  =  1.0f; A[1]  = -2.0f; A[2]  =  3.0f; A[3]  =  3.0f;
// column 1
A[4]  =  5.0f; A[5]  =  8.0f; A[6]  = -1.0f; A[7]  =  4.0f;
// column 2
A[8]  =  2.0f; A[9]  =  1.0f; A[10] =  1.0f; A[11] =  5.0f;
// column 3
A[12] = -1.0f; A[13] =  4.0f; A[14] = -3.0f; A[15] =  6.0f;
}
}
/*
 * Benchmark: batched SVD of N column-major 4x4 float matrices via
 * cusolverDnSgesvdjBatched (Jacobi method; supports m, n <= 32).
 * Flow: allocate device buffers -> fill A on device -> configure gesvdj ->
 * query workspace -> run and time the batched SVD -> check per-matrix
 * convergence -> release every resource.
 */
int main(void){
int N = 2000000;   // number of matrices in the batch
int n, m, nm;
n = m = 4;         // for this particular domain problem
nm = n*m;          // 16 elements per matrix
float* d_A = NULL;     // input matrices (gesvdj may overwrite them)
float* d_V = NULL;     // right singular vectors, one 4x4 block per matrix
float* d_U = NULL;     // left singular vectors, one 4x4 block per matrix
float* d_S = NULL;     // singular values, n per matrix
int * d_info = NULL;   // per-matrix status from gesvdj; 0 means converged
int lwork = 0;         // workspace size in floats, filled by the query below
float *d_work = NULL;  // gesvdj device workspace
cudaErr(cudaMalloc((void**)&d_A, sizeof(float)*nm*N));
cudaErr(cudaMalloc((void**)&d_V, sizeof(float)*nm*N));
cudaErr(cudaMalloc((void**)&d_U, sizeof(float)*nm*N));
cudaErr(cudaMalloc((void**)&d_S, sizeof(float)*n*N));
cudaErr(cudaMalloc((void**)&d_info, sizeof(int)*N));
int blockSize = 512;
// Fill A with values on the device side
prepareSVD<<< (N + blockSize-1)/blockSize, blockSize>>>(
N,
d_A
);
cudaErr(cudaGetLastError());       // catch launch-configuration errors
cudaErr(cudaDeviceSynchronize());  // catch asynchronous execution errors
int lda = m; /* lda >= m */
int ldu = m; /* ldu >= m */
int ldv = n; /* ldv >= n */
cusolverDnHandle_t cusolverH = NULL;
cudaStream_t stream = NULL;
gesvdjInfo_t gesvdj_params = NULL;
cusolverStatus_t status = CUSOLVER_STATUS_SUCCESS;
int* info = (int*)malloc(N*sizeof(int));
assert(info != NULL);
const float tol = 1.e-7f;   // float literal: avoid silent double->float narrowing
const int max_sweeps = 15;  // library default is 100
const int sort_svd = 0;     // don't sort singular values
const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR; // compute singular vectors
// create cusolver handle, bind a stream
status = cusolverDnCreate(&cusolverH);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaErr(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
status = cusolverDnSetStream(cusolverH, stream);
assert(CUSOLVER_STATUS_SUCCESS == status);
// configuration of gesvdj
status = cusolverDnCreateGesvdjInfo(&gesvdj_params);
assert(CUSOLVER_STATUS_SUCCESS == status);
// default value of tolerance is machine zero
status = cusolverDnXgesvdjSetTolerance(gesvdj_params, tol);
assert(CUSOLVER_STATUS_SUCCESS == status);
// default value of max. sweeps is 100
status = cusolverDnXgesvdjSetMaxSweeps(gesvdj_params, max_sweeps);
assert(CUSOLVER_STATUS_SUCCESS == status);
// disable sorting
status = cusolverDnXgesvdjSetSortEig(gesvdj_params, sort_svd);
assert(CUSOLVER_STATUS_SUCCESS == status);
// query the required device workspace size (in floats)
status = cusolverDnSgesvdjBatched_bufferSize(
cusolverH,
jobz,
m,
n,
d_A,
lda,
d_S,
d_U,
ldu,
d_V,
ldv,
&lwork,
gesvdj_params,
N
);
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaErr(cudaMalloc((void**)&d_work, sizeof(float)*lwork));
cudaErr(cudaDeviceSynchronize());  // make sure nothing is pending before timing
auto startTime = std::chrono::steady_clock::now();
// NOTE: this single call dominates the runtime being measured
status = cusolverDnSgesvdjBatched(
cusolverH,
jobz,
m,
n,
d_A,
lda,
d_S,
d_U,
ldu,
d_V,
ldv,
d_work,
lwork,
d_info,
gesvdj_params,
N
);
// gesvdj runs asynchronously on 'stream'; wait before stopping the clock
cudaErr(cudaDeviceSynchronize());
auto diff_sec = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - startTime) / 1000000000.0;
std::cout << "[TIME] Time for svd call: " << diff_sec.count() << std::endl;
assert(CUSOLVER_STATUS_SUCCESS == status);
cudaErr(cudaMemcpy(info, d_info, sizeof(int) * N, cudaMemcpyDeviceToHost));
// Actually inspect the per-matrix status instead of discarding it
int failed = 0;
for (int i = 0; i < N; ++i){
if (info[i] != 0) ++failed;
}
if (failed != 0){
fprintf(stderr, "gesvdj: %d of %d matrices did not converge\n", failed, N);
}
// Release everything; the original version leaked d_work, the cusolver
// handle, the gesvdj params and the stream.
free(info);
cudaErr(cudaFree(d_S));
cudaErr(cudaFree(d_U));
cudaErr(cudaFree(d_info));
cudaErr(cudaFree(d_A));
cudaErr(cudaFree(d_V));
cudaErr(cudaFree(d_work));
if (gesvdj_params) cusolverDnDestroyGesvdjInfo(gesvdj_params);
if (cusolverH) cusolverDnDestroy(cusolverH);
if (stream) cudaErr(cudaStreamDestroy(stream));
return 0;
}
Is this expected behaviour?
Is there a way to get this faster?
Thank you very much in advance.