Hello all,
Currently I have a collection of vectors and matrices that I execute within a loop using CUBLAS SGEMV. I would like to use batched CUBLAS calls for this.
Can batched CUBLAS calls be employed for SGEMV? If not, does anyone have any suggestions to convert the code to be able to employ the batched CUBLAS — perhaps translate the SGEMV to SGEMM?
A sample of the code I am using is copied below. Thank you to anyone with some idea(s).
void batchingCalls(int P, int K,
std::vector<std::vector<float> > &A,
std::vector<std::vector<float> > &b,
std::vector<std::vector<float> > &c
std::vector<float> &alpha) {
float *d_A, *d_b, *d_c;
cublasHandle_t handle;
int size = P*K;
int cnt = A.size();
cublasCreate(&handle);
for(int i = 0; i < cnt; ++i) {
cudaMalloc((void**)&d_A, size*sizeof(float));
cudaMalloc((void**)&d_b, size*sizeof(float));
cudaMalloc((void**)&d_c, K*sizeof(float));
cudaMemcpy(d_A, A[i].data(), size*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b[i].data(), size*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_c, cl[i].data(), K*sizeof(float), cudaMemcpyHostToDevice);
float beta = 1.0f;
/* call cublas SGEMV - can this be translated to SGEMM to employ batched CUBLAS ?? */
cublasSgemv(handle, CUBLAS_OP_T, P, K, &alpha[i], d_A, P, d_b, 1, &beta, d_c, 1);
cudaMemcpy(c[i].data(), d_c, K*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_A);
cudaFree(d_b);
cudaFree(d_c);
}
cublasDestroy(handle);
}