Hello all,
I have been using CUDA for a while, and I am now using CUBLAS for some BLAS calls to get whatever speedup I can over LAPACK/ATLAS. A large code snippet is posted below; I have tried to strip out all of the debugging prints I used, so please bear with me as I explain what happens. The code tries to avoid redundant transfers to the GPU when two consecutive DGEMM calls use the same matrix. The logic that determines whether to upload a new matrix or reuse an old one is correct. After the first time a matrix is reused and some pointers have been passed around, cublasGetMatrix() returns status 11 == CUBLAS_STATUS_MAPPING_ERROR. This is bizarre to me because the C matrix (in A*B = C) is the only matrix that is always reallocated, so what could have gone wrong with it? Thanks for helping!
[codebox]/* Host pointers of the three operands seen by the previous DGEMM_wrapper call;
 * the wrapper compares the incoming A and B against these to decide on reuse. */
double *last_A = NULL;
double *last_B = NULL;
double *last_C = NULL;
/* Dimensions (m, k, n) and leading dimensions (nca, ncb, ncc) recorded at the
 * end of the previous call.
 * NOTE(review): these are declared double; the types of m/k/n/nc* are not
 * visible in this snippet — confirm whether an integer type was intended. */
double last_m = 0;
double last_k = 0;
double last_n = 0;
double last_nca = 0;
double last_ncb = 0;
double last_ncc = 0;
/* Device buffers kept from the previous call (candidates for reuse). */
double *last_devPtrA, *last_devPtrB, *last_devPtrC;
/* Device buffers used by the call currently in progress. */
double *devPtrA, *devPtrB, *devPtrC;
…
DGEMM_wrapper() {
cublasStatus stat;
bool newA = true, newB = true;
bool oldAUse = false, oldBUse = false, oldCUse = false;
/* these mats should only be the ones I will multiply */
/* so A(m,k)*B(k,n) = C(m,n) */
if(last_A == NULL) {
stat = cublasInit();
}
if(A == last_A) {//Keep the devPtrA the same
devPtrA = last_devPtrA;
newA = false;
oldAUse = true;
}
else if(A == last_B) { //Use devPtrB from last calculation as matrix A
devPtrA = last_devPtrB;
newA = false;
oldBUse = true;
}
else if(A == last_C) { //Use devPtrC from last calculation as matrix A
devPtrA = last_devPtrC;
newA = false;
oldCUse = true;
}
if(B == last_B) {//Keep devPtrB the same
devPtrB = last_devPtrB;
newB = false;
oldBUse = true;
}
else if(B == last_A) {//Use devPtrA from last calculation as matrix B
devPtrB = last_devPtrA;
newB = false;
oldAUse = true;
}
else if(B == last_C) {//Use devPtrC from last calculation as matrix B
devPtrB = last_devPtrC;
newB = false;
oldCUse = true;
}
//If a matrix is not to be reused, that space is then freed for use in this multiply
if(!oldAUse) { stat = cublasFree(last_devPtrA); }
if(!oldBUse) { stat = cublasFree(last_devPtrB); }
if(!oldCUse) { stat = cublasFree(last_devPtrC); }
/* when we set it, have to worry about submatrices*/
/* A → a */
stat = cublasAlloc (m*n, sizeof(double), (void**)&devPtrC);
stat = cublasSetMatrix (n, m, sizeof(double), C, ncc, devPtrC, n);
if (transa == ‘n’ && transb == ‘n’) {
if(newA) { stat = cublasAlloc(m*k, sizeof(double), (void **)&devPtrA); stat = cublasSetMatrix (k, m, sizeof(double), A, nca, devPtrA, k); }
if(newB) { stat = cublasAlloc(k*n, sizeof(double), (void **)&devPtrB); stat = cublasSetMatrix (n, k, sizeof(double), B, ncb, devPtrB, n); }
cudaThreadSynchronize();
cublasDgemm(transb,transa,n,m,k,alpha,devPtrB,n,devPtrA,k,be
ta,devPtrC,n);
}
else if (transa == ‘t’ && transb == ‘n’) {
if(newA) { stat = cublasAlloc(m*k, sizeof(double), (void **)&devPtrA); stat = cublasSetMatrix (m, k, sizeof(double), A, nca, devPtrA, m); }
if(newB) { stat = cublasAlloc(k*n, sizeof(double), (void **)&devPtrB); stat = cublasSetMatrix (n, k, sizeof(double), B, ncb, devPtrB, n); }
cudaThreadSynchronize();
cublasDgemm(transb,transa,n,m,k,alpha,devPtrB,n,devPtrA,m,be
ta,devPtrC,n);
}
else if (transa == ‘n’ && transb == ‘t’) {
if(newA) { stat = cublasAlloc(m*k, sizeof(double), (void **)&devPtrA); stat = cublasSetMatrix (k, m, sizeof(double), A, nca, devPtrA, k); }
if(newB) { stat = cublasAlloc(k*n, sizeof(double), (void **)&devPtrB); stat = cublasSetMatrix (k, n, sizeof(double), B, ncb, devPtrB, k); }
cudaThreadSynchronize();
cublasDgemm(transb,transa,n,m,k,alpha,devPtrB,k,devPtrA,k,be
ta,devPtrC,n);
}
else {
if(newA) { stat = cublasAlloc(m*k, sizeof(double), (void **)&devPtrA); stat = cublasSetMatrix (m, k, sizeof(double), A, nca, devPtrA, m); }
if(newB) { stat = cublasAlloc(k*n, sizeof(double), (void **)&devPtrB); stat = cublasSetMatrix (k, n, sizeof(double), B, ncb, devPtrB, k); }
cudaThreadSynchronize();
cublasDgemm(transb,transa,n,m,k,alpha,devPtrB,k,devPtrA,m,be
ta,devPtrC,n);
}
cudaThreadSynchronize();
stat = cublasGetMatrix (n, m, sizeof(double), devPtrC, n, C, ncc);
last_A = A; last_m = m; last_nca = nca;
last_B = B; last_k = k; last_ncb = ncb;
last_C = C; last_n = n; last_ncc = ncc;
last_devPtrA = &(*devPtrA); last_devPtrB = &(*devPtrB); last_devPtrC = &(*devPtrC);
devPtrA = NULL; devPtrB = NULL; devPtrC = NULL;[/codebox]