cublasDgemm is giving weird CUBLAS_STATUS_INTERNAL_ERROR

Hello Forum!
I am trying to do the following simple operations on some tall skinny matrices (# of rows >> # of cols):

blockVectorR = blockVectorX * lambda
blockVectorR = blockVectorX - blockVectorR
activeBlockVectorR = the active columns of blockVectorR (see activeMask; column i is active if activeMask[i] = 1)
XtactR = blockVectorX’* activeBlockVectorR

[Dimensions: blockVectorR, blockVectorX => M x blocksize; activeBlockVectorR => M x currentBlockSize; lambda => blocksize x blocksize; XtactR => blocksize x currentBlockSize]

I am using both the cublasDgemm routine and OpenMP target offloading for this purpose. The weird thing is that the very last cublasDgemm call [which performs a transpose(X)*Y type operation] returns CUBLAS_STATUS_INTERNAL_ERROR. However, the final output of the code block below matches two other implementations of the same operations [one using the Intel MKL library with CPU kernels, and the other using the NVBLAS dgemm routine with the same GPU kernels as below]. I have attached the outputs here.

Can anyone please point out what I am doing wrong, or whether I am missing something here? I really appreciate your help.
Thanks
Fazlay

#include <stdio.h>
#include <stdlib.h>

#ifdef USE_CUBLAS
#include <cuda_runtime.h>
#include "cublas_v2.h"
#endif
#define DGEMM_RESTRICT __restrict__

//Custom kernels for GPU
void getActiveBlockVector_GPU_v2(double *activeBlockVectorR, int *activeMask, double *blockVectorR, int M, int blocksize, int currentBlockSize);
void mat_sub_GPU_v2(double *src1, double *src2, double *dst, const int row, const int col);

//Utility Kernel
void print_matrix(double *array, int row, int col);

int main()
{
    /*
     * Reproduces the sequence:
     *   blockVectorR       = blockVectorX * lambda
     *   blockVectorR       = blockVectorX - blockVectorR
     *   activeBlockVectorR = active columns of blockVectorR (per activeMask)
     *   XtactR             = blockVectorX' * activeBlockVectorR
     * using cuBLAS for the GEMMs and OpenMP target kernels for the rest.
     *
     * Row-major host data is handed to column-major cuBLAS by treating each
     * row-major matrix as its own transpose; the operand order/op flags below
     * account for that, so no explicit transposition copies are needed.
     */
    int M = 4985944, blocksize = 8;
    int i, j;
    int currentBlockSize = 7;
    const double cudaAlpha = 1.0;
    const double cudaBeta = 0.0;

    /* Column i of blockVectorR is active when activeMask[i] == 1. */
    int activeMask[8] = {1, 1, 1, 1, 1, 1, 1, 0};

    /* Compute byte counts in size_t so M * blocksize cannot overflow int math. */
    const size_t xBytes      = sizeof(double) * (size_t)M * (size_t)blocksize;
    const size_t lambdaBytes = sizeof(double) * (size_t)blocksize * (size_t)blocksize;
    const size_t actBytes    = sizeof(double) * (size_t)M * (size_t)currentBlockSize;
    const size_t xtBytes     = sizeof(double) * (size_t)blocksize * (size_t)currentBlockSize;

    double* DGEMM_RESTRICT blockVectorX = (double*) malloc(xBytes);
    double* DGEMM_RESTRICT lambda = (double*) malloc(lambdaBytes);
    double* DGEMM_RESTRICT blockVectorR = (double*) malloc(xBytes);
    double* DGEMM_RESTRICT activeBlockVectorR = (double*) malloc(xBytes);

    double* DGEMM_RESTRICT XtactR = (double*) malloc(xtBytes);
    double* DGEMM_RESTRICT cblas_XtactR = (double*) malloc(xtBytes);
    double* DGEMM_RESTRICT nvblas_XtactR = (double*) malloc(xtBytes);

    /* ~640 MB of host memory total — fail loudly instead of segfaulting later. */
    if( blockVectorX == NULL || lambda == NULL || blockVectorR == NULL ||
        activeBlockVectorR == NULL || XtactR == NULL ||
        cblas_XtactR == NULL || nvblas_XtactR == NULL )
    { printf("host malloc failed\n"); return 0; }

    srand(0);

    /* Fill X (row-major, M x blocksize) with uniform [0,1) values. */
    for(i = 0 ; i < M ; i++)
    {
        for(j = 0 ; j < blocksize ; j++)
        {
            blockVectorX[i * blocksize + j] = (double)rand()/(double)RAND_MAX;
        }
    }

    /* Fill lambda (row-major, blocksize x blocksize). */
    for(i = 0 ; i < blocksize ; i++)
    {
        for(j = 0 ; j < blocksize ; j++)
        {
            lambda[i * blocksize + j] = (double)rand()/(double)RAND_MAX;
        }
    }

    //======== offloading to GPU start ==========//
    #if defined(USE_CUBLAS)
        double *d_blockVectorX, *d_lambda, *d_blockVectorR, *d_activeBlockVectorR, *devPtrXtR, *devPtrXtactR;
        int *d_activeMask;
        cudaError_t cuberror;

        cuberror = cudaMalloc ((void**)&d_blockVectorX, xBytes);
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed d_blockVectorX\n"); return 0; }
        cuberror = cudaMalloc ((void**)&d_lambda, lambdaBytes);
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed d_lambda\n"); return 0; }
        cuberror = cudaMalloc ((void**)&d_blockVectorR, xBytes);
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed d_blockVectorR\n"); return 0; }
        cuberror = cudaMalloc ((void**)&d_activeBlockVectorR, actBytes);
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed d_activeBlockVectorR\n"); return 0; }
        cuberror = cudaMalloc ((void**)&devPtrXtR, lambdaBytes);
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed devPtrXtR\n"); return 0; }
        cuberror = cudaMalloc ((void**)&devPtrXtactR, xtBytes);
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed devPtrXtactR\n"); return 0; }
        cuberror = cudaMalloc ((void**)&d_activeMask, blocksize * sizeof(int));
        if( cuberror != cudaSuccess ){ printf("cudaMalloc Failed d_activeMask\n"); return 0; }

        cuberror = cudaMemcpy((void *)d_blockVectorX, blockVectorX, xBytes, cudaMemcpyHostToDevice);
        if( cuberror != cudaSuccess ){ printf("cudaMemcpy X ==> %d\n", cuberror); return 0; }
        cuberror = cudaMemcpy((void *)d_activeMask, activeMask, blocksize * sizeof(int), cudaMemcpyHostToDevice);
        if( cuberror != cudaSuccess ){ printf("cudaMemcpy d_activeMask ==> %d\n", cuberror); return 0; }
        cuberror = cudaMemcpy((void *)d_lambda, lambda, lambdaBytes, cudaMemcpyHostToDevice);
        if( cuberror != cudaSuccess ){ printf("cudaMemcpy lambda ==> %d\n", cuberror); return 0; }

        cublasStatus_t cubstat;
        cublasHandle_t handle;
        cublasStatus_t gemmStat;
        cubstat = cublasCreate(&handle);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("HandleCreationFailure\n"); return 0; }

        // blockVectorR = blockVectorX * lambda
        // Column-major view of the row-major data: R' = lambda' * X'.
        gemmStat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, blocksize, M, blocksize,
                  &cudaAlpha, d_lambda, blocksize, d_blockVectorX, blocksize, &cudaBeta, d_blockVectorR, blocksize);
        if(gemmStat != CUBLAS_STATUS_SUCCESS) {printf("** cublasDgemm status-1: %d\n", gemmStat);}

        // blockVectorR = blockVectorX - blockVectorR
        mat_sub_GPU_v2(d_blockVectorX, d_blockVectorR, d_blockVectorR, M, blocksize);

        // activeBlockVectorR = active columns of blockVectorR (see activeMask)
        getActiveBlockVector_GPU_v2(d_activeBlockVectorR, d_activeMask, d_blockVectorR, M, blocksize, currentBlockSize);

        // NOTE(review): the OpenMP target regions above run through the OpenMP
        // runtime, which may use a different stream than cuBLAS. Synchronize
        // and surface any deferred device error here so a failure in those
        // kernels is not misreported as an INTERNAL_ERROR by the next GEMM.
        cuberror = cudaDeviceSynchronize();
        if( cuberror != cudaSuccess ){ printf("cudaDeviceSynchronize after kernels ==> %d\n", cuberror); return 0; }

        // XtactR = blockVectorX' * activeBlockVectorR
        // Column-major view: XtactR' = actR' * (X')^T.
        gemmStat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, currentBlockSize, blocksize, M,
                  &cudaAlpha, d_activeBlockVectorR, currentBlockSize, d_blockVectorX, blocksize, &cudaBeta, devPtrXtactR, currentBlockSize);
        if(gemmStat != CUBLAS_STATUS_SUCCESS) {printf("** cublasDgemm status-2: %d\n", gemmStat);}

        // copying result back to CPU (blocking, so no extra sync is needed)
        cuberror = cudaMemcpy(XtactR, devPtrXtactR, xtBytes, cudaMemcpyDeviceToHost);
        if( cuberror != cudaSuccess ){ printf("cudaMemcpy Failed XtactR: %d\n", cuberror);}

        printf("\n=========== Printing XtactR matrix (GPU) ==============\n");
        print_matrix(XtactR, blocksize, currentBlockSize);
    #endif

    #if defined(USE_CUBLAS)
        // Release every device allocation (the original leaked
        // d_activeBlockVectorR, devPtrXtactR, and d_activeMask).
        cudaFree (d_blockVectorX);
        cudaFree (d_lambda);
        cudaFree (d_blockVectorR);
        cudaFree (d_activeBlockVectorR);
        cudaFree (devPtrXtR);
        cudaFree (devPtrXtactR);
        cudaFree (d_activeMask);
        cublasDestroy(handle);
    #endif

    // Release host allocations (the original leaked all of them).
    free(blockVectorX);
    free(lambda);
    free(blockVectorR);
    free(activeBlockVectorR);
    free(XtactR);
    free(cblas_XtactR);
    free(nvblas_XtactR);

    return 0;
}

//GPU custom kernels
// Compact the active columns of blockVectorR (row-major, M x blocksize) into
// activeBlockVectorR (row-major, M x currentBlockSize). Column j is copied
// when activeMask[j] == 1; active columns keep their relative order.
// All three pointers are device pointers (is_device_ptr).
void getActiveBlockVector_GPU_v2(double *activeBlockVectorR, int *activeMask, double *blockVectorR, int M, int blocksize, int currentBlockSize)
{
    #pragma omp target is_device_ptr(activeBlockVectorR, blockVectorR, activeMask)
    #pragma omp teams distribute parallel for default(shared)
    for(int row = 0 ; row < M ; row++)
    {
        // Each row is compacted independently; 'out' tracks the next free
        // slot in this row of the destination.
        int out = 0;
        for(int col = 0 ; col < blocksize ; col++)
        {
            if(activeMask[col] == 1)
            {
                activeBlockVectorR[row * currentBlockSize + out] = blockVectorR[row * blocksize + col];
                out++;
            }
        }
    }
}

// Element-wise dst = src1 - src2 over a row x col matrix. dst may alias
// either source (each element is read before it is written). All pointers
// are device pointers (is_device_ptr).
void mat_sub_GPU_v2(double *src1, double *src2, double *dst, const int row, const int col)
{
    // The matrices are dense and contiguous, so a single flat loop over
    // row*col elements is equivalent to the nested row/column traversal.
    #pragma omp target is_device_ptr(src1, src2, dst)
    #pragma omp teams distribute parallel for default(shared)
    for(int idx = 0 ; idx < row * col ; idx++)
    {
        dst[idx] = src1[idx] - src2[idx];
    }
}

//Utility kernel
// Print a row-major row x col matrix to stdout, one matrix row per line,
// values formatted with "%lf" and separated by single spaces.
void print_matrix(double *array, int row, int col)
{
    for(int r = 0 ; r < row ; r++)
    {
        // Pointer to the start of the current row.
        const double *line = array + r * col;
        for(int c = 0 ; c < col ; c++)
        {
            printf("%lf ", line[c]);
        }
        printf("\n");
    }
}
  • P.S.: Cross-posted.