cublasDgemm issue :- Copying Rectangular matrix from Host to Device - cudaErrorInvalidPitchValue (error 12)

Hello Forum,

I am trying to copy two rectangular matrices (first matrix dimension - Mblocksize and second matrix dimension - blocksizeblocksize) and call cublasDgemm function to perform matrix multiplication. I tried following functions to copy those matrices from host to device:

  • cublasSetMatrix: this matrix gives me CUBLAS_STATUS_MAPPING_ERROR error while copying the first matrix (M*blocksize). However, when I ran the executable with cuda-memcheck then it gives me "Program hit cudaErrorInvalidPitchValue (error 12) due to "invalid pitch argument" on CUDA API call to cudaMemcpy2D" message. cublasSetMatrix works fine when M == blocksize i.e the matrix is square and cublasDgemm gives correct multiplication result.
  • cudaMemcpy and cudaMemcpy2D (with cudaMallocPitch): I also used these two functions to copy the matrices. These functions don't give any error message. However, the cublasDgemm function doesn't give proper multiplication result. In this case, the resultant matrix is always zero. And in this case, the cublasDgemm call gives "On entry to DGEMM parameter number 8 had an illegal value" warning message while running the executable

I think the issue is copying a rectangular matrix between host and device. Can you please advice me to resolve this issue? Any help is much appreciated.

Thanks,
Fazlay

Here is the code:

#include <stdio.h>
#include <stdlib.h>

#ifdef USE_CUBLAS
#include <cuda_runtime.h>
#include "cublas_v2.h"
#endif
#define DGEMM_RESTRICT __restrict__

void print_matrix(double *array, int row, int col)
{
    int i, j;
    for(i = 0; i < row ; i++)
    {
        for(j = 0 ; j < col ; j++)
        {
            printf("%lf ", array[i * col + j]);
        }
        printf("\n");
    }
}

int main()
{
    int M = 4096, blocksize = 8;
    int i, j;
    double* DGEMM_RESTRICT blockVectorX = (double*) malloc(sizeof(double) * M * blocksize);
	double* DGEMM_RESTRICT lambda = (double*) malloc(sizeof(double) * blocksize * blocksize);
	double* DGEMM_RESTRICT blockVectorR = (double*) malloc(sizeof(double) * M * blocksize);

    for(i = 0 ; i < M ; i++)
    {
        for(j = 0 ; j < blocksize ; j++)
        {
            blockVectorX[i * blocksize + j] = 1.0;
        }
    }

    for(i = 0 ; i < blocksize ; i++)
    {
        for(j = 0 ; j < blocksize ; j++)
        {
            lambda[i * blocksize + j] = 0.5;
        }
    }

    //======== offloading to GPU start ==========//
    #if defined( USE_CUBLAS )
        double* devPtrX;
        double* devPtrLambda;
        double* devPtrR;
        const double cudaAlpha = 1.0;
        const double cudaBeta = 0.0;
        
        cublasStatus_t cubstat;
        cublasHandle_t handle;
        cudaError_t cuberror;
        cuberror = cudaMalloc ((void**)&devPtrX, M * blocksize * sizeof(double));
        if( cuberror != 0 ){ printf("cudaMalloc Filed devPtrX\n"); return 0; }
        cuberror = cudaMalloc ((void**)&devPtrLambda, blocksize * blocksize * sizeof(double));
        if( cuberror != 0 ){ printf("cudaMalloc Filed devPtrLambda\n"); return 0; }
        cuberror = cudaMalloc ((void**)&devPtrR, M * blocksize * sizeof(double));
        if( cuberror != 0 ){ printf("cudaMalloc Filed devPtrR\n"); return 0; }

cubstat = cublasCreate(&handle);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("HandleCreationFailure\n"); return 0; }
        
        //---------- copying blockVectorX from host to device ------------------
        // cuberror = cudaMemcpy(devPtrX, blockVectorX, M * blocksize * sizeof(double), cudaMemcpyHostToDevice);
        // if( cuberror != 0 ){ printf("cudaMemcpy X ==> %d\n", cuberror); return 0; }
        
        // size_t devPitch;
        // cudaError_t pitchError = cudaMallocPitch((void **)&devPtrX, &devPitch, blocksize * sizeof(double), M);
        // if( pitchError != cudaSuccess ){ printf("cudaMallocPitch ==> cudaSuccess ==> %d\n", devPitch);}
        // cudaError_t memError = cudaMemcpy2D(devPtrX, devPitch, blockVectorX, blocksize * sizeof(double), blocksize * sizeof(double), M, cudaMemcpyHostToDevice);
        // if( memError != cudaSuccess ){ printf("cudaMemcpy2D ==> cudaSuccess\n");}
 
        cubstat = cublasSetMatrix (M, blocksize, sizeof(double), blockVectorX, blocksize, devPtrX, blocksize);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("SetMatrixFailure X\n"); return 0; }
        
        //---------- copying lambda from host to device ------------------
        cubstat = cublasSetMatrix (blocksize, blocksize, sizeof(double), lambda, blocksize, devPtrLambda, blocksize);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("SetMatrixFailure lambda\n"); return 0; }
        
        //---------- calling cublasDgemm function ------------------
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, blocksize, blocksize, 
                  &cudaAlpha, devPtrX, blocksize, devPtrLambda, blocksize, &cudaBeta, devPtrR, blocksize); //XY code 1

        //---------- copying blockVectorR from device to host ------------------
        // cuberror = cudaMemcpy(blockVectorR, devPtrR, M * blocksize * sizeof(double), cudaMemcpyDeviceToHost);
        // if( cuberror != 0 ){ printf("cudaMemcpy Failed blockVectorR: %d\n", cuberror);}

        // memError = cudaMemcpy2D(blockVectorR, blocksize * sizeof(double), devPtrR, devPitch, blocksize * sizeof(double), M, cudaMemcpyDeviceToHost);
        // if( memError == cudaSuccess ){ printf("cudaMemcpy2D ==> cudaSuccess device->host\n");}

        cubstat = cublasGetMatrix (M, blocksize, sizeof(double), devPtrR, blocksize, blockVectorR, blocksize);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("GetMatrixFailure blockVectorR\n");}
        
        cudaFree (devPtrX);
        cudaFree (devPtrLambda);
        cudaFree (devPtrR);
        cublasDestroy(handle);  
    #endif

    print_matrix(blockVectorR, 2, blocksize);
    
    return 0;
}

regarding the error that your code is currently having (CUBLAS_STATUS_MAPPING_ERROR), you should study the cublas documentation for cublasSetMatrix carefully:

https://docs.nvidia.com/cuda/cublas/index.html#cublassetmatrix

Let’s take a look at the failing call:

cubstat = cublasSetMatrix (M, blocksize, sizeof(double), blockVectorX, blocksize, devPtrX, blocksize);
                           ^                                                 ^                   ^
                           number of rows to transfer                        number of src rows  number of rows in dest

Does that look right to you? How could you transfer M (=4096) rows if the source and destination matrix only have blocksize (=8) rows? If you’re going to transfer the entire matrix anyway, it may be less confusing simply to use cudaMemcpy. It will work just fine.