Copying Rectangular matrix from Host to Device : cudaErrorInvalidPitchValue (error 12)

Hello Forum,

I am trying to copy two rectangular matrices (first matrix dimension - Mblocksize and second matrix dimension - blocksizeblocksize) and call cublasDgemm function to perform matrix multiplication. I tried following functions to copy those matrices from host to device:

  • cublasSetMatrix: this matrix gives me CUBLAS_STATUS_MAPPING_ERROR error while copying the first matrix (M*blocksize). However, when I ran the executable with cuda-memcheck then it gives me "Program hit cudaErrorInvalidPitchValue (error 12) due to "invalid pitch argument" on CUDA API call to cudaMemcpy2D" message. cublasSetMatrix works fine when M == blocksize i.e the matrix is square and cublasDgemm gives correct multiplication result.
  • cudaMemcpy and cudaMemcpy2D (with cudaMallocPitch): I also used this two function for copy the matrices. These functions does give any error message, however the cublasDgemm function call doesn't give proper multiplication result. In this case, the resultant matrix is always zero. And in this case, the cublasDgemm call gives "On entry to DGEMM parameter number 8 had an illegal value" warning message while running the executable

I think the issue is copying a rectangular matrix between host and device. Can you please advice me to resolve this issue? Any help is much appreciated.

Thanks,
Fazlay

Here is the code:

#include <stdio.h>
#include <stdlib.h>

#ifdef USE_CUBLAS
#include <cuda_runtime.h>
#include "cublas_v2.h"
#endif
#define DGEMM_RESTRICT __restrict__

void print_matrix(double *array, int row, int col)
{
    int i, j;
    for(i = 0; i < row ; i++)
    {
        for(j = 0 ; j < col ; j++)
        {
            printf("%lf ", array[i * col + j]);
        }
        printf("\n");
    }
}

int main()
{
    int M = 4096, blocksize = 8;
    int i, j;
    double* DGEMM_RESTRICT blockVectorX = (double*) malloc(sizeof(double) * M * blocksize);
	double* DGEMM_RESTRICT lambda = (double*) malloc(sizeof(double) * blocksize * blocksize);
	double* DGEMM_RESTRICT blockVectorR = (double*) malloc(sizeof(double) * M * blocksize);

    for(i = 0 ; i < M ; i++)
    {
        for(j = 0 ; j < blocksize ; j++)
        {
            blockVectorX[i * blocksize + j] = 1.0;
        }
    }

    for(i = 0 ; i < blocksize ; i++)
    {
        for(j = 0 ; j < blocksize ; j++)
        {
            lambda[i * blocksize + j] = 0.5;
        }
    }

    //======== offloading to GPU start ==========//
    #if defined( USE_CUBLAS )
        double* devPtrX;
        double* devPtrLambda;
        double* devPtrR;
        const double cudaAlpha = 1.0;
        const double cudaBeta = 0.0;
        
        cublasStatus_t cubstat;
        cublasHandle_t handle;
        cudaError_t cuberror;
        cuberror = cudaMalloc ((void**)&devPtrX, M * blocksize * sizeof(double));
        if( cuberror != 0 ){ printf("cudaMalloc Filed devPtrX\n"); return 0; }
        cuberror = cudaMalloc ((void**)&devPtrLambda, blocksize * blocksize * sizeof(double));
        if( cuberror != 0 ){ printf("cudaMalloc Filed devPtrLambda\n"); return 0; }
        cuberror = cudaMalloc ((void**)&devPtrR, M * blocksize * sizeof(double));
        if( cuberror != 0 ){ printf("cudaMalloc Filed devPtrR\n"); return 0; }

        
        cubstat = cublasCreate(&handle);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("HandleCreationFailure\n"); return 0; }
        
        //---------- copying blockVectorX from host to device ------------------
        // cuberror = cudaMemcpy(devPtrX, blockVectorX, M * blocksize * sizeof(double), cudaMemcpyHostToDevice);
        // if( cuberror != 0 ){ printf("cudaMemcpy X ==> %d\n", cuberror); return 0; }
        
        // size_t devPitch;
        // cudaError_t pitchError = cudaMallocPitch((void **)&devPtrX, &devPitch, blocksize * sizeof(double), M);
        // if( pitchError != cudaSuccess ){ printf("cudaMallocPitch ==> cudaSuccess ==> %d\n", devPitch);}
        // cudaError_t memError = cudaMemcpy2D(devPtrX, devPitch, blockVectorX, blocksize * sizeof(double), blocksize * sizeof(double), M, cudaMemcpyHostToDevice);
        // if( memError != cudaSuccess ){ printf("cudaMemcpy2D ==> cudaSuccess\n");}
 
        cubstat = cublasSetMatrix (M, blocksize, sizeof(double), blockVectorX, blocksize, devPtrX, blocksize);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("SetMatrixFailure X\n"); return 0; }
        
        //---------- copying lambda from host to device ------------------
        cubstat = cublasSetMatrix (blocksize, blocksize, sizeof(double), lambda, blocksize, devPtrLambda, blocksize);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("SetMatrixFailure lambda\n"); return 0; }
        
        //---------- calling cublasDgemm function ------------------
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, blocksize, blocksize, 
                  &cudaAlpha, devPtrX, blocksize, devPtrLambda, blocksize, &cudaBeta, devPtrR, blocksize); //XY code 1

        //---------- copying blockVectorR from device to host ------------------
        // cuberror = cudaMemcpy(blockVectorR, devPtrR, M * blocksize * sizeof(double), cudaMemcpyDeviceToHost);
        // if( cuberror != 0 ){ printf("cudaMemcpy Failed blockVectorR: %d\n", cuberror);}

        // memError = cudaMemcpy2D(blockVectorR, blocksize * sizeof(double), devPtrR, devPitch, blocksize * sizeof(double), M, cudaMemcpyDeviceToHost);
        // if( memError == cudaSuccess ){ printf("cudaMemcpy2D ==> cudaSuccess device->host\n");}

        cubstat = cublasGetMatrix (M, blocksize, sizeof(double), devPtrR, blocksize, blockVectorR, blocksize);
        if( cubstat != CUBLAS_STATUS_SUCCESS ){ printf("GetMatrixFailure blockVectorR\n");}
        
        cudaFree (devPtrX);
        cudaFree (devPtrLambda);
        cudaFree (devPtrR);
        cublasDestroy(handle);  
    #endif

    print_matrix(blockVectorR, 2, blocksize);
    
    return 0;
}

https://devtalk.nvidia.com/default/topic/1054553/gpu-accelerated-libraries/cublasdgemm-issue-copying-rectangular-matrix-from-host-to-device-cudaerrorinvalidpitchvalue-error-12-/