Non-square Matrix Multiplication, not getting any cValues back

Okay, so I’m slowly learning CUDA and I’m new to parallel programming, so please bear with me :)

I’m working on a program capable of multiplying non-square matrices. I successfully managed to implement the square matrix multiplication listed in the documentation (I read it through once to get an idea of what they were doing, then tried to build it without reading the code again to see if I understood it; it only needed a few touch-ups), so this post may only make sense if you’ve done or read that example. Now I’m trying to see if I can build a non-square matrix multiplier, without much success. I thought I had figured out the key, which would go something like this:

  1. Dispatch enough threads to cover the size of the largest matrix (specifically, making sure the grid dimensions cover the largest side of the largest matrix; see the launch sketch after this list).

  2. When calling getElement() to fill in the sub-matrices, calculate the global thread X position ((blockDim.x * blockIdx.x) + threadIdx.x), and if it’s greater than the number of columns of the matrix in question, return NULL.

  3. When multiplying the sub-matrices together, while looping through A values and B values, if either value == NULL, skip over it.

  4. Then when calling setElement() on Csub, if the value == NULL, return without setting it.
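To make step 1 concrete, here’s roughly the launch-geometry arithmetic I have in mind (just a sketch; the ceiling division is what covers a partial block at the edges):

[codebox]// Step 1 sketch: enough 16x16 blocks to tile the output C (SIZE_m x SIZE_p),
// using ceiling division so a partial tile still gets a whole block.
// (One conventional mapping: grid x across columns, grid y down rows.
// My actual code below sizes a square grid off the largest side instead.)
dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
dim3 dimGrid((SIZE_p + BLOCKSIZE - 1) / BLOCKSIZE,   // blocks across the columns
             (SIZE_m + BLOCKSIZE - 1) / BLOCKSIZE);  // blocks down the rows
[/codebox]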

However, this has gone horribly wrong somewhere, and hours of debugging haven’t led me to an answer… all I’m getting back are the same values I’m passing in (with everything initialized to 1, every element of C should come out as SIZE_n = 100, and I’m not seeing that). Normally I’m absolutely HORRIBLY inept with pointers, but I’m pretty sure I’ve got them all right… if anyone can see an obvious “duh” reason why this isn’t working, I’d deeply appreciate it. I’ll bake you cookies! :) I’ve tried to format this so it’s easily skimmed, and I cut out a few non-essential functions to keep the length down, but left everything necessary… Anyone see any obvious problems? If I had to guess, I’d imagine the problem is somewhere in the getSubMatrix area, but I’m not sure where… -_-

[codebox]#include <iostream>
using namespace std;

#define SIZE_m 50
#define SIZE_n 100
#define SIZE_p 150
#define BLOCKSIZE 16
#define DEBUG 0

/************************************************************
**              HOST HOST HOST HOST HOST                  **
************************************************************/

typedef struct {
    unsigned int rows;
    unsigned int cols;
    unsigned int size;      // size in bytes (rows * cols * sizeof(float))
    float *elements;
} Matrix;

__host__ void initHostMatrix(Matrix* M, int rows, int cols, float *elements){
    M->rows = rows;
    M->cols = cols;
    M->size = rows * cols * sizeof(float);
    M->elements = elements;
}

__host__ void initDeviceMatrix(Matrix* d_M, Matrix* M){
    d_M->rows = M->rows;
    d_M->cols = M->cols;
    d_M->size = M->size;
    cudaMalloc((void**) &d_M->elements, d_M->size);
    cudaMemcpy(d_M->elements, M->elements, d_M->size, cudaMemcpyHostToDevice);
}

/************************************************************
**         DEVICE DEVICE DEVICE DEVICE DEVICE             **
************************************************************/

__device__ Matrix getSubMatrix(Matrix M, int blockRow, int blockCol){
    // set our position in Matrix M...
    Matrix Msub;
    Msub.rows = blockDim.x;
    Msub.cols = blockDim.y;
    Msub.size = Msub.rows * Msub.cols;                   // note: element count here, not bytes
    Msub.elements = (M.elements                          // start at beginning of M
                    + (M.cols * blockDim.y * blockRow)   // skip over all the rows in blocks above current blockRow
                    + (blockDim.x * blockCol));          // skip over all cols prior to blockCol
    return Msub;
}
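// Worked example of the pointer arithmetic above (my numbers, not from the
// docs): with M.cols = 150 (matrix C) and BLOCKSIZE = 16, block (1,2) starts
// at M.elements + 150*16*1 + 16*2 = M.elements + 2432.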

__device__ float getElement(Matrix M, Matrix Msub, int row, int col){
    int globalX = (blockIdx.x * blockDim.x) + col;
    int globalY = (blockIdx.y * blockDim.y) + row;
    if (globalX >= M.cols || globalY >= M.rows) { return NULL; }
    else
        return *(Msub.elements + (row * M.cols) + col);
}

__device__ void setElement(Matrix M, Matrix Msub, int row, int col, float cValue){
    int globalX = (blockIdx.x * blockDim.x) + col;
    int globalY = (blockIdx.y * blockDim.y) + row;
    if (globalX >= M.cols || globalY >= M.rows) { return; }
    else {
        *(Msub.elements + (row * M.cols) + col) = cValue;
    }
}

__global__ void matrixMultKernel(Matrix A, Matrix B, Matrix C){
    // We now have A, B, and C. Their elements pointers
    // point to an area directly in GPU memory.
    int blockRow  = blockIdx.x;
    int blockCol  = blockIdx.y;
    int threadRow = threadIdx.x;
    int threadCol = threadIdx.y;

    Matrix Csub = getSubMatrix(C, blockRow, blockCol);
    float cValue = 0.0;

    // We need to make sure the block row of A does not index the sub-matrix
    // below where it should. Therefore each call to getElement computes a
    // globalX and globalY and checks whether either exceeds Matrix.cols or
    // Matrix.rows respectively; if it does, NULL goes in that location.
    // During the multiplication stage we simply check each element for NULL
    // and do not multiply any elements that equate to NULL.
    Matrix Asub;
    Matrix Bsub;
    for(int pos = 0; pos < ((SIZE_n)/(BLOCKSIZE) + 1); pos++){
        Asub = getSubMatrix(A, blockRow, pos);
        Bsub = getSubMatrix(B, pos, blockCol);

        __shared__ float as[BLOCKSIZE][BLOCKSIZE];
        __shared__ float bs[BLOCKSIZE][BLOCKSIZE];

        as[threadRow][threadCol] = getElement(A, Asub, threadRow, threadCol);
        bs[threadRow][threadCol] = getElement(B, Bsub, threadRow, threadCol);

        __syncthreads();

        for (int e = 0; e < BLOCKSIZE; e++) {
            if ((as[threadRow][e] != NULL) && (bs[e][threadCol] != NULL)){
                cValue += as[threadRow][e] * bs[e][threadCol];
            }
        }

        __syncthreads();
    }

    setElement(C, Csub, threadRow, threadCol, cValue);
    return;
}

/************************************************************
**            MAIN MAIN MAIN MAIN MAIN                    **
************************************************************/

int main()
{
    Matrix A;
    Matrix B;
    Matrix C;
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;

    float elementsA[SIZE_m][SIZE_n];
    float elementsB[SIZE_n][SIZE_p];
    float elementsC[SIZE_m][SIZE_p];

    // (initElements is one of the functions I cut for length; it just fills the array)
    initElements(*elementsA, sizeof(elementsA)/sizeof(float), 1); // these two calls init all elements to 1
    initElements(*elementsB, sizeof(elementsB)/sizeof(float), 1);
    initElements(*elementsC, sizeof(elementsC)/sizeof(float), 0); // this one inits all to 0

    initHostMatrix(&A, SIZE_m, SIZE_n, *elementsA);
    initHostMatrix(&B, SIZE_n, SIZE_p, *elementsB);
    initHostMatrix(&C, SIZE_m, SIZE_p, *elementsC);

    initDeviceMatrix(&d_A, &A);
    initDeviceMatrix(&d_B, &B);
    initDeviceMatrix(&d_C, &C);

    // FIND OUR LARGEST MATRIX //
    Matrix* largestMatrix;
    if(A.size >= B.size) { if(A.size >= C.size) largestMatrix = &A; }
    else if(B.size >= C.size) { largestMatrix = &B; }
    else { largestMatrix = &C; }

    // FIND THE LARGEST SIDE OF LARGESTMATRIX //
    // measurement is the LARGEST SIDE OF THE LARGEST MATRIX, so there is no
    // need to make the grid any larger than measurement / BLOCKSIZE + 1
    // (the +1 compensates for a partial block).
    int measurement;
    if (largestMatrix->rows > largestMatrix->cols) { measurement = largestMatrix->rows; }
    else { measurement = largestMatrix->cols; }

    unsigned int gridsize = (measurement / BLOCKSIZE) + 1;
    dim3 dimGrid(gridsize, gridsize);
    dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);

    matrixMultKernel<<<dimGrid, dimBlock>>> (d_A, d_B, d_C);

    cudaMemcpy(C.elements, d_C.elements, d_C.size, cudaMemcpyDeviceToHost);

    for(int i = 0; i < SIZE_m; i++){
        for(int j = 0; j < SIZE_p; j++){
            cout<<i<<", "<<j<<": "<<elementsC[i][j]<<"\n";
            cout.flush();
        }
    }

    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
    return 0;
}
[/codebox]

Okay, I’ve been doing a bit of debugging on this, running cuda-gdb on it… and I could be completely wrong, but it doesn’t look like enough memory is being allocated for part of one of the matrices… when I get to the part where initDeviceMatrix cudaMalloc’s the elements onto the device, these are the pointers I get:

(cuda-gdb) p A
$8 = {rows = 50, cols = 100, size = 20000, elements = 0xbfccf074}
(cuda-gdb) p B
$9 = {rows = 100, cols = 150, size = 60000, elements = 0xbfcb90e4}
(cuda-gdb) p C
$10 = {rows = 50, cols = 150, size = 30000, elements = 0xbfcc7b44}
(cuda-gdb) n
224		initDeviceMatrix(&d_A, &A);
(cuda-gdb) n
225		initDeviceMatrix(&d_B, &B);
(cuda-gdb) n
226		initDeviceMatrix(&d_C, &C);
(cuda-gdb) n
229		if(A.size >= B.size) { if(A.size >= C.size) largestMatrix = &A; }
(cuda-gdb) p d_A
$11 = {rows = 50, cols = 100, size = 20000, elements = 0x24d0000}
(cuda-gdb) p d_B
$12 = {rows = 100, cols = 150, size = 60000, elements = 0x24e0000}
(cuda-gdb) p d_C
$14 = {rows = 50, cols = 150, size = 30000, elements = 0x24f0000}

Now, when I check A, B, and C, they all seem spaced far enough apart in memory that their data shouldn’t overlap, but d_A, d_B, and d_C are much closer together, only 0x10000 apart each… have I coded something wrong somewhere? I’m requesting foo.size when I call cudaMalloc… shouldn’t this be reserving more space?
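One way to rule out a silent allocation failure is to check the return codes on the two CUDA calls inside initDeviceMatrix; a minimal sketch:

[codebox]// Sketch: return-code checks for the CUDA calls in initDeviceMatrix
// (printf needs <cstdio>). cudaMalloc and cudaMemcpy both return cudaError_t.
cudaError_t err = cudaMalloc((void**) &d_M->elements, d_M->size);
if (err != cudaSuccess)
    printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));

err = cudaMemcpy(d_M->elements, M->elements, d_M->size, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
    printf("cudaMemcpy failed: %s\n", cudaGetErrorString(err));
[/codebox]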

Good afternoon. From what I’ve been working on (Matlab and CUDA), here is a possible solution for anyone interested in generic matrix multiplication. I use the CUBLAS library, and the following code demonstrates how it’s possible to call CUBLAS directly from CUDA. I hope it serves all who are interested. Good luck to everyone in CUDA, David Lisin.

This is the matrixMult.m file:

[codebox]
dimension1 = 4000;
dimension2 = 4000;

A = diag(rand(dimension1,dimension1))*ones(1,dimension2);
B = ones(dimension2,dimension1);

disp('Matlab:')
tic
C1 = A*B;
toc

disp('Cuda:')
tic
C2 = matrixMulCUDA(A,B);
toc
[/codebox]

This is the actual cuda file (matrixMulCUDA.cu):

[codebox]
#pragma comment (lib, "C:\\CUDA\\lib\\cublas.lib")

#include "cuda.h"
#include "mex.h"
#include <string.h>
#include "cublas.h"

/* Gateway function */
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]){

    int mMatrizA;
    int nMatrizA;
    int mMatrizB;
    int nMatrizB;
    int mSalidaFuncion;
    int nSalidaFuncion;
    const mxArray *paramMatrizA;
    double *matrizA;
    const mxArray *paramMatrizB;
    double *matrizB;
    double *matrizAf, *matrizA_gpu;
    double *matrizBf, *matrizB_gpu;
    double *salidaFuncion;
    double *salidaFuncionf, *salidaFuncion_gpu;
    int j;

    /* Get the dimensions of matrizA */
    paramMatrizA = prhs[0];
    mMatrizA = mxGetM(paramMatrizA);
    nMatrizA = mxGetN(paramMatrizA);

    /* Get the dimensions of matrizB */
    paramMatrizB = prhs[1];
    mMatrizB = mxGetM(paramMatrizB);
    nMatrizB = mxGetN(paramMatrizB);

    /* The output is mMatrizA x nMatrizB */
    mSalidaFuncion = mMatrizA;
    nSalidaFuncion = nMatrizB;

    /* Create an mxArray for the output data */
    if(!(plhs[0] = mxCreateDoubleMatrix(mSalidaFuncion, nSalidaFuncion, mxREAL))){
        printf("\nERROR\n");
        return;
    }

    /* Create the GPU input and output arrays */
    cudaMalloc((void**)&matrizA_gpu, sizeof(double)*mMatrizA*nMatrizA);
    cudaMalloc((void**)&matrizB_gpu, sizeof(double)*mMatrizB*nMatrizB);
    cudaMalloc((void**)&salidaFuncion_gpu, sizeof(double)*mSalidaFuncion*nSalidaFuncion);

    /* Collect the input data */
    matrizA = mxGetPr(prhs[0]);
    matrizB = mxGetPr(prhs[1]);

    /* Copy matrizA to the device */
    matrizAf = (double*)mxMalloc(sizeof(double)*mMatrizA*nMatrizA);
    for(j = 0; j < nMatrizA*mMatrizA; j++){
        matrizAf[j] = matrizA[j];
    }
    cudaMemcpy(matrizA_gpu, matrizAf, sizeof(double)*mMatrizA*nMatrizA, cudaMemcpyHostToDevice);

    /* Copy matrizB to the device */
    matrizBf = (double*)mxMalloc(sizeof(double)*mMatrizB*nMatrizB);
    for(j = 0; j < nMatrizB*mMatrizB; j++){
        matrizBf[j] = matrizB[j];
    }
    cudaMemcpy(matrizB_gpu, matrizBf, sizeof(double)*mMatrizB*nMatrizB, cudaMemcpyHostToDevice);

    /* Allocate the double-precision output buffer on the host */
    salidaFuncionf = (double*)mxMalloc(sizeof(double)*mSalidaFuncion*nSalidaFuncion);

    /* READY TO CALL DGEMM */
    /* (earlier attempt, note it passed the host pointer matrizA instead of matrizA_gpu:) */
    //(void) cublasDgemm('n','n',mMatrizA,nMatrizB,nMatrizA,1,matrizA,mMatrizA,matrizB_gpu,mMatrizB,0,salidaFuncion_gpu,mMatrizA);
    (void) cublasDgemm('n','n',mMatrizA,nMatrizB,nMatrizA,1,matrizA_gpu,mMatrizA,matrizB_gpu,mMatrizB,0,salidaFuncion_gpu,mMatrizA);

    /* Block until the device has completed */
    cudaThreadSynchronize();

    /* Copy the result back to the host */
    cudaMemcpy(salidaFuncionf, salidaFuncion_gpu, sizeof(double)*mSalidaFuncion*nSalidaFuncion, cudaMemcpyDeviceToHost);

    /* Create a pointer to the output data */
    salidaFuncion = mxGetPr(plhs[0]);
    for(j = 0; j < nSalidaFuncion*mSalidaFuncion; j++){
        salidaFuncion[j] = (double)salidaFuncionf[j];
    }

    /* Clean up memory on device and host */
    mxFree(matrizAf);
    mxFree(matrizBf);
    mxFree(salidaFuncionf);
    cudaFree(matrizA_gpu);
    cudaFree(matrizB_gpu);
    cudaFree(salidaFuncion_gpu);
}
[/codebox]
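(Build note: I’m not including my exact build command, but with NVIDIA’s Matlab plug-in the nvmex wrapper invocation looks roughly like the sketch below; this assumes a default Windows CUDA install, so adjust the paths for your setup.)

[codebox]
nvmex -f nvmexopts.bat matrixMulCUDA.cu -IC:\CUDA\include -LC:\CUDA\lib -lcudart -lcublas
[/codebox]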

The output is:

Matlab:

Elapsed time is 21.544856 seconds.

Cuda:

Elapsed time is 2.545485 seconds.

So not too bad :)

I’m using double-precision matrix multiplication (DGEMM), but if you don’t need double precision, use single-precision SGEMM, as it has better times.
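If anyone wants the same multiplication from plain C/CUDA without Matlab, here is a minimal sketch against the same legacy char-based CUBLAS interface used above (the m/n/p sizes are just borrowed from the first post; note CUBLAS is column-major, which happens to match Matlab’s layout):

[codebox]#include <stdio.h>
#include <stdlib.h>
#include "cublas.h"

/* Minimal sketch: C = A*B in single precision via the legacy CUBLAS API.
   All-ones inputs, so every entry of C should come out equal to k. */
int main(void){
    int m = 50, k = 100, n = 150;
    float *A = (float*)malloc(m*k*sizeof(float));
    float *B = (float*)malloc(k*n*sizeof(float));
    float *C = (float*)malloc(m*n*sizeof(float));
    float *dA, *dB, *dC;
    int i;

    for(i = 0; i < m*k; i++) A[i] = 1.0f;
    for(i = 0; i < k*n; i++) B[i] = 1.0f;

    cublasInit();                                        /* start up CUBLAS */
    cublasAlloc(m*k, sizeof(float), (void**)&dA);        /* device buffers  */
    cublasAlloc(k*n, sizeof(float), (void**)&dB);
    cublasAlloc(m*n, sizeof(float), (void**)&dC);
    cublasSetMatrix(m, k, sizeof(float), A, m, dA, m);   /* host -> device  */
    cublasSetMatrix(k, n, sizeof(float), B, k, dB, k);

    /* C = 1.0*A*B + 0.0*C, no transposes, column-major leading dimensions */
    cublasSgemm('n', 'n', m, n, k, 1.0f, dA, m, dB, k, 0.0f, dC, m);

    cublasGetMatrix(m, n, sizeof(float), dC, m, C, m);   /* device -> host  */
    printf("C[0] = %f (expected %d)\n", C[0], k);

    cublasFree(dA); cublasFree(dB); cublasFree(dC);
    cublasShutdown();
    free(A); free(B); free(C);
    return 0;
}
[/codebox]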

Again, good luck to everyone working with CUDA :)