Intro CUDA - Matrix Multiplication Returning Odd Values

So it looks like I’m rapidly discovering I’m not very good with CUDA… this is my second programming attempt and second total failure. Basically a reproduction of the shared memory matrix multiplication example from the Programming Guide except without using Matrix structs, I’m trying to just use arrays…

I’m getting strangely and unexpected values for my results coming back from the kernel call… I’m sure I’m just matrix address math-ing wrong somewhere… but if someone can help me figure out where I’d sure appreciate it…

I’m sorry for the length, I cut out as much as I could, the calls to buildMatrix() and fillZeroes() just fills A and B with all 1’s and C with all 0’s. When it returns, the values I have been getting back are:

C[0][0] - C[0][63] = 1

C[0][64] - C[0][99] = 2

Same thing on row C[1]

Row 2 and 3 I get 0’s from [2][0] - [2][79] and then 4’s from [2][80]-[2][99]…

Everything else results in 0’s.

I have a feeling I’m just doing some sort of matrix address calculations wrong but I can’t see it for the life of me and this is my first attempt at this… thanks a thousand times to anyone that can help… I’m not sure what I’m doing wrong…

[codebox]

#define BLOCK_SIZE 10

#define GRID_SIZE 10

#define ARRAY_SIZE 100

/*****************************************

        CUDA SUBMATRIX GENERATOR

*****************************************/

device float* getSubMatrix(float* A, int row, int col, int stride) {

// stride = gridDim.x * blockDim.x <— represents one full row of the array

float* startOfA = (A // starting point of A

                + (stride * blockDim.y * row)   // skip over rows above this one

                + (blockDim.x * col));          // shift right to proper column

                                                // At this point we should have the value

                                                // of the first element of the BlockID

return startOfA;

}

/*****************************************

        CUDA PARALLEL MATRIX FILLER

*****************************************/

device float getElement(float* Msub, int x, int y, int stride){

return *(Msub // retrieve the value from Msub

        + (x * stride)                          // move down x rows

        + (y));                                 // move over y columns

}

/****************************************

        CUDA PARALLEL MATRIX SETTER

****************************************/

device void setElement(float* Msub, int x, int y, int stride, float value){

*(Msub // Set the value of the address pointed

    + (x * stride)                              // pointed to by Msub, skip over x rows,

    + (y))                                      // move over y columns

    = value;                                    // Assign the value

}

/*****************************************

        CUDA MATRIX MULTIPLIER

*****************************************/

global void addKernel(float* A, float* B, float* C) {

// GET OUR POSITION

int blockRow = blockIdx.x;

int blockCol = blockIdx.y;

int stride = gridDim.x * blockDim.x;  // for moving entire INDIVIDUAL rows, not block rows

                                      // in this example == 100, to move entire block rows, multiply stride

                                      // by blockDim.y to jump over all rows in the block row

// CREATE A BLOCK SIZED MATRIX TO STORE OUR ELEMENTS IN

float* CSub = getSubMatrix(C, blockRow, blockCol, stride);

float cValue = 0;

float* ASub;

float* BSub;

// USE THIS LOOP TO ITERATE OVER BLOCKS IN A AND B

for (int m = 0; m < (gridDim.x); m++){

// INITIALIZE POSITION OF A… GIVE IT A, row, AND column

    ASub = getSubMatrix(A, blockRow, m, stride);

    BSub = getSubMatrix(B, m, blockCol, stride);

// CREATE TWO BLOCK SIZED MATRICIES TO STORE A AND B IN

    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

int row = threadIdx.x;

    int col = threadIdx.y;

// FILL THE ARRAYS WITH THEIR PROPER ELEMENTS

    // BECAUSE THIS IS SHARED, ONLY NEED ONE CALL

    // EACH THREAD IN THE BLOCK WILL FILL ONE ELEMENT

    As[threadIdx.x][threadIdx.y] = getElement(ASub, row, col, stride);

    Bs[threadIdx.x][threadIdx.y] = getElement(BSub, row, col, stride);

__syncthreads();

// NOW THAT As AND Bs ARE FILLED, WE CAN BEGIN MULTIPLYING

    // AND SETTING cvalue AND WHATNOT.

    for(int i = 0; i < (BLOCK_SIZE); i++){

        cValue += As[threadIdx.x][i] * Bs[i][threadIdx.y];

    }

__syncthreads();

}

setElement(CSub, threadIdx.x, threadIdx.y, stride, cValue);

return;

}

/******************************************

        MAIN PROGRAM

******************************************/

int main()

{

srand( time (NULL) );

// CREATE 3 ARRAY_SIZE x ARRAY_SIZE arrays

float elementsA[ARRAY_SIZE][ARRAY_SIZE];

float elementsB[ARRAY_SIZE][ARRAY_SIZE];

float elementsC[ARRAY_SIZE][ARRAY_SIZE];

// FILL WITH RANDOM ELEMENTS

buildMatrix(elementsA);

buildMatrix(elementsB);

fillZeroes(elementsC);

// ALLOCATE THE ELEMENTS TO THE DEVICE

float* deviceElA;

float* deviceElB;

float* deviceElC;

cudaMalloc((void**) &deviceElA, sizeof(deviceElA));

cudaMalloc((void**) &deviceElB, sizeof(deviceElB));

cudaMalloc((void**) &deviceElC, sizeof(deviceElC));

cudaMemcpy(deviceElA, elementsA, sizeof(elementsA), cudaMemcpyHostToDevice);

cudaMemcpy(deviceElB, elementsB, sizeof(elementsB), cudaMemcpyHostToDevice);

cudaMemcpy(deviceElC, elementsC, sizeof(elementsC), cudaMemcpyHostToDevice);

dim3 dimGrid(GRID_SIZE,GRID_SIZE); // in this example, dimGrid = (10,10)

dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); // in this example, dimBlock = (10,10)

                                                                         // should make 10000 threads, 1 per element

// DISPATCH TO THE KERNEL

addKernel <<<dimGrid, dimBlock>>>(deviceElA, deviceElB, deviceElC);

// COPY THE VALUES BACK

cudaMemcpy(elementsC, deviceElC, sizeof(elementsC), cudaMemcpyDeviceToHost);

cout<<endl<<endl<<“Testing values…”<<endl<<endl;

for(int i = 0; i < 10*BLOCK_SIZE; i++){

    for(int j = 0; j < 10*BLOCK_SIZE; j++){

        cout<<i<<","<<j<<": "<<elementsA[i][j]<<", "<<elementsB[i][j]<<", "<<elementsC[i][j]<<endl;

    }

}

cudaFree(deviceElA);

cudaFree(deviceElB);

cudaFree(deviceElC);

return 0;

}[/codebox]

Bloody hell, nevermind, got it. Note to self. Stop passing sizeof pointers instead of sizeof arrays. -_- Sorry to anyone who wasted the time to read this…