Intro CUDA - Matrix Multiplication Returning Odd Values

omdown · June 25, 2009, 6:58pm

So it looks like I’m rapidly discovering I’m not very good with CUDA… this is my second programming attempt and second total failure. Basically a reproduction of the shared memory matrix multiplication example from the Programming Guide except without using Matrix structs, I’m trying to just use arrays…

I’m getting strangely and unexpected values for my results coming back from the kernel call… I’m sure I’m just matrix address math-ing wrong somewhere… but if someone can help me figure out where I’d sure appreciate it…

I’m sorry for the length, I cut out as much as I could, the calls to buildMatrix() and fillZeroes() just fills A and B with all 1’s and C with all 0’s. When it returns, the values I have been getting back are:

C[0][0] - C[0][63] = 1

C[0][64] - C[0][99] = 2

Same thing on row C[1]…

Row 2 and 3 I get 0’s from [2][0] - [2][79] and then 4’s from [2][80]-[2][99]…

Everything else results in 0’s.

I have a feeling I’m just doing some sort of matrix address calculations wrong but I can’t see it for the life of me and this is my first attempt at this… thanks a thousand times to anyone that can help… I’m not sure what I’m doing wrong…

[codebox]

#define BLOCK_SIZE 10

#define GRID_SIZE 10

#define ARRAY_SIZE 100

/*****************************************

        CUDA SUBMATRIX GENERATOR

*****************************************/

device float* getSubMatrix(float* A, int row, int col, int stride) {

// stride = gridDim.x * blockDim.x <— represents one full row of the array

float* startOfA = (A // starting point of A

                + (stride * blockDim.y * row)   // skip over rows above this one

                + (blockDim.x * col));          // shift right to proper column

                                                // At this point we should have the value

                                                // of the first element of the BlockID

return startOfA;

}

/*****************************************

        CUDA PARALLEL MATRIX FILLER

*****************************************/

device float getElement(float* Msub, int x, int y, int stride){

return *(Msub // retrieve the value from Msub

        + (x * stride)                          // move down x rows

        + (y));                                 // move over y columns

}

/****************************************

        CUDA PARALLEL MATRIX SETTER

****************************************/

device void setElement(float* Msub, int x, int y, int stride, float value){

*(Msub // Set the value of the address pointed

    + (x * stride)                              // pointed to by Msub, skip over x rows,

    + (y))                                      // move over y columns

    = value;                                    // Assign the value

}

/*****************************************

        CUDA MATRIX MULTIPLIER

*****************************************/

global void addKernel(float* A, float* B, float* C) {

// GET OUR POSITION

int blockRow = blockIdx.x;

int blockCol = blockIdx.y;

int stride = gridDim.x * blockDim.x;  // for moving entire INDIVIDUAL rows, not block rows

                                      // in this example == 100, to move entire block rows, multiply stride

                                      // by blockDim.y to jump over all rows in the block row

// CREATE A BLOCK SIZED MATRIX TO STORE OUR ELEMENTS IN

float* CSub = getSubMatrix(C, blockRow, blockCol, stride);

float cValue = 0;

float* ASub;

float* BSub;

// USE THIS LOOP TO ITERATE OVER BLOCKS IN A AND B

for (int m = 0; m < (gridDim.x); m++){

// INITIALIZE POSITION OF A… GIVE IT A, row, AND column

    ASub = getSubMatrix(A, blockRow, m, stride);

    BSub = getSubMatrix(B, m, blockCol, stride);

// CREATE TWO BLOCK SIZED MATRICIES TO STORE A AND B IN

    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

int row = threadIdx.x;

    int col = threadIdx.y;

// FILL THE ARRAYS WITH THEIR PROPER ELEMENTS

    // BECAUSE THIS IS SHARED, ONLY NEED ONE CALL

    // EACH THREAD IN THE BLOCK WILL FILL ONE ELEMENT

    As[threadIdx.x][threadIdx.y] = getElement(ASub, row, col, stride);

    Bs[threadIdx.x][threadIdx.y] = getElement(BSub, row, col, stride);

__syncthreads();

// NOW THAT As AND Bs ARE FILLED, WE CAN BEGIN MULTIPLYING

    // AND SETTING cvalue AND WHATNOT.

    for(int i = 0; i < (BLOCK_SIZE); i++){

        cValue += As[threadIdx.x][i] * Bs[i][threadIdx.y];

    }

__syncthreads();

}

setElement(CSub, threadIdx.x, threadIdx.y, stride, cValue);

return;

}

/******************************************

        MAIN PROGRAM

******************************************/

int main()

{

srand( time (NULL) );

// CREATE 3 ARRAY_SIZE x ARRAY_SIZE arrays

float elementsA[ARRAY_SIZE][ARRAY_SIZE];

float elementsB[ARRAY_SIZE][ARRAY_SIZE];

float elementsC[ARRAY_SIZE][ARRAY_SIZE];

// FILL WITH RANDOM ELEMENTS

buildMatrix(elementsA);

buildMatrix(elementsB);

fillZeroes(elementsC);

// ALLOCATE THE ELEMENTS TO THE DEVICE

float* deviceElA;

float* deviceElB;

float* deviceElC;

cudaMalloc((void**) &deviceElA, sizeof(deviceElA));

cudaMalloc((void**) &deviceElB, sizeof(deviceElB));

cudaMalloc((void**) &deviceElC, sizeof(deviceElC));

cudaMemcpy(deviceElA, elementsA, sizeof(elementsA), cudaMemcpyHostToDevice);

cudaMemcpy(deviceElB, elementsB, sizeof(elementsB), cudaMemcpyHostToDevice);

cudaMemcpy(deviceElC, elementsC, sizeof(elementsC), cudaMemcpyHostToDevice);

dim3 dimGrid(GRID_SIZE,GRID_SIZE); // in this example, dimGrid = (10,10)

dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE); // in this example, dimBlock = (10,10)

                                                                         // should make 10000 threads, 1 per element

// DISPATCH TO THE KERNEL

addKernel <<<dimGrid, dimBlock>>>(deviceElA, deviceElB, deviceElC);

// COPY THE VALUES BACK

cudaMemcpy(elementsC, deviceElC, sizeof(elementsC), cudaMemcpyDeviceToHost);

cout<<endl<<endl<<“Testing values…”<<endl<<endl;

for(int i = 0; i < 10*BLOCK_SIZE; i++){

    for(int j = 0; j < 10*BLOCK_SIZE; j++){

        cout<<i<<","<<j<<": "<<elementsA[i][j]<<", "<<elementsB[i][j]<<", "<<elementsC[i][j]<<endl;

    }

}

cudaFree(deviceElA);

cudaFree(deviceElB);

cudaFree(deviceElC);

return 0;

}[/codebox]

omdown · June 25, 2009, 7:12pm

Bloody hell, nevermind, got it. Note to self. Stop passing sizeof pointers instead of sizeof arrays. -_- Sorry to anyone who wasted the time to read this…

Topic		Replies	Views
nVidia CUDA Programming Guide and shared memory CUDA Programming and Performance	0	1462	January 12, 2010
Non-square Matrix Multiplication, not getting any cValues back CUDA Programming and Performance	2	3389	July 1, 2009
Matrix Multiplication Buggy CUDA Programming and Performance	13	5233	May 5, 2010
Weird Matrix-Vector Results - Help? CUDA Programming and Performance	2	4930	April 6, 2010
CUDA Matrix Example CUDA Programming and Performance	5	8298	March 4, 2010
Matrix Calculations/Manipulations CUDA Programming and Performance	1	403	March 20, 2017
Is it possible to "shuffle" multiply two matrices without stride information? CUDA Programming and Performance	1	266	August 25, 2023
Example of Matrix Multiplication(from cuda book) points that i dont anderstend ... CUDA Programming and Performance	6	34329	February 3, 2008
A value of zero is either printed as 1 or 4 in a kernel for some specific case !! CUDA Programming and Performance	7	725	January 13, 2018
Problems of matrix multiplication With and without CUDA CUDA Programming and Performance	15	9993	January 18, 2012

Intro CUDA - Matrix Multiplication Returning Odd Values

Related topics