I have been playing around with CUDA and tried to use the matrix multiplication example from the CUDA manual from within R (the statistical language). Everything appears to work almost fine, except that the answers are wrong. If I multiply matrices A and B, what I actually get is B * A for square matrices, and for non-square matrices I get complete nonsense. Since what I did differs so little from the CUDA 2.1 manual, I'm getting the impression that there might be something wrong in there…
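One thing I have started to suspect (my own guess, not something from the manual): R stores matrices in column-major order, while the kernel indexes them row-major. Reading a column-major matrix as row-major gives you its transpose, and since (A·B)^T = B^T·A^T, multiplying the transposed inputs and reading the result back column-major would produce exactly B * A for square matrices. A tiny illustration of the two conventions:

```
#include <stdio.h>

int main(void)
{
    /* The same 4 numbers seen as a 2x2 matrix in both conventions. */
    float m[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    int n = 2, i = 0, j = 1; /* element (row 0, column 1) */

    /* Row-major (what the CUDA kernel assumes): element (i,j) is m[i*n + j]. */
    printf("row-major    (0,1) = %f\n", m[i * n + j]); /* prints 2.0 */

    /* Column-major (what R hands over): element (i,j) is m[j*n + i]. */
    printf("column-major (0,1) = %f\n", m[j * n + i]); /* prints 3.0 */

    return 0;
}
```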

Thanks in advance, I would really appreciate a little help.

The code is included below:

```
#include <stdio.h>
#include <cutil.h>
#include <math.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

// kernel module
__global__ void Muld(float *A, float *B, int wA, int wB, float *C)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

    // The element of the block sub-matrix that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B required
    // to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // Shared memory for the sub-matrices of A and B
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from global memory to shared memory;
        // each thread loads one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to global memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
```
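To be able to sanity-check the device output, I also compare against a plain CPU multiply (my own reference, not from the manual, using the same row-major convention the kernel assumes):

```
// Host reference: C = A * B with A (hA x wA) and B (wA x wB), row-major.
void MuldHost(const float *A, const float *B, int hA, int wA, int wB, float *C)
{
    for (int i = 0; i < hA; ++i)
        for (int j = 0; j < wB; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < wA; ++k)
                sum += A[i * wA + k] * B[k * wB + j];
            C[i * wB + j] = sum;
        }
}
```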

```
// main routine
int main(double *AA, double *BB, long *hAA, long *wAA, long *wBB, double *CC)
{
    int hA = *hAA;
    int wA = *wAA;
    int wB = *wBB;

    // pointers to the host & device arrays
    float *Ah, *Bh, *Ch, *Ad, *Bd, *Cd;

    // allocate host and device memory for matrices A and B
    int size_A = hA * wA;
    int mem_size_A = size_A * sizeof(float);
    Ah = (float *)malloc(mem_size_A);

    int size_B = wA * wB;
    int mem_size_B = size_B * sizeof(float);
    Bh = (float *)malloc(mem_size_B);

    // initialize host memory (R passes doubles, the device wants floats)
    for (int i = 0; i < size_A; i++) Ah[i] = (float)AA[i];
    for (int i = 0; i < size_B; i++) Bh[i] = (float)BB[i];

    // allocate device memory
    cudaMalloc((void **)&Ad, mem_size_A);
    cudaMalloc((void **)&Bd, mem_size_B);

    // copy host memory to device
    cudaMemcpy(Ad, Ah, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Bd, Bh, mem_size_B, cudaMemcpyHostToDevice);

    // allocate device memory for C
    int size_C = hA * wB;
    int mem_size_C = size_C * sizeof(float);
    cudaMalloc((void **)&Cd, mem_size_C);

    // allocate host memory for C
    Ch = (float *)malloc(mem_size_C);

    // compute the execution configuration,
    // assuming the matrix dimensions are multiples of BLOCK_SIZE
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(hA / dimBlock.x, wB / dimBlock.y);

    // launch the device computation
    Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);

    // read C from the device
    cudaMemcpy(Ch, Cd, mem_size_C, cudaMemcpyDeviceToHost);
    for (int i = 0; i < size_C; i++) CC[i] = (double)Ch[i];

    // clean up the memory
    free(Ah);
    free(Bh);
    free(Ch);
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
    cudaThreadExit();
}
```
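One more thing I noticed while re-reading the manual: I may have swapped the grid dimensions. If I read it correctly, the manual computes dimGrid(wB / dimBlock.x, hA / dimBlock.y), so x runs across the width of C and y down its height, whereas I wrote hA / dimBlock.x and wB / dimBlock.y. For square matrices the two coincide, which might be why only the non-square case turns into complete nonsense. I also do no error checking at all; a minimal guard around the launch (my own sketch; cudaGetLastError and cudaGetErrorString are standard CUDA runtime calls) would be:

```
// execution configuration as in the manual: x indexes columns of C, y its rows
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(wB / dimBlock.x, hA / dimBlock.y);

// launch, then check whether the kernel actually ran
Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
```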