I have been playing around with CUDA and tried to use the matrix multiplication example from the CUDA manual from within R (the statistical language). Everything appears to work “almost” fine, except that the answers are wrong. If I multiply matrices A and B, what I actually get back is B * A for square matrices; for non-square matrices I get complete nonsense. Since the way I did it differs so little from the CUDA 2.1 manual, I’m getting the impression that there might be something wrong in there…
Thanks in advance, I would really appreciate a little help.
The code is included below:
#include <stdio.h>
#include <cutil.h>
#include <math.h>
#include <stdlib.h>
#define BLOCK_SIZE 16
// kernel module
__global__ void Muld(float *A, float *B, int wA, int wB, float *C)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;
    // The element of the block sub-matrix that is computed by the thread
    float Csub = 0;
    // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
    {
        // Shared memory for the sub-matrices of A and B
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load the matrices from global memory to shared memory; each thread loads one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        __syncthreads();
        // Multiply the two matrices together; each thread computes one element of the block sub-matrix
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];
        // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write the block sub-matrix to global memory; each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
// main routine
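// this is called from R, whose C interface passes every argument as a pointer, hence the pointer-only signature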
int main(double *AA, double *BB, long *hAA, long *wAA, long *wBB, double *CC)
{
    int hA = *hAA;
    int wA = *wAA;
    int wB = *wBB;
    // pointers to host & device arrays
    float *Ah, *Bh, *Ch, *Ad, *Bd, *Cd;
    // allocate host and device memory for matrices A and B
    int size_A = hA * wA;
    int mem_size_A = size_A * sizeof(float);
    Ah = (float *)malloc(mem_size_A);
    int size_B = wA * wB;
    int mem_size_B = size_B * sizeof(float);
    Bh = (float *)malloc(mem_size_B);
    // initialize host memory
    for (int i = 0; i < size_A; i++) Ah[i] = (float)AA[i];
    for (int i = 0; i < size_B; i++) Bh[i] = (float)BB[i];
    // allocate device memory
    cudaMalloc((void**)&Ad, mem_size_A);
    cudaMalloc((void**)&Bd, mem_size_B);
    // copy host memory to device
    cudaMemcpy(Ad, Ah, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(Bd, Bh, mem_size_B, cudaMemcpyHostToDevice);
    // Allocate device memory for C
    int size_C = hA * wB;
    int mem_size_C = size_C * sizeof(float);
    cudaMalloc((void**)&Cd, mem_size_C);
    // Allocate host memory for C
    Ch = (float *)malloc(mem_size_C);
    // Compute the execution configuration assuming the matrix dimensions are multiples of BLOCK_SIZE
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(hA / dimBlock.x, wB / dimBlock.y);
    // Launch the device computation
    Muld<<<dimGrid, dimBlock>>>(Ad, Bd, wA, wB, Cd);
    // Read C from the device
    cudaMemcpy(Ch, Cd, mem_size_C, cudaMemcpyDeviceToHost);
    for (int i = 0; i < size_C; i++) CC[i] = (double)Ch[i];
    // Clean up the memory
    free(Ah);
    free(Bh);
    free(Ch);
    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Cd);
    cudaThreadExit();
}
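For what it’s worth, the answer I’m expecting is the one a plain row-major CPU multiply would give, along the lines of the sketch below (cpuMuld is just my own name for this reference check, it is not part of the code above):

// Plain CPU reference multiply, row-major layout:
// C (hA x wB) = A (hA x wA) * B (wA x wB)
void cpuMuld(const float *A, const float *B, int hA, int wA, int wB, float *C)
{
    for (int i = 0; i < hA; ++i)
    {
        for (int j = 0; j < wB; ++j)
        {
            float sum = 0.0f;
            for (int k = 0; k < wA; ++k)
                sum += A[i * wA + k] * B[k * wB + j];
            C[i * wB + j] = sum;
        }
    }
}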