I would like to use CUDA from within R. I have managed to get it working with a very simple function. Now I was trying to do matrix multiplication. The shared object I made compiles and loads fine, but the return matrix C (where C = A * B) is always equal to 0. When I set the matrix height and width manually, very strange answers come out (I don’t understand why this affects the outcomes in the first place). I must admit that there’s a big gap between what I want to achieve and my current programming knowledge, I hope however that someone can provide me with some guidance on this.
The R part is:
dyn.load(“mmult.so”)
mmult = function(AA,BB){
hAA = dim(AA)[1]
wAA = dim(AA)[2]
hBB = dim(BB)[1]
wBB = dim(BB)[2]
CC = as.double(0)
z = .C(“main”,as.double(AA),as.double(BB),as.integer(hAA),as.integer
(wAA),as.integer(hBB),as.integer(wBB),array(as.double(CC),dim
=c(hAA,wBB)))
z[[7]]
}
The CUDA / C part is:
#include <stdio.h>
#include <cutil.h>
#include <math.h>
#define BLOCK_SIZE 16
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
// kernel module
global void matrixMul(float *C, float *A, float *B, int WA, int WB)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = WA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + WA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * WB;
// Csub is used to store the element of the block sub-matrix that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{
// Declaration of the shared memory array As used to store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
AS(ty, tx) = A[a + WA * ty + tx];
BS(ty, tx) = B[b + WB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together, each thread computes one element of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k) Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory, each thread writes one element
int c = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + WB * ty + tx] = Csub;
}
// main routine
int main(long *wAA, long *wBB, long *hAA, long *hBB, double *AA, double *BB, double *CC)
{
int WA = *wAA;
int WB = *wBB;
int HA = *hAA;
int HB = *hBB;
int WC = WB;
int HC = HA;
// pointer to host & device array
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
// allocate host memory for matrices A and B
int size_A = WA * HA;
int mem_size_A = sizeof(float) * size_A;
h_A = (float *)malloc(mem_size_A);
int size_B = WB * HB;
int mem_size_B = sizeof(float) * size_B;
h_B = (float *)malloc(mem_size_B);
// initialize host memory
for (int i=0; i<size_A; i++) h_A[i] = (float)AA[i];
for (int i=0; i<size_B; i++) h_B[i] = (float)BB[i];
// allocate device memory
cudaMalloc((void**) &d_A, mem_size_A);
cudaMalloc((void**) &d_B, mem_size_B);
// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
// allocate device memory for result
int size_C = WC * HC;
int mem_size_C = sizeof(float) * size_C;
cudaMalloc((void**) &d_C, mem_size_C);
// allocate host memory for the result
h_C = (float*)malloc(mem_size_C);
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);
// execute the kernel
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
// copy result from device to host
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
// store the results
for (int i=0; i<size_C; i++) CC[i] = (double)h_C[i];
// clean up memory
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaThreadExit();
}