I would like to use CUDA from within R. I have managed to get it working with a very simple function. Now I was trying to do matrix multiplication. The shared object I made compiles and loads fine, but the return matrix C (where C = A * B) is always equal to 0. When I set the matrix height and width manually, very strange answers come out (I don’t understand why this affects the outcomes in the first place). I must admit that there’s a big gap between what I want to achieve and my current programming knowledge, I hope however that someone can provide me with some guidance on this.

The R part is:

dyn.load(“mmult.so”)

mmult = function(AA,BB){

hAA = dim(AA)[1]

wAA = dim(AA)[2]

hBB = dim(BB)[1]

wBB = dim(BB)[2]

CC = as.double(0)

z = .C(“main”,as.double(AA),as.double(BB),as.integer(hAA),as.integer

(wAA),as.integer(hBB),as.integer(wBB),array(as.double(CC),dim

=c(hAA,wBB)))

z[[7]]

}

The CUDA / C part is:

#include <stdio.h>

#include <cutil.h>

#include <math.h>

#define BLOCK_SIZE 16

#define AS(i, j) As[i][j]

#define BS(i, j) Bs[i][j]

// kernel module

**global** void matrixMul(float *C, float *A, float *B, int WA, int WB)

{

// Block index

int bx = blockIdx.x;

int by = blockIdx.y;

// Thread index

int tx = threadIdx.x;

int ty = threadIdx.y;

// Index of the first sub-matrix of A processed by the block

int aBegin = WA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block

int aEnd = aBegin + WA - 1;

// Step size used to iterate through the sub-matrices of A

int aStep = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block

int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B

int bStep = BLOCK_SIZE * WB;

// Csub is used to store the element of the block sub-matrix that is computed by the thread

float Csub = 0;

// Loop over all the sub-matrices of A and B required to compute the block sub-matrix

for (int a = aBegin, b = bBegin;

a <= aEnd;

a += aStep, b += bStep)

{

```
// Declaration of the shared memory array As used to store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
AS(ty, tx) = A[a + WA * ty + tx];
BS(ty, tx) = B[b + WB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together, each thread computes one element of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k) Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}
```

// Write the block sub-matrix to device memory, each thread writes one element

int c = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

C[c + WB * ty + tx] = Csub;

}

// main routine

int main(long *wAA, long *wBB, long *hAA, long *hBB, double *AA, double *BB, double *CC)

{

int WA = *wAA;

int WB = *wBB;

int HA = *hAA;

int HB = *hBB;

int WC = WB;

int HC = HA;

// pointer to host & device array

float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;

// allocate host memory for matrices A and B

int size_A = WA * HA;

int mem_size_A = sizeof(float) * size_A;

h_A = (float *)malloc(mem_size_A);

int size_B = WB * HB;

int mem_size_B = sizeof(float) * size_B;

h_B = (float *)malloc(mem_size_B);

// initialize host memory

for (int i=0; i<size_A; i++) h_A[i] = (float)AA[i];

for (int i=0; i<size_B; i++) h_B[i] = (float)BB[i];

// allocate device memory

cudaMalloc((void**) &d_A, mem_size_A);

cudaMalloc((void**) &d_B, mem_size_B);

// copy host memory to device

cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

// allocate device memory for result

int size_C = WC * HC;

int mem_size_C = sizeof(float) * size_C;

cudaMalloc((void**) &d_C, mem_size_C);

// allocate host memory for the result

h_C = (float*)malloc(mem_size_C);

// setup execution parameters

dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

dim3 grid(WC / threads.x, HC / threads.y);

// execute the kernel

matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// copy result from device to host

cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

// store the results

for (int i=0; i<size_C; i++) CC[i] = (double)h_C[i];

// clean up memory

free(h_A);

free(h_B);

free(h_C);

cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_C);

cudaThreadExit();

}