Problem with cuda in R

I would like to use CUDA from within R. I have managed to get it working with a very simple function. Now I was trying to do matrix multiplication. The shared object I made compiles and loads fine, but the return matrix C (where C = A * B) is always equal to 0. When I set the matrix height and width manually, very strange answers come out (I don’t understand why this affects the outcomes in the first place). I must admit that there’s a big gap between what I want to achieve and my current programming knowledge, I hope however that someone can provide me with some guidance on this.

The R part is:

dyn.load(“mmult.so”)

mmult = function(AA,BB){
hAA = dim(AA)[1]
wAA = dim(AA)[2]
hBB = dim(BB)[1]
wBB = dim(BB)[2]
CC = as.double(0)
z = .C(“main”,as.double(AA),as.double(BB),as.integer(hAA),as.integer
(wAA),as.integer(hBB),as.integer(wBB),array(as.double(CC),dim
=c(hAA,wBB)))
z[[7]]
}

The CUDA / C part is:

#include <stdio.h>
#include <cutil.h>
#include <math.h>

#define BLOCK_SIZE 16

#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]

// kernel module

global void matrixMul(float *C, float *A, float *B, int WA, int WB)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

// Index of the first sub-matrix of A processed by the block
int aBegin = WA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + WA - 1;

// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * WB;

// Csub is used to store the element of the block sub-matrix that is computed by the thread
float Csub = 0;

// Loop over all the sub-matrices of A and B required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep)
{

// Declaration of the shared memory array As used to store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

// Declaration of the shared memory array Bs used to store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from device memory to shared memory; each thread loads one element of each matrix
AS(ty, tx) = A[a + WA * ty + tx];
BS(ty, tx) = B[b + WB * ty + tx];

// Synchronize to make sure the matrices are loaded
__syncthreads();

// Multiply the two matrices together, each thread computes one element of the block sub-matrix
for (int k = 0; k < BLOCK_SIZE; ++k)   Csub += AS(ty, k) * BS(k, tx);

// Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
__syncthreads();
}

// Write the block sub-matrix to device memory, each thread writes one element
int c = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + WB * ty + tx] = Csub;
}

// main routine

int main(long *wAA, long *wBB, long *hAA, long *hBB, double *AA, double *BB, double *CC)
{

int WA = *wAA;
int WB = *wBB;
int HA = *hAA;
int HB = *hBB;
int WC = WB;
int HC = HA;

// pointer to host & device array
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;

// allocate host memory for matrices A and B
int size_A = WA * HA;
int mem_size_A = sizeof(float) * size_A;
h_A = (float *)malloc(mem_size_A);
int size_B = WB * HB;
int mem_size_B = sizeof(float) * size_B;
h_B = (float *)malloc(mem_size_B);

// initialize host memory
for (int i=0; i<size_A; i++) h_A[i] = (float)AA[i];
for (int i=0; i<size_B; i++) h_B[i] = (float)BB[i];

// allocate device memory
cudaMalloc((void**) &d_A, mem_size_A);
cudaMalloc((void**) &d_B, mem_size_B);

// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

// allocate device memory for result
int size_C = WC * HC;
int mem_size_C = sizeof(float) * size_C;
cudaMalloc((void**) &d_C, mem_size_C);

// allocate host memory for the result
h_C = (float*)malloc(mem_size_C);

// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(WC / threads.x, HC / threads.y);

// execute the kernel
matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);

// copy result from device to host
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

// store the results
for (int i=0; i<size_C; i++) CC[i] = (double)h_C[i];

// clean up memory
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

cudaThreadExit();
}