In c/c++ one can easily create variable sized arrays using new/malloc - but here’s the problem I’m running into on cuda:
__global__ void k_matrix_multiply_multishared(Matrix d_A, Matrix d_B, Matrix d_C,
int const tile_width) {
int sum = 0;
//create shared memory regions
__shared__ float d_sA[tile_width][tile_width]; //<--- PROBLEM! tile_width is not known at compile time - throwing compiler errors
__shared__ float d_sB[tile_width][tile_width];
int tx = threadIdx.x; int ty = threadIdx.y;
int bx = blockIdx.x; int by = blockIdx.y;
int row_id = by * tile_width + ty;
int col_id = bx * tile_width + tx;
int tid = row_id * d_C.width + col_id;
//if d_C.width =4, and tile_width =2, it would need 2 phases!
for(int i=0;i<d_C.width/tile_width;i++) {
float Pvalue = 0;
//load into shared memory
//each thread loads their own part into the shared memory:
d_sA[ty][tx] = d_A[row_id*d_A.width + (i*tile_width+tx)];
d_sB[ty][tx] = d_B[col_id+d_B.width * (i*tile_width+ty)];
__syncthreads();
for(int k=0;k<tile_width;k++) {
Pvalue += d_sA[ty][k] * d_sB[k][tx];
}
__syncthreads();
}
d_C.data[tid] = Pvalue;
}
I don’t want to use #define to set tile_width, is there a way around this?