Can you please elaborate little bit
What am I missing here?
import numpy as np from numba import cuda, types @cuda.jit def mm_shared(a, b, c): # implemented similar to https://numba.pydata.org/numba-doc/dev/cuda/examples.html # `a_cache` and `b_cache` are already correctly defined a_cache = cuda.shared.array(block_size, types.int32) b_cache = cuda.shared.array(block_size, types.int32) row, col = cuda.grid(2) stride_row, stride_col = cuda.gridsize(2) thread_x = cuda.threadIdx.x thread_y = cuda.threadIdx.y if row >= c.shape and col >= c.shape: # Quit if (row, col) is outside of valid C boundary return # TODO: use each thread to populate one element each a_cache and b_cache sum = 0 for data_row in range(row, a.shape, stride_row): for data_col in range(col, b.shape, stride_col): for i in range(a.shape): # Preload data into shared memory a_cache[thread_x][thread_y] = a[data_row][thread_y + i * N] b_cache[thread_x][thread_y] = b[thread_x + i * N][data_col] # Wait until all threads finish preloading cuda.syncthreads() # Compute partial product on the shared memory for j in range(N): # TODO: calculate the `sum` value correctly using values from the cache sum += a_cache[thread_x][j] * b_cache[j][thread_y] # Wait for all threads to finish computing cuda.syncthreads() c[data_row][data_col] = sum
I have tried everything and I believe the problem is getting the edges of shared memory to 0 out. Right now it still only works if M is whole-number divisible by N. What am I missing to imultiply the edges when M isn’t divisible by N?
This SO thread may be useful.