I am getting error as “Your code did not produce the correct output”. Can anyone please help me in resolving this. I have pasted the code below:
import numpy as np
from numba import cuda, types
@cuda.jit
def mm_shared(a, b, c):
column, row = cuda.grid(2)
sum = 0
# `a_cache` and `b_cache` are already correctly defined
a_cache = cuda.shared.array(block_size, types.int32)
b_cache = cuda.shared.array(block_size, types.int32)
# TODO: use each thread to populate one element each a_cache and b_cache
if column >= c.shape[0] and row >= c.shape[1]:
return
for i in range(cuda.gridDim.x):
# Preload data into shared memory
a_cache[cuda.threadIdx.x, cuda.threadIdx.y] = a[column, cuda.threadIdx.y + i * block_size[0]]
b_cache[cuda.threadIdx.x, cuda.threadIdx.y] = b[cuda.threadIdx.x + i * block_size[0], row]
# Wait until all threads finish preloading
cuda.syncthreads()
for j in range(block_size[0]):
# TODO: calculate the `sum` value correctly using values from the cache
sum += a_cache[cuda.threadIdx.x][j] * b_cache[j][cuda.threadIdx.y]
cuda.syncthreads()
c[column][row] = sum
Driver Code to test above Function
import numpy as np
from numba import cuda, types
Leave the values in this cell alone
M = 128
N = 32
Input vectors of MxN and NxM dimensions
a = np.arange(MN).reshape(M,N).astype(np.int32)
b = np.arange(MN).reshape(N,M).astype(np.int32)
c = np.zeros((M, M)).astype(np.int32)
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.to_device(c)
NxN threads per block, in 2 dimensions
block_size = (N,N)
MxM/NxN blocks per grid, in 2 dimensions
grid_size = (int(M/N),int(M/N))
print(type(block_size))
print()
mm_shared[grid_size, block_size](d_a, d_b, d_c)