I am getting the error "Your code did not produce the correct output". Can anyone please help me resolve this? I have pasted the code below:

import numpy as np

from numba import cuda, types

@cuda.jit
def mm_shared(a, b, c):
    """Tiled matrix multiply, c = a @ b, using shared-memory caches.

    Assumes `block_size` is a module-level constant tuple (TPB, TPB) — it
    must be known at compile time for cuda.shared.array — and that the
    launch grid exactly covers c's shape (true for the driver below).
    """
    x, y = cuda.grid(2)        # output element (row, col) owned by this thread
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    # One (TPB, TPB) tile of each operand, staged in fast shared memory.
    a_cache = cuda.shared.array(block_size, types.int32)
    b_cache = cuda.shared.array(block_size, types.int32)

    acc = 0  # renamed from `sum` — don't shadow the builtin
    # Slide the tile along the shared inner dimension of a and b.
    # BUG FIX: the original looped over cuda.gridDim.x (the number of blocks
    # in the *output* grid, M//N == 4), which over-runs a's columns because
    # the inner dimension only contains a.shape[1] // block_size[0] tiles
    # (here 32 // 32 == 1). That out-of-bounds read is what produced the
    # wrong output.
    for i in range(a.shape[1] // block_size[0]):
        # Each thread preloads exactly one element of each tile.
        a_cache[tx, ty] = a[x, ty + i * block_size[0]]
        b_cache[tx, ty] = b[tx + i * block_size[0], y]
        # Every thread in the block must reach this barrier; the original's
        # early `return` above the syncthreads is undefined behavior.
        cuda.syncthreads()
        for j in range(block_size[0]):
            acc += a_cache[tx, j] * b_cache[j, ty]
        # Don't overwrite the tiles until all threads have consumed them.
        cuda.syncthreads()

    # Guard only the final store. With the exact-fit grid this never fires,
    # but note the original used `and`, which guards almost nothing —
    # an out-of-range check over two dimensions needs `or`-style logic
    # (equivalently: store only when BOTH indices are in range).
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = acc

#### Driver code to test the function above

import numpy as np

from numba import cuda, types

# Leave the values in this cell alone
M = 128
N = 32

# Input matrices of MxN and NxM dimensions
a = np.arange(M * N).reshape(M, N).astype(np.int32)
b = np.arange(M * N).reshape(N, M).astype(np.int32)
c = np.zeros((M, M)).astype(np.int32)

d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
# BUG FIX: the original line read `cuda.to_device©` — the "(c)" argument
# was mangled into a copyright sign by smart-character substitution when
# the code was pasted, so this line was a syntax/attribute error.
d_c = cuda.to_device(c)

# NxN threads per block, in 2 dimensions
block_size = (N, N)

# M/N x M/N blocks per grid, in 2 dimensions (exactly covers the MxM output)
grid_size = (int(M / N), int(M / N))

print(type(block_size))
print()

mm_shared[grid_size, block_size](d_a, d_b, d_c)