Using more shared memory does not show improvement

I wrote a tiled matrix multiplication program where the tile width is a multiple of the block width.
I configured the tile width so that all 48 KB of shared memory on the Jetson Nano is used (split between two resident blocks).
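
The per-block shared-memory footprint works out as follows (my own arithmetic for TILE_WIDTH = 96, BLOCK_SIZE = 32):

Ads: 32 x 96 x 4 bytes = 12 KB
Bds: 96 x 32 x 4 bytes = 12 KB
Total per block        = 24 KB, so two resident blocks fill the 48 KB per SM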

Still, I see that it is actually slower than a simple version where the tiles are the same size as the block.

I wonder if shared memory is a good idea at all. It is programmer-managed, and the code using it is harder to read. Isn't it better to simply have an L1 cache instead…

__global__ void matrixMultiplicationKernel(const float *__restrict__ A, const int A_Rows, const int A_Cols,
                                           const float *__restrict__ B, const int B_Rows, const int B_Cols,
                                           float *__restrict__ C, const int C_Rows, const int C_Cols)
{
  const int ROW = blockIdx.y * blockDim.y + threadIdx.y; // Absolute C row index
  const int COL = blockIdx.x * blockDim.x + threadIdx.x; // Absolute C column index

  const int TILE_WIDTH = 96;
  const int BLOCK_SIZE = 32;

  __shared__ float Ads[BLOCK_SIZE][TILE_WIDTH]; // blockDim.y * TILE_WIDTH (32x96)
  __shared__ float Bds[TILE_WIDTH][BLOCK_SIZE]; // TILE_WIDTH * blockDim.x (96x32)

  const int tx = threadIdx.x; // provides column index (for the local tile)
  const int ty = threadIdx.y; // provides row index (for the local tile)

  float tmpSum = 0.0f;

  for (int tileId = 0; tileId < ((A_Cols + (TILE_WIDTH - 1)) / TILE_WIDTH); tileId++)
  {
    // Collaboratively load the tile memory
    int currColIdx = (TILE_WIDTH * tileId) + tx;
    int currRowIdx = (TILE_WIDTH * tileId) + ty;

    int maxTileWidth = TILE_WIDTH;

    for(int i = 0;(i < (TILE_WIDTH/BLOCK_SIZE)) && (currColIdx < C_Cols);++i)
    {
      // Each thread fills its row of Ads, one element every BLOCK_SIZE columns
      Ads[ty][tx + (i * BLOCK_SIZE)] = A[ROW * A_Cols + currColIdx];
      if((i + 1) < (TILE_WIDTH/BLOCK_SIZE))
         currColIdx += BLOCK_SIZE;
    }

    for(int i = 0;(i < (TILE_WIDTH/BLOCK_SIZE)) && (currRowIdx < C_Rows);++i)
    {
      // Each thread fills its column of Bds, one element every BLOCK_SIZE rows
      Bds[ty + (i * BLOCK_SIZE)][tx] = B[currRowIdx * B_Cols + COL];
      if((i + 1) < (TILE_WIDTH/BLOCK_SIZE))
         currRowIdx += BLOCK_SIZE;
    }

    // Wait until tile loading is done by all threads of the block
    __syncthreads();

    // Limit the inner-product length when the last tile extends past the matrix edge
    if ((currColIdx >= C_Cols) || (currRowIdx >= C_Rows))
    {
       maxTileWidth = C_Cols % TILE_WIDTH;
    }

    // each thread computes temporary sum on the sub-matrices Ads and Bds
    for (int k = 0; k < maxTileWidth; k++)
    {
      tmpSum += Ads[ty][k] * Bds[k][tx];
    }
    // Synchronize so all threads finish using Ads/Bds before the next tile is loaded
    __syncthreads();
  }
  if (ROW < C_Rows && COL < C_Cols)
  {
    C[ROW * C_Cols + COL] = tmpSum;
  }
}
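
For completeness, this is roughly how the kernel is launched (a sketch, not my exact host code; d_A, d_B and d_C are the device buffers):

  dim3 block(32, 32); // one thread per C element, 32x32 block
  dim3 grid((C_Cols + block.x - 1) / block.x,
            (C_Rows + block.y - 1) / block.y);
  matrixMultiplicationKernel<<<grid, block>>>(d_A, A_Rows, A_Cols,
                                              d_B, B_Rows, B_Cols,
                                              d_C, C_Rows, C_Cols);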

TILE_WIDTH = BLOCK_SIZE = 32
rreddy78@jetson-nano:~/Desktop/Technical$ sudo /usr/local/cuda/bin/nvprof ./matrix_mul_gen_tiled
==13488== Profiling application: ./matrix_mul_gen_tiled
==13488== Profiling result:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  8.19152s    101  81.104ms  77.300ms  214.18ms  matrixMultiplicationKernel(float const *, int, int, float const *, int, int, float*, int, int)

TILE_WIDTH = 3 x BLOCK_SIZE = 96
rreddy78@jetson-nano:~/Desktop/Technical$ sudo /usr/local/cuda/bin/nvprof ./matrix_mul_gen_tiled2
==13466== NVPROF is profiling process 13466, command: ./matrix_mul_gen_tiled2
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  9.95302s    101  98.545ms  92.873ms  225.14ms  matrixMultiplicationKernel(float const *, int, int, float const *, int, int, float*, int, int)
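
For comparison, the TILE_WIDTH = BLOCK_SIZE baseline is essentially the textbook square-tile kernel, along these lines (a sketch, not the exact code that was profiled):

__global__ void matrixMultiplicationKernelSquare(const float *__restrict__ A, const int A_Rows, const int A_Cols,
                                                 const float *__restrict__ B, const int B_Rows, const int B_Cols,
                                                 float *__restrict__ C, const int C_Rows, const int C_Cols)
{
  const int TILE = 32;

  __shared__ float Ads[TILE][TILE];
  __shared__ float Bds[TILE][TILE];

  const int ROW = blockIdx.y * TILE + threadIdx.y;
  const int COL = blockIdx.x * TILE + threadIdx.x;

  float tmpSum = 0.0f;

  for (int tileId = 0; tileId < ((A_Cols + (TILE - 1)) / TILE); tileId++)
  {
    // Each thread loads one element of the A tile and one element of the B tile,
    // padding with zero when the tile runs past the matrix edge
    const int aCol = (TILE * tileId) + threadIdx.x;
    const int bRow = (TILE * tileId) + threadIdx.y;

    Ads[threadIdx.y][threadIdx.x] = (ROW < A_Rows && aCol < A_Cols) ? A[ROW * A_Cols + aCol] : 0.0f;
    Bds[threadIdx.y][threadIdx.x] = (bRow < B_Rows && COL < B_Cols) ? B[bRow * B_Cols + COL] : 0.0f;

    __syncthreads();

    // Inner product over the shared tiles
    for (int k = 0; k < TILE; k++)
    {
      tmpSum += Ads[threadIdx.y][k] * Bds[k][threadIdx.x];
    }

    __syncthreads();
  }

  if (ROW < C_Rows && COL < C_Cols)
  {
    C[ROW * C_Cols + COL] = tmpSum;
  }
}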