Kernel adapted from the CUDA documentation gives an incorrect result

I have copied the kernel from here and added one extra argument.

However, it does not give the correct result. Both input matrices a and b are filled with the values 1 through 16 in row-major order (see initMatrix in the code below), and I launched the kernel both with b and with its transpose bT; both outputs are shown below.

Without the transpose (passing dev_b to the kernel):

Result:
     5     17     29     41
    17     61    105    149
    29    105    181    257
    41    149    257    365

With the transpose (passing dev_bT to the kernel):

Result:
    11     14     17     20
    35     46     57     68
    59     78     97    116
    83    110    137    164

The correct result should be:

Result Matrix:
    90    100    110    120
   202    228    254    280
   314    356    398    440
   426    484    542    600
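
For reference, the expected values above come from a plain host-side multiplication of the two 1..16 matrices. The helper below is only a sketch I use to double-check the numbers; matMulRef is a throwaway name and is not part of the posted program.

// Sketch only: naive host-side reference multiplication (c = a * b, both n x n, row-major).
// matMulRef is just an illustrative name, not used in the code below.
void matMulRef(const int *a, const int *b, int *c, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            int sum = 0;
            for (int k = 0; k < n; k++) {
                sum += a[i * n + k] * b[k * n + j];   // standard row-by-column dot product
            }
            c[i * n + j] = sum;
        }
    }
}

Running this with a and b filled the same way as in the program below produces exactly the matrix shown above.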

What is incorrect in my modified source code?

#include <stdio.h>
#include <iostream>
#include <iomanip>
using namespace std;

#define M 4
#define TILE_DIM 2

__global__ void coalescedMultiply(int *a, int *bT, int *c)
{
    __shared__ int aTile[TILE_DIM][TILE_DIM]; 
    __shared__ int transposedTile[TILE_DIM][TILE_DIM];

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;

    aTile[threadIdx.y][threadIdx.x] = a[row * M + threadIdx.x];
    transposedTile[threadIdx.x][threadIdx.y] = bT[(blockIdx.x * blockDim.x + threadIdx.y) * M + threadIdx.x];
    __syncthreads();

    for (int i = 0; i < TILE_DIM; i++)
    {
        sum += aTile[threadIdx.y][i] * transposedTile[i][threadIdx.x];
    }

    c[row * M + col] = sum;
}

void printMatrix(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      cout << setw(6) << mat[i * cc + j] << " ";
    }
    cout << endl;
  }
  cout << endl;
}

void initMatrix(int *mat, int rr, int cc) {
  int init = 1;
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = init++;
    }
  }
}

void initMatrixZero(int *mat, int rr, int cc) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      mat[i * cc + j] = 0;
    }
  }
}

void transposeMatrix(const int *mat, int rr, int cc, int *result) {
  for (int i = 0; i < rr; i++) {
    for (int j = 0; j < cc; j++) {
      result[j * rr + i] = mat[i * cc + j];
    }
  }
}

int main()
{
    int N = M * M;
    int *a, *b, *bT, *c;
    int *dev_a, *dev_b, *dev_bT, *dev_c;

    // Allocate memory on the host
    a  = (int *)malloc(N * sizeof(int));
    b  = (int *)malloc(N * sizeof(int));
    bT = (int *)malloc(N * sizeof(int));
    c  = (int *)malloc(N * sizeof(int));

    // Initialize input data
    initMatrix(a, M, M);
    initMatrix(b, M, M);
    initMatrixZero(bT, M, M);
    transposeMatrix(b, M, M, bT);
    initMatrixZero(c, M, M);

    printMatrix(a, M, M);
    printMatrix(b, M, M);
    printMatrix(bT, M, M);

    // Allocate memory on the device
    cudaMalloc((void **) &dev_a,  N * sizeof(int));
    cudaMalloc((void **) &dev_b,  N * sizeof(int));
    cudaMalloc((void **) &dev_bT, N * sizeof(int));
    cudaMalloc((void **) &dev_c,  N * sizeof(int));

    // Copy input data from host to device
    cudaMemcpy(dev_a,   a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b,   b, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_bT, bT, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(dev_c,   0, N * sizeof(int));

    // Define grid and block dimensions
    dim3 blockSize(TILE_DIM, TILE_DIM);
    dim3 gridSize(M/TILE_DIM, M/TILE_DIM);

    // Launch the kernel
    coalescedMultiply<<<gridSize, blockSize>>>(dev_a, dev_b,  dev_c);    // gives the "Without transpose" result above
    //coalescedMultiply<<<gridSize, blockSize>>>(dev_a, dev_bT, dev_c);  // gives the "With transpose" result above
    

    // Copy the result from device to host
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the result
    printf("Result:\n");
    printMatrix(c, M, M);

    // Free memory on the device
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_bT);
    cudaFree(dev_c);

    // Free memory on the host
    free(a);
    free(b);
    free(bT);
    free(c);

    return 0;
}
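
In case it matters: the kernel launch above is not checked for errors. This is the minimal check I can add right after the launch (a sketch only, not part of the posted program; it uses only the standard CUDA runtime calls):

    coalescedMultiply<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c);

    // Sketch only: report configuration/launch errors and errors raised while the kernel ran.
    cudaError_t err = cudaGetLastError();      // error from the launch itself
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();         // error from kernel execution
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));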