I have copied the kernel from here and added one extra argument.
However, it seems to be not giving correct result.
Without transpose:
Result:
5 17 29 41
17 61 105 149
29 105 181 257
41 149 257 365
With transpose:
Result:
11 14 17 20
35 46 57 68
59 78 97 116
83 110 137 164
The correct result should be:
Result Matrix:
90 100 110 120
202 228 254 280
314 356 398 440
426 484 542 600
What is incorrect in my modified source code?
#include <stdio.h>
#include <iostream>
#include <iomanip>
using namespace std;
#define M 4
#define TILE_DIM 2
__global__ void coalescedMultiply(int *a, int*bT, int *c)
{
__shared__ int aTile[TILE_DIM][TILE_DIM];
__shared__ int transposedTile[TILE_DIM][TILE_DIM];
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0.0f;
aTile[threadIdx.y][threadIdx.x] = a[row * M + threadIdx.x];
transposedTile[threadIdx.x][threadIdx.y] = bT[(blockIdx.x * blockDim.x + threadIdx.y) * M + threadIdx.x];
__syncthreads();
for (int i = 0; i < TILE_DIM; i++)
{
sum += aTile[threadIdx.y][i] * transposedTile[i][threadIdx.x];
}
c[row * M + col] = sum;
}
void printMatrix(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
cout << setw(6) << mat[i * cc + j] << " ";
}
cout << endl;
}
cout << endl;
}
void initMatrix(int *mat, int rr, int cc) {
int init = 1;
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = init++;
}
}
}
void initMatrixZero(int *mat, int rr, int cc) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
mat[i * cc + j] = 0;
}
}
}
void transposeMatrix(const int *mat, int rr, int cc, int *result) {
for (int i = 0; i < rr; i++) {
for (int j = 0; j < cc; j++) {
result[j * rr + i] = mat[i * cc + j];
}
}
}
int main()
{
int N = M * M;
int *a, *b, *bT, *c;
int *dev_a, *dev_b, *dev_bT, *dev_c;
// Allocate memory on the host
a = (int *)malloc(N * sizeof(int));
b = (int *)malloc(N * sizeof(int));
bT = (int *)malloc(N * sizeof(int));
c = (int *)malloc(N * sizeof(int));
// Initialize input data
initMatrix(a, M, M);
initMatrix(b, M, M);
initMatrixZero(bT, M, M);
transposeMatrix(b, M, M, bT);
initMatrixZero(c, M, M);
printMatrix(a, M, M);
printMatrix(b, M, M);
printMatrix(bT, M, M);
// Allocate memory on the device
cudaMalloc((void **) &dev_a, N * sizeof(int));
cudaMalloc((void **) &dev_b, N * sizeof(int));
cudaMalloc((void **) &dev_bT, N * sizeof(int));
cudaMalloc((void **) &dev_c, N * sizeof(int));
// Copy input data from host to device
cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_bT, bT, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemset(dev_c, 0, N * sizeof(int));
// Define grid and block dimensions
dim3 blockSize(TILE_DIM, TILE_DIM);
dim3 gridSize(M/TILE_DIM, M/TILE_DIM);
// Launch the kernel
coalescedMultiply<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c);
//coalescedMultiply<<<gridSize, blockSize>>>(dev_a, dev_bT, dev_c);
// Copy the result from device to host
cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
// Print the result
printf("Result:\n");
printMatrix(c, M, M);
// Free memory on the device
cudaFree(dev_a);
cudaFree(dev_c);
// Free memory on the host
free(a);
free(c);
return 0;
}