#include "common.h"
#include "timer.h"
#define TILE_DIM 32
__global__ void mm_tiled_kernel (float* A, float* B, float* C, unsigned int N)
{
unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
_shared_ float A S[TILE_DIM] [TILE_DIM];
_shared float B_S[TILE_DIM] [TILE_DIM];
float sum = 0.0f;
for (unsigned int tile = 0; tile < N/TILE_DIM; ++ tile)
{
A_s [threadIdx.y] [threadIdx.x] = A[rowNtile*TILE_DIM + threadIdx.x];
B_s[threadIdx.y] [threadIdx.x] = B[(tile*TILE_DIM + threadIdx.y) *N + col];
_syncthreads();
for (unsigned int i = 0; i < TILE_DIM; ++i)
{
sum += As[threadIdx.y] [i]*B_s[i] [threadIdx.x];
}
_syncthreads();
}
C[row*N+ col] = sum;
}
I converted the above kernel into a CPU program so that I can simulate a shared memory.
#include <stdio.h>
#define N 2
#define TILE_SIZE 2
#define GridDimX 1
#define GridDimY 1
#define BlockDimX 2
#define BlockDimY 2
void printToFile(FILE* stream,int sn,
int blockDimX, int blockDimY,
int blockIdxX, int blockIdxY,
int threadIdxX, int threadIdxY,
int row, int col,
int i, int ai, int bi, int shAi, int shBi,
int j, int shAj, int shBj, int temp,
int ci, int Cxy){
fprintf(stream, "%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d\n",
sn, blockDimX, blockDimY, blockIdxX, blockIdxY, threadIdxX, threadIdxY, row, col, i, ai, bi, shAi, shBi, j, shAj, shBj, temp, ci, Cxy);
}
void MatrixMulKernel(FILE*stream,int* A, int* B, int* C) {
int A_s[TILE_SIZE][TILE_SIZE]={0};
int B_s[TILE_SIZE][TILE_SIZE]={0};
int sn =0;
for (int block_y = 0; block_y < GridDimY; block_y++)
{
for (int block_x = 0; block_x < GridDimX; block_x++)
{
for (int thread_y = 0; thread_y < BlockDimY; thread_y++)
{
for (int thread_x = 0; thread_x < BlockDimX; thread_x++)
{
// Identify the row and column of the C element to work on
int row = block_y * TILE_SIZE + thread_y;
int col = block_x * TILE_SIZE + thread_x;
int sum = 0;
// Loop over the A and B tiles required to compute C element
for (int tile = 0; tile < N / TILE_SIZE; ++tile)
{
// Collaborative loading of A and B tiles into shared memory
int ai = row * N + tile * TILE_SIZE + thread_x;
int bi = (tile * TILE_SIZE + thread_y) * N + col;
A_s[thread_y][thread_x] = A[row * N + tile * TILE_SIZE + thread_x];
B_s[thread_y][thread_x] = B[(tile * TILE_SIZE + thread_y) * N + col];
for (int i = 0; i < TILE_SIZE; ++i)
{
sum += A_s[thread_y][i] * B_s[i][thread_x];
printToFile(stream, sn, 1, 1,
block_y, block_x,
thread_y, thread_x,
row, col,
tile, ai, bi, A[ai], B[bi], i, A_s[thread_y][i], B_s[i][thread_x], sum, 0, 0);
sn++;
}
}
C[row * N + col] = sum;
}
}
}
}
}
int main() {
int a[N * N] = {1, 2, 3, 4};
int b[N * N] = {1, 2, 3, 4};
int c[N * N] = {0};
FILE * stream = fopen("output.txt", "w");
fprintf(stream,"%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s\n",
"sn",
"blkDmY","blkDmX",
"blkIdY","blkIdX",
"thrIdY","thrIdX",
"row", "col", "tile", "ai", "bi", "shAi", "shBi", "i", "A_s", "B_s", "sum", "ci", "Cxy");
MatrixMulKernel(stream,a, b, c);
fclose(stream);
printf("Matrix A:\n");
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
printf("%d ", a[i * N + j]);
}
printf("\n");
}
printf("\nMatrix B:\n");
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
printf("%d ", b[i * N + j]);
}
printf("\n");
}
printf("\nMatrix C (Result of multiplication):\n");
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
printf("%d ", c[i * N + j]);
}
printf("\n");
}
return 0;
}
The output contains incorrect result:
Matrix A:
1 2
3 4
Matrix B:
1 2
3 4
Matrix C (Result of multiplication):
1 2
3 22
sn blkDmY blkDmX blkIdY blkIdX thrIdY thrIdX row col tile ai bi shAi shBi i A_s B_s sum ci Cxy (null)
0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0
1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0
2 1 1 0 0 0 1 0 1 0 1 1 2 2 0 1 2 2 0 0
3 1 1 0 0 0 1 0 1 0 1 1 2 2 1 2 0 2 0 0
4 1 1 0 0 1 0 1 0 0 2 2 3 3 0 3 1 3 0 0
5 1 1 0 0 1 0 1 0 0 2 2 3 3 1 0 3 3 0 0
6 1 1 0 0 1 1 1 1 0 3 3 4 4 0 3 2 6 0 0
7 1 1 0 0 1 1 1 1 0 3 3 4 4 1 4 4 22 0 0
What am I doing wrong?
Does this mean that a direct translation into CPU code won’t work as same as the kernel version?