Kernel shared memory simulation using CPU

#include "common.h" 
#include "timer.h"
#define TILE_DIM 32
__global__ void mm_tiled_kernel (float* A, float* B, float* C, unsigned int N) 
{
unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;

_shared_ float A S[TILE_DIM] [TILE_DIM]; 
_shared float B_S[TILE_DIM] [TILE_DIM];

float sum = 0.0f;

for (unsigned int tile = 0; tile < N/TILE_DIM; ++ tile) 
{
A_s [threadIdx.y] [threadIdx.x] = A[rowNtile*TILE_DIM + threadIdx.x];
B_s[threadIdx.y] [threadIdx.x] = B[(tile*TILE_DIM + threadIdx.y) *N + col];
_syncthreads();

for (unsigned int i = 0; i < TILE_DIM; ++i) 
{
sum += As[threadIdx.y] [i]*B_s[i] [threadIdx.x];
}
_syncthreads();
}
C[row*N+ col] = sum;
}

I converted the above kernel into a CPU program so that I can simulate the shared memory.

#include <stdio.h>

#define N 2
#define TILE_SIZE 2

#define GridDimX 1
#define GridDimY 1

#define BlockDimX 2
#define BlockDimY 2

void printToFile(FILE* stream, int sn,
                 int blockDimX, int blockDimY,
                 int blockIdxX, int blockIdxY,
                 int threadIdxX, int threadIdxY,
                 int row, int col,
                 int i, int ai, int bi, int shAi, int shBi,
                 int j, int shAj, int shBj, int temp,
                 int ci, int Cxy) {
    fprintf(stream, "%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d%-8d\n",
            sn, blockDimX, blockDimY, blockIdxX, blockIdxY, threadIdxX, threadIdxY, row, col, i, ai, bi, shAi, shBi, j, shAj, shBj, temp, ci, Cxy);
}

void MatrixMulKernel(FILE* stream, int* A, int* B, int* C) {
    int A_s[TILE_SIZE][TILE_SIZE] = {0};
    int B_s[TILE_SIZE][TILE_SIZE] = {0};
    int sn = 0;
    for (int block_y = 0; block_y < GridDimY; block_y++)
    {
        for (int block_x = 0; block_x < GridDimX; block_x++)
        {
            for (int thread_y = 0; thread_y < BlockDimY; thread_y++)
            {
                for (int thread_x = 0; thread_x < BlockDimX; thread_x++)
                {
                    // Identify the row and column of the C element to work on
                    int row = block_y * TILE_SIZE + thread_y;
                    int col = block_x * TILE_SIZE + thread_x;
                    int sum = 0;
                    // Loop over the A and B tiles required to compute C element
                    for (int tile = 0; tile < N / TILE_SIZE; ++tile)
                    {
                        // Collaborative loading of A and B tiles into shared memory
                        int ai = row * N + tile * TILE_SIZE + thread_x;
                        int bi = (tile * TILE_SIZE + thread_y) * N + col;
                        A_s[thread_y][thread_x] = A[row * N + tile * TILE_SIZE + thread_x];
                        B_s[thread_y][thread_x] = B[(tile * TILE_SIZE + thread_y) * N + col];
                        for (int i = 0; i < TILE_SIZE; ++i)
                        {
                            sum += A_s[thread_y][i] * B_s[i][thread_x];
                            printToFile(stream, sn, 1, 1,
                                        block_y, block_x,
                                        thread_y, thread_x,
                                        row, col,
                                        tile, ai, bi, A[ai], B[bi], i, A_s[thread_y][i], B_s[i][thread_x], sum, 0, 0);
                            sn++;
                        }
                    }
                    C[row * N + col] = sum;
                }
            }
        }
    }
}

int main() {
    int a[N * N] = {1, 2, 3, 4};
    int b[N * N] = {1, 2, 3, 4};
    int c[N * N] = {0};


    FILE* stream = fopen("output.txt", "w");

    fprintf(stream, "%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s%-8s\n",
            "sn",
            "blkDmY", "blkDmX",
            "blkIdY", "blkIdX",
            "thrIdY", "thrIdX",
            "row", "col", "tile", "ai", "bi", "shAi", "shBi", "i", "A_s", "B_s", "sum", "ci", "Cxy");
    MatrixMulKernel(stream, a, b, c);

    fclose(stream);

    printf("Matrix A:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", a[i * N + j]);
        }
        printf("\n");
    }

    printf("\nMatrix B:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", b[i * N + j]);
        }
        printf("\n");
    }

    printf("\nMatrix C (Result of multiplication):\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", c[i * N + j]);
        }
        printf("\n");
    }

    return 0;
}

The output contains an incorrect result (for these inputs, the correct product is [[7, 10], [15, 22]]):

Matrix A:
1 2
3 4

Matrix B:
1 2
3 4

Matrix C (Result of multiplication):
1 2
3 22
sn      blkDmY  blkDmX  blkIdY  blkIdX  thrIdY  thrIdX  row     col     tile    ai      bi      shAi    shBi    i       A_s     B_s     sum     ci      Cxy
0       1       1       0       0       0       0       0       0       0       0       0       1       1       0       1       1       1       0       0       
1       1       1       0       0       0       0       0       0       0       0       0       1       1       1       0       0       1       0       0       
2       1       1       0       0       0       1       0       1       0       1       1       2       2       0       1       2       2       0       0       
3       1       1       0       0       0       1       0       1       0       1       1       2       2       1       2       0       2       0       0       
4       1       1       0       0       1       0       1       0       0       2       2       3       3       0       3       1       3       0       0       
5       1       1       0       0       1       0       1       0       0       2       2       3       3       1       0       3       3       0       0       
6       1       1       0       0       1       1       1       1       0       3       3       4       4       0       3       2       6       0       0       
7       1       1       0       0       1       1       1       1       0       3       3       4       4       1       4       4       22      0       0       

What am I doing wrong?

Does this mean that a direct translation into CPU code won’t work the same as the kernel version?

That link doesn’t have any answer - only comments.

That’s understood. I did not post it suggesting there is any answer there (although there is certainly important information in the comments).

First, I have explained to you that this is a discussion forum, not a Q+A forum. Posters may post any relevant material for the topic. Posts need not be an answer.

Second, as a matter of courtesy to other readers as well as those who may wish to post here, I often post links when the OP has chosen to cross-post their question on multiple forums; this is beneficial for future readers. The link posted here is not primarily for your benefit - of course you know where you have posted this question and what the status of each is.

It’s important to realize, for SO as well as for these forums, that the intended benefit is not just for the original poster (OP). These questions live a long time and are often visited by many others in the future. So when you post here, you should expect that when I respond, I am taking those considerations into account: I am framing my posts not just for your benefit but for the benefit of others who may come later.

What I understood is that the kernel version works only because of the threads: all of them finish loading the shared tiles (at the __syncthreads() barrier) before any of them starts computing.

An exact translation of the kernel into a CPU version (the way I wrote the code) won’t work. For the CPU version to work, we have to redesign the source code with extra loops, so that the innermost loops find the simulated shared-memory arrays fully populated when they start iterating.
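
For concreteness, here is a minimal sketch of that redesign applied to the program above (it reuses the N, TILE_SIZE, GridDim and BlockDim macros; the MatrixMulHost name and the per-thread sum array are my own, just for illustration). Each __syncthreads() in the kernel becomes a boundary between two complete sweeps over the simulated threads of a block, and any register that lives across that boundary (sum in the kernel) has to become a per-thread array:

void MatrixMulHost(int* A, int* B, int* C) {
    int A_s[TILE_SIZE][TILE_SIZE];
    int B_s[TILE_SIZE][TILE_SIZE];
    int sum[BlockDimY][BlockDimX];   // one accumulator per simulated thread

    for (int block_y = 0; block_y < GridDimY; block_y++) {
        for (int block_x = 0; block_x < GridDimX; block_x++) {
            // Per-thread initialization (sum = 0 in the kernel)
            for (int ty = 0; ty < BlockDimY; ty++)
                for (int tx = 0; tx < BlockDimX; tx++)
                    sum[ty][tx] = 0;

            for (int tile = 0; tile < N / TILE_SIZE; ++tile) {
                // Phase 1: ALL simulated threads load their tile elements
                for (int ty = 0; ty < BlockDimY; ty++)
                    for (int tx = 0; tx < BlockDimX; tx++) {
                        int row = block_y * TILE_SIZE + ty;
                        int col = block_x * TILE_SIZE + tx;
                        A_s[ty][tx] = A[row * N + tile * TILE_SIZE + tx];
                        B_s[ty][tx] = B[(tile * TILE_SIZE + ty) * N + col];
                    }
                // The end of the loop pair above plays the role of __syncthreads()

                // Phase 2: ALL simulated threads compute with complete tiles
                for (int ty = 0; ty < BlockDimY; ty++)
                    for (int tx = 0; tx < BlockDimX; tx++)
                        for (int i = 0; i < TILE_SIZE; ++i)
                            sum[ty][tx] += A_s[ty][i] * B_s[i][tx];
            }

            // Write back one C element per simulated thread
            for (int ty = 0; ty < BlockDimY; ty++)
                for (int tx = 0; tx < BlockDimX; tx++) {
                    int row = block_y * TILE_SIZE + ty;
                    int col = block_x * TILE_SIZE + tx;
                    C[row * N + col] = sum[ty][tx];
                }
        }
    }
}

With this structure the compute phase always sees fully populated A_s and B_s, and the 2x2 example produces the expected [[7, 10], [15, 22]].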

Could you post a concluding remark?

When converting CPU code to GPU code, a usual place to start is for-loops. When a for-loop has independent work across its iterations, there is a fairly reliable and mechanical method to convert that for-loop into a kernel call: remove the for-loop itself, let the body of the for-loop become the body of the kernel, resynthesize the loop variable from the CUDA built-in variables (threadIdx.x, etc.), and call the kernel with an execution configuration that “matches” the extent of the for-loop iterations, as sketched below.
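
For instance, a minimal sketch of that recipe (the vector-add example and the add_host/add_kernel names are mine, just for illustration, not taken from any particular course):

// for-loop with independent work across its iterations:
void add_host(const float* a, const float* b, float* c, int n) {
    for (int i = 0; i < n; i++)   // this loop goes away in the kernel version
        c[i] = a[i] + b[i];
}

// equivalent kernel: the loop body is unchanged, and the loop variable
// is resynthesized from the CUDA built-in variables
__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                    // guard, since the grid may be larger than n
        c[i] = a[i] + b[i];
}

// launched with an execution configuration that "matches" the loop extent:
//   add_kernel<<<(n + 255) / 256, 256>>>(a, b, c, n);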

That’s a commonly explored case for initial CUDA learning, for example you can find a treatment of it in introductory CUDA DLI courses.

Conversely, if we want to take a functional GPU kernel and “convert” it to equivalent host code, it stands to reason that we might want a methodology that replaces kernel calls with for-loops.

So yes, when converting GPU kernels to “equivalent” CPU code, it’s quite reasonable to assume that for-loops will have to be reintroduced, and that the loop structure will have to respect the synchronization points (the __syncthreads() calls) that the kernel depends on.

Beyond that, and in light of your recent other question, I’m puzzled by this question in general. To me, it seems that the video link you provided shows the instructor doing exactly this (going from a kernel realization to an “equivalent” CPU realization). And the instructor there does indeed demonstrate inserting for-loops, and the need for them. So it seems to me you already have an example of what needs to be done here.

(And, yes, I guess it is the same video link that you provided at the top of this thread.)

I saw that and I ran that code. That code is from the book by Kirk.

I wanted to create my own version that resembles the kernel more closely.

In the early life of CUDA (until maybe 2009 or so), CUDA code could be built to run on the host, simply by compiling with nvcc -deviceemu. No changes to the source code were required. It was all done with compiler magic and a support library. I did not work on this other than re-implementing CUDA device function intrinsics for use on the host.

Shared memory was a CUDA feature from the very start and was supported by device emulation: just a blob of memory. Given that the CUDA execution model (at least back then, before warp-level primitives) made no assumptions about how many threads were physically running concurrently, and that CUDA code itself basically presents a single-thread view of the world (one of its major selling points: the parallelism is implicit), I think the emulation simply iterated over threads 0 to N-1. How it handled things like __syncthreads() I do not recall.

Device emulation was limited in that most, but not every, CUDA program could be run correctly. Its reason for existing was the lack of a debugger in the first few years of CUDA, due both to the lack of suitable hardware hooks in the GPUs and to the considerable software engineering effort required. In fact, debugger development commenced shortly after the CUDA project kicked off in the summer of 2005; CUDA’s alpha release under NDA occurred in November of 2006.