The following kernel is working perfectly:
#include <iostream>
#include <iomanip>
using namespace std;
const int ROWS_Y = 4;
const int COLS_X = 4;
const int TILE_ROWS_Y = 2;
const int TILE_COLS_X = 2;
#define GX(tx, lx) ((tx) * (TILE_COLS_X) + (lx))
#define GY(ty, ly) ((ty) * (TILE_ROWS_Y) + (ly))
#define GID2(gx, gy) ((gy) * (COLS_X) + (gx))
#define GID4(tx, ty, lx, ly) ((GY(ty, ly)) * (COLS_X) + (GX(tx, lx)))
#define MOSAIC_ROWS_Y ((ROWS_Y) / (TILE_ROWS_Y))
#define MOSAIC_COLS_X ((COLS_X) / (TILE_COLS_X))
#define LID(lx, ly) ((ly)*(TILE_COLS_X)+(lx))
__global__ void MultiplyAsSumOuterProductOfVectors(int *A, int *B, int *C)
{
int ty = blockIdx.y;
int tx = blockIdx.x;
int ly = threadIdx.y;
int lx = threadIdx.x;
__shared__ int subC[TILE_ROWS_Y*TILE_COLS_X];
for (int tr = 0; tr < MOSAIC_COLS_X; tr++)
{
__shared__ int subA[TILE_ROWS_Y*(TILE_COLS_X+1)];
__shared__ int subB[TILE_ROWS_Y*(TILE_COLS_X+1)];
subA[LID(lx,ly)] = A[GID4(tr,ty,lx,ly)];
subB[LID(lx,ly)] = B[GID4(tx,tr,lx,ly)];
__syncthreads();
for (int lr = 0; lr < TILE_ROWS_Y; lr++)
{
int gy = GY(ty, ly);
int gx = GX(tx, lx);
if (gy < ROWS_Y && gx < COLS_X)
{
subC[LID(lx,ly)] += subA[LID(lr,ly)] * subB[LID(lx,lr)];
}
}
C[GID4(tx,ty,lx,ly)] = subC[LID(lx,ly)];
__syncthreads();
}
}
.
However, I want to rewrite this kernel using registers and shuffles. How can I do that?
Could you kindly supply me with some reference materials so that I can study them to incorporate registers and shuffles to my existing tile-based matrix multiplication scheme?
Some source code samples will be highly appreciated.