I wrote a tiled matrix multiplication program where the tile width is a multiple of the block width.
I configured the tile width so that two resident blocks together use all of the 48 KB of shared memory that the Jetson Nano has (24 KB per block).
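For reference, the shared-memory budget follows directly from the array declarations in the kernel below (4-byte floats):

Ads: 32 x 96 x 4 B = 12 KB
Bds: 96 x 32 x 4 B = 12 KB
Per block: 12 KB + 12 KB = 24 KB, so two resident blocks consume the full 48 KB.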
Still, I see that it is actually slower than a simple version where the tiles are the same size as the block.
This makes me wonder whether shared memory is a good idea at all. It is programmer-managed, and code that uses it is harder to read. Isn't it better to simply have an L1 cache instead?
__global__ void matrixMultiplicationKernel(const float *__restrict__ A, const int A_Rows, const int A_Cols,
                                           const float *__restrict__ B, const int B_Rows, const int B_Cols,
                                           float *__restrict__ C, const int C_Rows, const int C_Cols)
{
    const int ROW = blockIdx.y * blockDim.y + threadIdx.y; // Absolute C row index
    const int COL = blockIdx.x * blockDim.x + threadIdx.x; // Absolute C column index

    const int TILE_WIDTH = 96;
    const int BLOCK_SIZE = 32;

    __shared__ float Ads[BLOCK_SIZE][TILE_WIDTH]; // blockDim.y x TILE_WIDTH (32x96)
    __shared__ float Bds[TILE_WIDTH][BLOCK_SIZE]; // TILE_WIDTH x blockDim.x (96x32)

    const int tx = threadIdx.x; // column index within the local tile
    const int ty = threadIdx.y; // row index within the local tile

    float tmpSum = 0.0f;

    for (int tileId = 0; tileId < (A_Cols + TILE_WIDTH - 1) / TILE_WIDTH; tileId++) {
        // Width of this tile along the inner (K = A_Cols = B_Rows) dimension;
        // the last tile may be partial.
        const int maxTileWidth = min(TILE_WIDTH, A_Cols - tileId * TILE_WIDTH);

        // Collaboratively load the tile: each thread loads TILE_WIDTH / BLOCK_SIZE
        // elements of A (row by row) and of B (column by column), zero-filling
        // elements that fall outside the matrices.
        for (int i = 0; i < TILE_WIDTH / BLOCK_SIZE; ++i) {
            const int currColIdx = TILE_WIDTH * tileId + i * BLOCK_SIZE + tx; // column into A
            Ads[ty][i * BLOCK_SIZE + tx] =
                (ROW < A_Rows && currColIdx < A_Cols) ? A[ROW * A_Cols + currColIdx] : 0.0f;

            const int currRowIdx = TILE_WIDTH * tileId + i * BLOCK_SIZE + ty; // row into B
            Bds[i * BLOCK_SIZE + ty][tx] =
                (currRowIdx < B_Rows && COL < B_Cols) ? B[currRowIdx * B_Cols + COL] : 0.0f;
        }

        // Wait until tile loading is done by all threads of the block
        __syncthreads();

        // Each thread accumulates its partial inner product on the sub-matrices Ads and Bds
        for (int k = 0; k < maxTileWidth; k++) {
            tmpSum += Ads[ty][k] * Bds[k][tx];
        }

        // Synchronize before the tile is overwritten in the next iteration
        __syncthreads();
    }

    if (ROW < C_Rows && COL < C_Cols) {
        C[ROW * C_Cols + COL] = tmpSum;
    }
}
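For completeness, this is roughly how I launch the kernel (a sketch with illustrative values: the size N and the bare-bones host code are stand-ins, not my exact test program):

#include <cuda_runtime.h>

int main() {
    const int N = 2048;                                 // illustrative; square N x N matrices
    const size_t bytes = (size_t)N * N * sizeof(float);

    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);
    // ... initialize d_A and d_B ...

    dim3 block(32, 32);                                 // BLOCK_SIZE x BLOCK_SIZE threads
    dim3 grid((N + block.x - 1) / block.x,              // one thread per element of C
              (N + block.y - 1) / block.y);
    matrixMultiplicationKernel<<<grid, block>>>(d_A, N, N, d_B, N, N, d_C, N, N);
    cudaDeviceSynchronize();

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}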
TILE_WIDTH = BLOCK_SIZE = 32
rreddy78@jetson-nano:~/Desktop/Technical$ sudo /usr/local/cuda/bin/nvprof ./matrix_mul_gen_tiled
==13488== Profiling application: ./matrix_mul_gen_tiled
==13488== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  8.19152s       101  81.104ms  77.300ms  214.18ms  matrixMultiplicationKernel(float const*, int, int, float const*, int, int, float*, int, int)
TILE_WIDTH = 3 x BLOCK_SIZE = 96
rreddy78@jetson-nano:~/Desktop/Technical$ sudo /usr/local/cuda/bin/nvprof ./matrix_mul_gen_tiled2
==13466== NVPROF is profiling process 13466, command: ./matrix_mul_gen_tiled2
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  9.95302s       101  98.545ms  92.873ms  225.14ms  matrixMultiplicationKernel(float const*, int, int, float const*, int, int, float*, int, int)
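One way to rule occupancy in or out is to ask the runtime how many blocks of the kernel can be resident per SM (a sketch; it assumes the kernel is defined in the same translation unit):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int numBlocks = 0;
    // Query how many blocks of this kernel fit on one SM, accounting for
    // its register and static shared-memory usage at this block size.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks,
        matrixMultiplicationKernel,
        32 * 32, // threads per block (BLOCK_SIZE x BLOCK_SIZE)
        0);      // dynamic shared memory: none, all usage here is static
    printf("Resident blocks per SM: %d\n", numBlocks);
    return 0;
}

If both versions report the same count, the slowdown of the wider tile is not an occupancy effect.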