#include <mma.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <iostream>
#include "helper_cuda.h"

#define WARP_SIZE 32

// fragment dimensions
#define M 16
#define N 16
#define K 16

// grid dimensions
#define WARPS_PER_BLOCK 8
#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)

// tile dimensions
#define M_TILES 296 // 256  number of fragments in the row dimension of the left matrix
#define N_TILES 7   // 224  same, but in the column dimension of the right matrix
#define K_TILES 7   // 224  same, but in the column dimension of the left matrix

// global dimensions
#define M_GLOBAL (M * M_TILES)
#define N_GLOBAL (N * N_TILES)
#define K_GLOBAL (K * K_TILES)

// With 64 KB of shared memory we can store one rectangular 8x7 block of fragments for the
// accumulation matrix: 8*7*16*16*4 bytes = 57344 B. Adding the 4096 B of skew overhead gives
// 57344 + 4096 = 61440 B, just enough to fit in the 64 KB = 65536 B of shared memory. This
// accumulation tile implies a 128x112 tile of the left matrix and a 112x112 tile of the right
// matrix, which also fit in shared memory because they hold half elements:
// 128*112*2 B + 112*112*2 B = 53760 B, plus the skew overhead 8*4*(128+112) B.
// (See the compile-time checks at the end of this file.)
#define CHUNK_K 7 // number of fragments along the K (column) dimension of a shared-memory tile

// block dimensions
#define WARP_COL_FRAG 7
#define WARP_ROW_FRAG 1
#define WARP_PER_BLOCK_ROW 8
#define WARP_PER_BLOCK_COL 1
#define BLOCK_COL_FRAG (WARP_PER_BLOCK_COL * WARP_COL_FRAG)
#define BLOCK_ROW_FRAG (WARP_PER_BLOCK_ROW * WARP_ROW_FRAG)

#define GLOBAL_MEM_STRIDE N_GLOBAL
#define SHMEM_STRIDE (N * BLOCK_ROW_FRAG)

// When a warp accesses a fragment in shared memory we want as few bank conflicts as possible.
// Without any skew, all the items in the same column of a fragment fall into the same bank,
// resulting in 15*16 = 240 bank conflicts. By shifting each row by 32 bytes (8 four-byte words,
// i.e. 8 shared-memory banks) we reduce the bank conflicts to 64*3 = 192 for the fragments of
// the left and right matrices. In the accumulation tile we have 32*7 = 224 bank conflicts.
#define SKEW_HALF 16 // 16 half = 32 bytes

using namespace std;
using namespace nvcuda;

// struct with size = 14 bytes, used to stream the result tile to global memory
typedef struct {
  char a1; char a2; char a3; char a4; char a5; char a6; char a7;
  char a8; char a9; char a10; char a11; char a12; char a13; char a14;
} copySize;

__global__ void wmma_ker(half *A, half *B, float *D, unsigned long long *time) {
  extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];

  // clock variables used to time the mma instructions
  unsigned long long start_clock, finish_clock;

  // determine block, warp and lane
  unsigned int warpId = threadIdx.x / WARP_SIZE;
  unsigned int laneId = threadIdx.x % WARP_SIZE;
  unsigned int blockId = blockIdx.x;

  // Offset in shared memory from which the B matrix is stored.
  const size_t shmem_idx_b_off = BLOCK_ROW_FRAG * M;

  // This pointer is used to access the C and D matrix fragments this warp computes.
  // This pointer is also used to stream the C and D matrices block-wide tile to and
  // from shared memory.
  float *shmem_warp_frag_ptr = (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * M;

  // Each CTA slides along the 128 x 112 tiles from the top left corner of the
  // matrix to the right and down, and selects the next tile to compute. Once
  // there's no such tile, all warps in this CTA exit.
  for (unsigned int block_pos = blockId;; block_pos += gridDim.x) {
    const unsigned int block_tile_i = ((block_pos * BLOCK_COL_FRAG) / N_TILES) * (BLOCK_ROW_FRAG);
    const unsigned int block_tile_j = (block_pos * BLOCK_COL_FRAG) % N_TILES;
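    // Worked example of the mapping above (illustrative values only, taken from the
    // #defines at the top of this file: BLOCK_COL_FRAG = 7, N_TILES = 7, BLOCK_ROW_FRAG = 8):
    // block_pos = 0 -> (block_tile_i, block_tile_j) = (0, 0), block_pos = 1 -> (8, 0),
    // block_pos = 3 -> (24, 0). Each block therefore owns a full 8x7-fragment band of the
    // output, and successive blocks step down by BLOCK_ROW_FRAG tile rows.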
    // Stop when there are no more D matrix tiles to compute in this CTA.
    if (block_tile_i >= M_TILES) {
      break;
    }

    // This warp's offset into the D matrix, used at the end to copy the result
    // from shared memory to global memory.
    const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N;

    // These fragments accumulate the result of the A and B matrix fragment
    // multiplications along the K_GLOBAL dimension.
    wmma::fragment<wmma::accumulator, M, N, K, float> c[WARP_ROW_FRAG][WARP_COL_FRAG];

    // Initialize the accumulator fragments to zero.
#pragma unroll
    for (int i = 0; i < WARP_ROW_FRAG; i++) {
#pragma unroll
      for (int j = 0; j < WARP_COL_FRAG; j++) {
        wmma::fill_fragment(c[i][j], 0.0f);
      }
    }

    __syncthreads();

    // Each warp copies a 16-row slice of A and a 16-column slice of B
    // (B is stored transposed, so its columns are contiguous in memory).
    // Warp 7 copies only its slice of A and nothing from B.
    const half *warp_A_ptr = (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * warpId);
    const half *warp_B_ptr = (warpId < 7)
                                 ? (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * warpId)
                                 : (half *)1; // warp 7 does nothing

    // Go through the global K dimension by a fixed step at a time.
#pragma unroll
    for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) {
      // The first 16 lanes copy slices of the A matrix to shared memory,
      // while the other 16 lanes copy slices of the B matrix.
      size_t shmem_idx = laneId < 16 ? (M * warpId + laneId % 16)
                         : (warpId < 7) ? (N * warpId + laneId % 16 + shmem_idx_b_off)
                                        : 1; // warp 7 does nothing on B

      // First half of the warp copies the A matrix, the second half of the warp
      // copies the B one. (int4 is used because its size is 16 bytes.)
      int4 *lane_ptr = laneId < 16
                           ? (int4 *)(warp_A_ptr + tile_k * K + laneId % 16 * K_GLOBAL)
                           : (int4 *)(warp_B_ptr + tile_k * K + laneId % 16 * K_GLOBAL);

#pragma unroll
      for (int i = 0; i < BLOCK_COL_FRAG * K / (sizeof(int4) / sizeof(half)); i++) {
        // Copy 16 bytes at once in each lane.
        if (warpId < 7 || laneId < 16) // lanes 16-31 of warp 7 do nothing
          *((int4 *)(&shmem[shmem_idx][0]) + i) = *(lane_ptr + i);
      }

      __syncthreads();

      if (laneId == 0) finish_clock = 0;

      // Compute a grid of C matrix tiles in each warp.
#pragma unroll
      for (int k_step = 0; k_step < CHUNK_K; k_step++) {
        wmma::fragment<wmma::matrix_a, M, N, K, half, wmma::row_major> a[WARP_ROW_FRAG];
        wmma::fragment<wmma::matrix_b, M, N, K, half, wmma::col_major> b[WARP_COL_FRAG];

#pragma unroll
        for (int i = 0; i < WARP_ROW_FRAG; i++) {
          size_t shmem_idx_a = warpId * M + (i * M);
          const half *tile_ptr = &shmem[shmem_idx_a][k_step * K];

          wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_HALF);

#pragma unroll
          for (int j = 0; j < WARP_COL_FRAG; j++) {
            if (i == 0) {
              // Load the B matrix fragment once, because it is going to be
              // reused against the other A matrix fragments.
              size_t shmem_idx_b = shmem_idx_b_off + (j * N);
              const half *tile_ptr = &shmem[shmem_idx_b][k_step * K];

              wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_HALF);
            }

            if (laneId == 0) start_clock = clock();
            wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]);
            if (laneId == 0) finish_clock += (clock() - start_clock);
          }
        }
      }

      // Store the average number of clocks spent per mma_sync by this warp.
      if (laneId == 0)
        time[blockId * WARPS_PER_BLOCK + warpId] =
            finish_clock / (K_TILES * WARP_ROW_FRAG * WARP_COL_FRAG);

      __syncthreads();
    }

    // Store the D fragments to shared memory.
#pragma unroll
    for (int i = 0; i < WARP_ROW_FRAG; i++) {
#pragma unroll
      for (int j = 0; j < WARP_COL_FRAG; j++) {
        float *tile_ptr = shmem_warp_frag_ptr + i * SHMEM_STRIDE * M + j * N;
        wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, wmma::mem_row_major);
      }
    }

    __syncthreads();

    // Now that shared memory contains all the D tiles, stream them to global memory.
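    // Each lane moves one 14-byte copySize element per row, so the 32 lanes of a warp
    // stream 32 * 14 = 448 bytes = 112 floats per iteration, i.e. exactly one
    // BLOCK_COL_FRAG * N wide row of this warp's D tile.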
    float *dst_gmem_warp_stream_ptr = &D[gmem_idx];

#pragma unroll
    for (int i = 0; i < M; i++) {
      *((copySize *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
          *((copySize *)(shmem_warp_frag_ptr + SHMEM_STRIDE * i) + laneId);
    }

    __syncthreads();
  }
}

int main(int argc, char **argv) {
  cout << "BEGIN..." << endl;

  // Host-side setup: a minimal sketch assuming standard CUDA boilerplate;
  // BLOCK_NUMBER, the constant input values and the 64 KB dynamic shared-memory
  // size are assumptions here.
  const unsigned int BLOCK_NUMBER = M_TILES / BLOCK_ROW_FRAG; // one block per output band
  const size_t SHMEM_SZ = 64 * 1024;

  half *h_a = new half[M_GLOBAL * K_GLOBAL], *h_b = new half[K_GLOBAL * N_GLOBAL];
  float *h_d = new float[M_GLOBAL * N_GLOBAL];
  unsigned long long *time = new unsigned long long[BLOCK_NUMBER * WARPS_PER_BLOCK];
  for (int i = 0; i < M_GLOBAL * K_GLOBAL; i++) h_a[i] = __float2half(1.0f);
  for (int i = 0; i < K_GLOBAL * N_GLOBAL; i++) h_b[i] = __float2half(1.0f);

  half *d_a, *d_b; float *d_d; unsigned long long *d_time;
  cudaMalloc(&d_a, M_GLOBAL * K_GLOBAL * sizeof(half));
  cudaMalloc(&d_b, K_GLOBAL * N_GLOBAL * sizeof(half));
  cudaMalloc(&d_d, M_GLOBAL * N_GLOBAL * sizeof(float));
  cudaMalloc(&d_time, BLOCK_NUMBER * WARPS_PER_BLOCK * sizeof(unsigned long long));
  cudaMemcpy(d_a, h_a, M_GLOBAL * K_GLOBAL * sizeof(half), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, K_GLOBAL * N_GLOBAL * sizeof(half), cudaMemcpyHostToDevice);

  // More than 48 KB of dynamic shared memory requires an explicit opt-in.
  cudaFuncSetAttribute(wmma_ker, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ);

  cudaEvent_t start, stop;
  cudaEventCreate(&start); cudaEventCreate(&stop);
  cudaEventRecord(start);
  wmma_ker<<<BLOCK_NUMBER, THREADS_PER_BLOCK, SHMEM_SZ>>>(d_a, d_b, d_d, d_time);
  cudaEventRecord(stop);
  cout << "Kernel concluded..." << endl;

  cudaMemcpy(h_d, d_d, M_GLOBAL * N_GLOBAL * sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(time, d_time, BLOCK_NUMBER * WARPS_PER_BLOCK * sizeof(unsigned long long),
             cudaMemcpyDeviceToHost);

  /* for (int i = 0; i < BLOCK_NUMBER * WARPS_PER_BLOCK; i++) cout << time[i] << endl; */

  cudaFree(d_a); cudaFree(d_b); cudaFree(d_d); cudaFree(d_time);
  delete[] h_a; delete[] h_b; delete[] h_d; delete[] time;
  return 0;
}
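// Compile-time sanity checks of the shared-memory budget described in the comments at the
// top of this file. This is only a sketch restating arithmetic already implied by the
// #defines above; it does not change the kernel in any way.
static_assert(BLOCK_ROW_FRAG * M * BLOCK_COL_FRAG * N * sizeof(float) == 57344,
              "8x7 block of 16x16 float accumulator fragments = 57344 bytes");
static_assert((BLOCK_ROW_FRAG * M + BLOCK_COL_FRAG * N) * (CHUNK_K * K + SKEW_HALF) * sizeof(half) == 61440,
              "A (128x112) and B (112x112) half tiles plus skew fill 61440 of the 65536 bytes");
static_assert(SKEW_HALF * sizeof(half) == 32,
              "skew padding per shared-memory row is 32 bytes (8 banks)");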