My tiled matrix multiplication takes about 5 s to compute A * B = C, where A, B, and C are each 16384 x 16384.
I think that is slow, but I don't know how to speed it up. The device information is below:
Device TITAN Xp
CUDA Capability Major/Minor version number: 6.1
Total amount of global memory: 11.91 GBytes (12786401280 bytes)
GPU Clock rate: 1582 MHz (1.58 GHz)
Memory Bus width: 384-bits
L2 Cache Size: 3145728 bytes
Max Texture Dimension Size (x,y,z) 1D=(131072),2D=(131072,65536),3D=(16384,16384,16384)
Max Layered Texture Size (dim) x layers 1D=(32768) x 2048,2D=(32768,32768) x 2048
Total amount of constant memory 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Maximum size of each dimension of a block: 1024 x 1024 x 64
Maximum size of each dimension of a grid: 2147483647 x 65535 x 65535
Maximum memory pitch: 2147483647 bytes
And this is the kernel function.
// Tiled single-precision matrix multiply: MatC = MatA * MatB (row-major).
//
//   MatA : M x K, element (r, c) at MatA[r * K + c]
//   MatB : K x N, element (r, c) at MatB[r * N + c]
//   MatC : M x N, element (r, c) at MatC[r * N + c]
//
// Launch layout (one thread per element of C, one block per tile of C):
//   dim3 block(TILE_SIZE, TILE_SIZE);
//   dim3 grid((N - 1) / TILE_SIZE + 1,    // x spans the columns of C (N, not K)
//             (M - 1) / TILE_SIZE + 1);   // y spans the rows of C
// blockDim.x and blockDim.y must both equal TILE_SIZE, because the shared
// tiles are indexed directly with threadIdx.
//
// Shared memory: 2 * TILE_SIZE * TILE_SIZE floats, statically allocated.
//
// M, K and N do not have to be multiples of TILE_SIZE: loads that would fall
// outside a matrix are replaced by 0.0f (which contributes nothing to the dot
// product) and threads that map outside C skip the final store.
//
// NOTE(review): every multiply-add in the inner loop needs two shared-memory
// reads and each thread produces a single output, so this kernel is expected
// to stay well below the device's peak arithmetic rate. Computing several
// outputs per thread (register tiling) or calling cuBLAS SGEMM are the usual
// next steps; both require a different launch configuration or API.
__global__ void multMatrixTile0(float *MatA, float *MatB, float *MatC, int M, int K, int N){
    __shared__ float Atile[TILE_SIZE][TILE_SIZE];
    __shared__ float Btile[TILE_SIZE][TILE_SIZE];

    // Output element owned by this thread.
    const int rowA = blockIdx.y * blockDim.y + threadIdx.y;
    const int colB = blockIdx.x * blockDim.x + threadIdx.x;
    const bool rowInRange = rowA < M;
    const bool colInRange = colB < N;

    // Number of TILE_SIZE-wide steps along the shared dimension K (ceil-div).
    const int tileNum = (K - 1) / TILE_SIZE + 1;

    float accu = 0.0f;

    for (int tileIndex = 0; tileIndex < tileNum; tileIndex++) {
        // Position of this thread's load within the current K-slice.
        const int colA = tileIndex * TILE_SIZE + threadIdx.x;
        const int rowB = tileIndex * TILE_SIZE + threadIdx.y;

        // Guarded loads: zero-fill anything outside the matrices. The flat
        // index is widened to size_t so it cannot overflow a 32-bit int for
        // large matrices.
        Atile[threadIdx.y][threadIdx.x] =
            (rowInRange && colA < K) ? MatA[(size_t)rowA * K + colA] : 0.0f;
        Btile[threadIdx.y][threadIdx.x] =
            (colInRange && rowB < K) ? MatB[(size_t)rowB * N + colB] : 0.0f;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        #pragma unroll
        for (int i = 0; i < TILE_SIZE; i++) {
            accu += Atile[threadIdx.y][i] * Btile[i][threadIdx.x];
        }

        // All threads must finish reading before the tiles are overwritten
        // in the next iteration. Every thread reaches this barrier: the loop
        // trip count is uniform and no barrier sits inside a guarded branch.
        __syncthreads();
    }

    // Threads mapped outside C took part in the loads/barriers but must not store.
    if (rowInRange && colInRange) {
        MatC[(size_t)rowA * N + colB] = accu;
    }
}
multi.cu (12.7 KB)
mymatrix.h (2.0 KB)
Thanks in advance!