I’m not sure this is well-written code, and I’m not sure I would spend much time doing perf analysis on it.
For example, this line:
c[cx*bLen+cy] = cValue;
will create uncoalesced access: assuming cx varies with threadIdx.x, adjacent threads in a warp write to locations that are bLen elements apart rather than to adjacent locations.
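To illustrate (this is only a sketch; I'm assuming cx is derived from threadIdx.x and cy from threadIdx.y, which is what the uncoalesced behavior suggests), compare the two store patterns:

__global__ void store_patterns(float *c, int bLen, float cValue)
{
    int cx = blockIdx.x * blockDim.x + threadIdx.x; // varies fastest within a warp
    int cy = blockIdx.y * blockDim.y + threadIdx.y;
    // Uncoalesced: adjacent threads (consecutive cx) write addresses
    // that are bLen floats apart, so one warp's stores span many memory segments.
    c[cx * bLen + cy] = cValue;
    // Coalesced alternative: adjacent threads write adjacent addresses,
    // so a warp's 32 stores fall into one or two segments.
    // c[cy * bLen + cx] = cValue;
}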
You can find what I believe is better code in the programming guide that does a similar thing:
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory
I would use that as a starting point.
When I run a test similar to what you described, using that code, I see no significant difference between the two cases:
$ cat t281.cu
#include <iostream>
#define BLOCK_SIZE 16
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                + BLOCK_SIZE * col];
    return Asub;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size,
               cudaMemcpyHostToDevice);
    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size,
               cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    // Each thread computes one element of Csub
    // by accumulating results into Cvalue
    float Cvalue = 0;
    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Loop over all the sub-matrices of A and B that are
    // required to compute Csub
    // Multiply each pair of sub-matrices together
    // and accumulate the results
    for (int m = 0; m < (A.width / BLOCK_SIZE); ++m) {
        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, m, blockCol);
        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load Asub and Bsub from device memory to shared memory
        // Each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Synchronize to make sure the sub-matrices are loaded
        // before starting the computation
        __syncthreads();
        // Multiply Asub and Bsub together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write Csub to device memory
    // Each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}
int main(){
#ifndef ALT
    int m = 3200;
    int k = 256;
#else
    int m = 256;
    int k = 3200;
#endif
    int n = 64;
    Matrix A,B,C;
    A.width = A.stride = n;
    A.height = m;
    A.elements = new float[A.width*A.height];
    B.width = B.stride = k;
    B.height = n;
    B.elements = new float[B.width*B.height];
    C.width = C.stride = k;
    C.height = m;
    C.elements = new float[C.width*C.height];
    MatMul(A, B, C);
    return 0;
}
$ nvcc -arch=sm_52 -lineinfo -o t281 t281.cu
$ cuda-memcheck ./t281
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t281
==1735== NVPROF is profiling process 1735, command: ./t281
==1735== Profiling application: ./t281
==1735== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 81.78% 4.0170ms 1 4.0170ms 4.0170ms 4.0170ms [CUDA memcpy DtoH]
10.60% 520.52us 2 260.26us 29.664us 490.85us [CUDA memcpy HtoD]
7.63% 374.60us 1 374.60us 374.60us 374.60us MatMulKernel(Matrix, Matrix, Matrix)
API calls: 96.84% 243.29ms 3 81.098ms 257.74us 242.51ms cudaMalloc
2.57% 6.4602ms 3 2.1534ms 68.316us 5.9930ms cudaMemcpy
0.26% 653.75us 3 217.92us 188.27us 269.12us cudaFree
0.21% 532.88us 94 5.6680us 290ns 230.81us cuDeviceGetAttribute
0.04% 106.85us 1 106.85us 106.85us 106.85us cuDeviceTotalMem
0.04% 95.200us 1 95.200us 95.200us 95.200us cuDeviceGetName
0.03% 67.151us 1 67.151us 67.151us 67.151us cudaLaunch
0.00% 8.8550us 3 2.9510us 320ns 7.7800us cudaSetupArgument
0.00% 7.3150us 3 2.4380us 345ns 5.6450us cuDeviceGetCount
0.00% 4.2050us 2 2.1020us 570ns 3.6350us cuDeviceGet
0.00% 1.6750us 1 1.6750us 1.6750us 1.6750us cudaConfigureCall
$ nvcc -arch=sm_52 -lineinfo -o t281 t281.cu -DALT
$ cuda-memcheck ./t281
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
[bob@fed20 misc]$ nvprof ./t281
==1793== NVPROF is profiling process 1793, command: ./t281
==1793== Profiling application: ./t281
==1793== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 81.76% 4.0408ms 1 4.0408ms 4.0408ms 4.0408ms [CUDA memcpy DtoH]
10.53% 520.62us 2 260.31us 29.633us 490.98us [CUDA memcpy HtoD]
7.70% 380.65us 1 380.65us 380.65us 380.65us MatMulKernel(Matrix, Matrix, Matrix)
API calls: 96.76% 239.13ms 3 79.711ms 249.00us 238.36ms cudaMalloc
2.63% 6.5034ms 3 2.1678ms 83.395us 6.0414ms cudaMemcpy
0.26% 651.52us 3 217.17us 186.20us 271.97us cudaFree
0.22% 532.88us 94 5.6680us 295ns 231.78us cuDeviceGetAttribute
0.06% 143.31us 1 143.31us 143.31us 143.31us cuDeviceGetName
0.04% 89.436us 1 89.436us 89.436us 89.436us cuDeviceTotalMem
0.03% 66.992us 1 66.992us 66.992us 66.992us cudaLaunch
0.00% 9.1650us 3 3.0550us 345ns 7.9650us cudaSetupArgument
0.00% 6.3650us 3 2.1210us 340ns 4.6700us cuDeviceGetCount
0.00% 4.1450us 2 2.0720us 465ns 3.6800us cuDeviceGet
0.00% 1.8850us 1 1.8850us 1.8850us 1.8850us cudaConfigureCall
$
Fedora 20, CUDA 9.1, GTX 960
Note that if you are actually interested in the fastest possible matrix multiply performance, I encourage you to use cuBLAS, not this naive kernel. I assume this is for learning purposes.
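For reference, here is a minimal sketch of how the same row-major product C(mxk) = A(mxn) * B(nxk) from the test code above could be done with cuBLAS (error checking omitted; compile with -lcublas). Because cuBLAS assumes column-major storage, the call computes C^T = B^T * A^T by swapping the operands:

#include <cublas_v2.h>

// d_A, d_B, d_C are device buffers already allocated and populated,
// laid out row-major as in the Matrix struct above.
void MatMulCublas(const float *d_A, const float *d_B, float *d_C,
                  int m, int n, int k)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // cuBLAS sees the row-major buffers as the transposed (column-major)
    // matrices, so we ask it for C^T = B^T * A^T instead of C = A * B.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                k, m, n,
                &alpha,
                d_B, k,   // B^T is k x n column-major, leading dimension k
                d_A, n,   // A^T is n x m column-major, leading dimension n
                &beta,
                d_C, k);  // C^T is k x m column-major, leading dimension k
    cublasDestroy(handle);
}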