Okay, so I got something I think is worth sharing.
I first wrote a driver that runs 4 implementations and measures the time and correctness of the results: 1. host version, 2. sample version (from sdk/programming guide), 3. cublas version, 4. denoir version.
Then I began on my own “subdivide on k” implementation.
First let me say my original implementation where each block computes a full [m x n] in shared memory using only some of the k rows did not perform well at all. It took about 4x as long as the denoir implementation, and I believe part of the reason is the very low occupancy (one block per multiprocessor). I might have tried to optimize it (bank conflicts, etc) but it has some inherent problems and it cannot run at all when m and n exceed the shared memory, which will happen even when m and n are not that big.
The matrix multiply example in the programming guide subdivides the problem on m and n, but not on k. I decided to write an implementation that subdivides on m and n and k, so that when m and n are small, it can still launch many blocks. It subdivides m into pieces of SUBDIV_M, and n into SUBDIV_N, and with k, each block processes SUBDIV_K at a time, but will accumulate much more than that. The entire k is divided into Q roughly equal size pieces, and each block processes the k/Q rows SUBDIV_K at a time.
My implementation requires m and n to be multiples of SUBDIV_M and SUBDIV_N, so I am running a case where m=64, n=16, and k = 256*79
On my 9800GTX:
host mmult time: 46.735200 (ms)
simple mmult time: 2.007814 (ms)
Total Errors = 0
cublas mmult time: 4.031665 (ms)
Total Errors = 0
denoir mmult time: 2.885172 (ms)
Total Errors = 0
jamie mmult time: 4.738407 (ms)
Total Errors = 0
jamie2 mmult time: 1.168706 (ms)
Total Errors = 0
In another thread, vvolkov mentioned that “shared memory is substantially slower than registers,” so I also attempted to minimize accesses to shared memory. Each block loads the matrix data into shared memory, and by further subdividing the block and keeping things in registers, I average 1 shared memory access per madd instead of 4. I think this also makes it easier to avoid bank conflicts.
Here it is:
[codebox]
#include "ctimer.h"
// Tiling parameters.
// Each block produces a SUBDIV_M x SUBDIV_N chunk of output, but doesn't
// necessarily scan through the entire K rows (the k range is split across Q blocks).
static const int SUBDIV_M = 16;
static const int SUBDIV_N = 16;
static const int SUBDIV_K = 16;
// Target total number of blocks; used by the host wrapper to pick Q.
static const int DESIRED_BLOCKS = 256;
// Row stride of the shared accumulator; the +1 padding staggers rows across
// shared-memory banks to reduce bank conflicts on column accesses.
static const int ACCWIDTH = SUBDIV_M+1;
// Per-thread register tile: each thread computes a SUBSUB_N x SUBSUB_M sub-block.
#define SUBSUB_M 2
#define SUBSUB_N 2
// Threads per block: one thread per register tile of the output chunk.
static const int NTHREADS = (SUBDIV_M/SUBSUB_M)*(SUBDIV_N/SUBSUB_N);
// Accumulate one SUBSUB_N x SUBSUB_M (2x2) sub-sub-block of the product into
// the shared accumulator. The four partial sums are kept in registers for the
// whole SUBDIV_K sweep, so the loop averages about one shared-memory access
// per madd instead of four.
//
// acc:        shared accumulator tile, [SUBDIV_N][ACCWIDTH] (padded row stride)
// nrow_offs:  this thread's row offset (n direction) into the tile
// mcol_offs:  this thread's column offset (m direction) into the tile
// loader_col: per-thread starting phase for the k sweep; staggering kk across
//             threads reduces shared-memory bank conflicts
// s_B, s_A:   shared input tiles, each row holding SUBDIV_K consecutive k values
__device__ void ssacc_2x2(float acc[][ACCWIDTH], int nrow_offs, int mcol_offs,
                          int loader_col, float s_B[][SUBDIV_K], float s_A[][SUBDIV_K]) {
    float ssa_0_0 = 0;
    float ssa_0_1 = 0;
    float ssa_1_0 = 0;
    float ssa_1_1 = 0;
    for (int i=0; i < SUBDIV_K; i++) {
        // Rotate the k index by loader_col so different threads hit different banks.
        int kk = (i+loader_col) % SUBDIV_K;
        float ssB_0 = s_B[0+nrow_offs][kk];
        float ssB_1 = s_B[1+nrow_offs][kk];
        float ssA = s_A[0+mcol_offs][kk];
        ssa_0_0 += ssB_0 * ssA;
        ssa_1_0 += ssB_1 * ssA;
        ssA = s_A[1+mcol_offs][kk];
        ssa_0_1 += ssB_0 * ssA;
        ssa_1_1 += ssB_1 * ssA;
    }
    // Fold the register sums back into the shared accumulator (one write each).
    acc[0 + nrow_offs][0 + mcol_offs] += ssa_0_0;
    acc[0 + nrow_offs][1 + mcol_offs] += ssa_0_1;
    acc[1 + nrow_offs][0 + mcol_offs] += ssa_1_0;
    acc[1 + nrow_offs][1 + mcol_offs] += ssa_1_1;
}
// Computes partial products for C = A^T * B, subdividing on m, n AND k.
// The host launches M/SUBDIV_M x (N/SUBDIV_N * Q) blocks, where Q is determined
// at run time (may be 1):
//   dim3 theGrid(M/SUBDIV_M, N/SUBDIV_N * Q);
// Block shape must be (SUBDIV_M/SUBSUB_M, SUBDIV_N/SUBSUB_N).
// Each block accumulates one SUBDIV_M x SUBDIV_N output tile over its slice of
// the k range (split into Q roughly equal pieces, processed SUBDIV_K columns at
// a time) and writes the partial tile into slice qofQ of g_C; sumC reduces the
// Q slices afterwards.
// Assumes M % SUBDIV_M == 0 and N % SUBDIV_N == 0; K should be a multiple of
// SUBDIV_K, otherwise the final tile load reads past the end of a row.
__global__ void aTbKernel3(float *g_A, float *g_B, float *g_C, int M, int N, int K, int Q) {
    // Advance the input pointers to this block's tile. Rows of g_A and g_B are
    // K elements long (k is the fastest-varying dimension of both inputs).
    int starting_m = blockIdx.x * SUBDIV_M;
    g_A += starting_m * K;
    int starting_n = (blockIdx.y / Q) * SUBDIV_N;
    g_B += starting_n * K;
    // blockIdx.y also encodes which of the Q k-slices this block handles;
    // each slice accumulates into its own M x N region of g_C.
    int qofQ = blockIdx.y % Q;
    g_C += qofQ * M * N;
    // Size of each k-slice, rounded up to a whole number of SUBDIV_K tiles,
    // then clamped so the last slice stops at K.
    int k_per_Q = (K + Q - 1)/Q;
    k_per_Q = (k_per_Q + SUBDIV_K - 1)/SUBDIV_K * SUBDIV_K;
    int starting_k = qofQ * k_per_Q;
    int ending_k = starting_k + k_per_Q;
    if (ending_k > K) {
        ending_k = K;
    }
    // Each thread owns a SUBSUB_N x SUBSUB_M sub-block of the output tile.
    int nrow_offs = threadIdx.y * SUBSUB_N;
    int mcol_offs = threadIdx.x * SUBSUB_M;
    // naive clearing of accum matrix (has bank conflicts)
    __shared__ float accum[SUBDIV_N][ACCWIDTH];
    for (int nrow=0; nrow < SUBSUB_N; nrow++) {
        for (int mcol=0; mcol < SUBSUB_M; mcol++) {
            accum[nrow + nrow_offs][mcol + mcol_offs] = 0;
        }
    }
    __shared__ float s_A[SUBDIV_M][SUBDIV_K];
    __shared__ float s_B[SUBDIV_N][SUBDIV_K];
    // Flat thread index used to assign tile-loading work. Note blockDim.x is
    // SUBDIV_M/SUBSUB_M here (one thread per register tile), not SUBDIV_M.
    int tid = blockDim.x*threadIdx.y + threadIdx.x;
    int loader_col = tid % SUBDIV_K;
    int loader_startrow = tid / SUBDIV_K;
    for (int current_k=starting_k; current_k < ending_k; current_k += SUBDIV_K) {
        // load SUBDIV_K columns of A into s_A
        for (int row=loader_startrow; row < SUBDIV_M; row += NTHREADS/SUBDIV_K) {
            s_A[row][loader_col] = g_A[row*K+current_k+loader_col];
        }
        // load SUBDIV_K columns of B into s_B
        for (int row=loader_startrow; row < SUBDIV_N; row += NTHREADS/SUBDIV_K) {
            s_B[row][loader_col] = g_B[row*K+current_k+loader_col];
        }
        __syncthreads();
        if (true) {
            // Fastest variant: 2x2 register-tiled accumulation.
            ssacc_2x2(accum, nrow_offs, mcol_offs, loader_col, s_B, s_A);
        }
        else if (false) {
            // Alternative kept for reference: arrays do not get optimized to
            // registers, even though (with unrolling) all indices would be constant.
            float ssaccum[SUBSUB_N][SUBSUB_M];
            for (int nn=0; nn < SUBSUB_N; nn++) {
                for (int mm=0; mm< SUBSUB_M; mm++) {
                    ssaccum[nn][mm] = 0;
                }
            }
            for (int i=0; i < SUBDIV_K; i++) {
                int kk = (i+loader_col) % SUBDIV_K;
                float ss_BN[SUBSUB_N];
                for (int nn=0; nn < SUBSUB_N; nn++) {
                    ss_BN[nn] = s_B[nn+nrow_offs][kk];
                }
                for (int mm=0; mm< SUBSUB_M; mm++) {
                    float mval = s_A[mm+mcol_offs][kk];
                    for (int nn=0; nn < SUBSUB_N; nn++) {
                        ssaccum[nn][mm] += mval*ss_BN[nn];
                    }
                }
            }
            for (int nrow=0; nrow < SUBSUB_N; nrow++) {
                for (int mcol=0; mcol < SUBSUB_M; mcol++) {
                    accum[nrow + nrow_offs][mcol + mcol_offs] += ssaccum[nrow][mcol];
                }
            }
        }
        else {
            // Alternative kept for reference: naive summing into accum,
            // lots of shared access, and bank conflicts.
            for (int nrow=0; nrow < SUBSUB_N; nrow++) {
                for (int mcol=0; mcol < SUBSUB_M; mcol++) {
                    float total = 0;
                    for (int kcol=0; kcol < SUBDIV_K; kcol++) {
                        total += s_A[mcol + mcol_offs][kcol]*s_B[nrow + nrow_offs][kcol];
                    }
                    accum[nrow + nrow_offs][mcol + mcol_offs] += total;
                }
            }
        }
        // Barrier before the next iteration reloads s_A/s_B: without it, fast
        // threads overwrite the tiles while slow threads are still reading them.
        __syncthreads();
    }
    // copy accumulated buffer to output; each thread writes only the accum
    // entries it computed itself, so no further barrier is needed here.
    // Output is stored column-major within each M x N slice.
    for (int nrow=0; nrow < SUBSUB_N; nrow++) {
        for (int mcol=0; mcol < SUBSUB_M; mcol++) {
            g_C[M*(nrow + nrow_offs + starting_n) + mcol + mcol_offs + starting_m] = accum[nrow + nrow_offs][mcol + mcol_offs];
        }
    }
}
// Reduces the Q partial M x N output slices produced by aTbKernel3 into slice 0
// of g_C. The host launches one thread per output element:
//   dim3 sumGrid(M/SUBDIV_M, N/SUBDIV_N); dim3 sumThreadBlock(SUBDIV_M, SUBDIV_N);
__global__ void sumC(float *g_C, int M, int N, int Q) {
    int threadm = blockIdx.x*blockDim.x+threadIdx.x;
    int threadn = blockIdx.y*blockDim.y+threadIdx.y;
    // Bounds guard: a no-op with the launch config above (M, N are multiples of
    // the block dims), but keeps the kernel safe for other launch shapes.
    if (threadm >= M || threadn >= N) {
        return;
    }
    float sum = 0;
    for (int q=0; q < Q; q++) {
        sum += g_C[q*M*N + M*threadn + threadm];
    }
    g_C[M*threadn + threadm] = sum;
}
// Host wrapper: computes C = At^T * B on the device and returns the average
// kernel time (ms) over ITERATIONS runs (timing excludes the transfers and
// the final transpose).
// C:  output, hA x wB, row-major
// At: input A stored transposed (k-major), so d_A is k x m
// B:  input, k x n (k-major)
// hA/wA/wB: m, k, n respectively; m and n must be multiples of
// SUBDIV_M / SUBDIV_N (see the kernel preconditions).
double mmult_jamie2(float* C, const float* At, const float* B, unsigned int hA, unsigned int wA, unsigned int wB) {
unsigned int m = hA;
unsigned int k = wA;
unsigned int n = wB;
int hC = hA;
int wC = wB;
// Choose Q so that mnblocks * Q is at least DESIRED_BLOCKS, keeping the GPU
// busy even when the m x n tile grid alone would be small.
int mnblocks = (m/SUBDIV_M) * (n/SUBDIV_N);
int Q = (DESIRED_BLOCKS + mnblocks - 1)/mnblocks;
// Q partial output slices; sumC reduces them into slice 0.
float *tC = (float *)malloc(hC*wC*sizeof(float)*Q);
// Matrix / cudaMalocMatrix / cudaFreeMatrix are project helpers (spelling as
// in the project); R/C are row/column counts, E the device element buffer.
Matrix d_A, d_B, d_C;
d_A.R = k;
d_A.C = m;
d_B.R = k;
d_B.C = n;
d_C.R = m*Q;
d_C.C = n;
cudaMalocMatrix(&d_A);
cudaMalocMatrix(&d_B);
cudaMalocMatrix(&d_C);
cudaMemcpy(d_A.E, At, d_A.size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B.E, B, d_B.size, cudaMemcpyHostToDevice);
cudaMemset(d_C.E, 0, d_C.size);
// Grid/block shapes matching the kernels' documented launch requirements.
dim3 theGrid(m/SUBDIV_M, n/SUBDIV_N * Q);
dim3 threadBlock(SUBDIV_M/SUBSUB_M, SUBDIV_N/SUBSUB_N);
dim3 sumGrid(m/SUBDIV_M, n/SUBDIV_N);
dim3 sumThreadBlock(SUBDIV_M, SUBDIV_N);
struct ctimer timer1;
restart(&timer1);
for (int i=0; i < ITERATIONS; i++) {
// d_A.C = m, d_B.C = n, d_B.R = k.
aTbKernel3<<< theGrid, threadBlock >>>(d_A.E, d_B.E, d_C.E, d_A.C, d_B.C, d_B.R, Q);
// Kernel launches are asynchronous; synchronize so the timer measures them.
cudaThreadSynchronize();
sumC<<< sumGrid, sumThreadBlock >>>(d_C.E, d_A.C, d_B.C, Q);
cudaThreadSynchronize();
}
double elapsed = stop(&timer1)/ITERATIONS;
cudaMemcpy(tC, d_C.E, d_C.size, cudaMemcpyDeviceToHost);
// transpose tC into C: the kernel writes column-major slices; slice 0 holds
// the reduced result, and C is row-major.
for (int r=0; r < hC; r++) {
for (int c=0; c < wC; c++) {
C[r*wC+c] = tC[c*hC+r];
}
}
free(tC);
cudaFreeMatrix(&d_A);
cudaFreeMatrix(&d_B);
cudaFreeMatrix(&d_C);
return elapsed;
}
[/codebox]
I experimented with different values for the parameters at the top, and the values are about the best on my 9800GTX. Larger SUBDIV_M and SUBDIV_N should offer lower occupancy due to shared memory needs, but lower overall bandwidth needs due to fewer total loads from global memory. Larger SUBSUB_M and SUBSUB_N requires fewer shared memory loads within the sub-sub accumulation, but needs more registers, and results in fewer threads per block, again lowering threads per multiprocessor.
I haven’t calculated what a bandwidth-limited theoretical best would be. I suspect it’s not close, but would make an interesting comparison.