Hi,
I am trying to optimize a kernel in my program which performs the following operation.
Given a triangular matrix, for each matrix column I need to perform elementwise multiplication with three arrays, then perform a sum reduction to get the result for this column.
Host code:
/*
matrix size = N x N
length of arrayI = N
length of arrayD = N-1
length of arrayF, result = N * Y
in my usecase, N,Y are approx. 2000
*/
/*
 CPU reference: for every column e1 of the strictly lower triangle,
   result[y*N + e1] += sum_{e2 > e1} arrayF[y*N + e2] * arrayI[e2]
                                      * arrayD[e2 - 1] * matrix[e2*N + e1]
 Note: accumulates into `result`, so the caller must pre-zero it.
*/
void cpu(double* result,
const double* matrix,
const double* arrayI,
const double* arrayF,
const double* arrayD,
int N,
int Y){
for(int y = 0; y < Y; y++){
const double* arrayFy = arrayF + y*N;
double* resulty = result + y*N;
for(int e2 = 1; e2 < N; e2++){
// d, i, f depend only on e2 — hoist them (and their product) out of
// the inner e1 loop. Evaluation order (f*i)*d*m is unchanged, so the
// results are bitwise identical to the unhoisted version.
const double fid = arrayFy[e2] * arrayI[e2] * arrayD[e2 - 1];
const double* matrixrow = matrix + (size_t)e2 * N;
for(int e1 = 0; e1 < e2; e1++){
resulty[e1] += fid * matrixrow[e1];
}
}
}
}
I came up with the following kernel, which gives me a 62× speedup over the CPU version on a Titan Xp.
/*
matrix size = N x N
length of arrayI = N
length of arrayD = N-1
length of arrayF, result = N * Y
*/
/*
 One thread per column e1 (grid-stride over grid.x/block.x); grid.y strides
 over the y (arrayF/result slice) dimension. Overwrites `result` — no
 pre-zeroing required. Adjacent threads read matrix[e2*N + e1] at adjacent
 e1, so the matrix loads are coalesced.
*/
__global__
void kernel1(double* __restrict__ result,
const double* __restrict__ matrix,
const double* __restrict__ arrayI,
const double* __restrict__ arrayF,
const double* __restrict__ arrayD,
int N,
int Y){
for(int y = blockIdx.y; y < Y; y += gridDim.y){
const double* arrayFy = arrayF + y*N;
double* resulty = result + y*N;
/*
Idea: switch e1 and e2 loop. for fixed e1, perform reduction in register, then assign result to global memory result
*/
for(size_t e1 = threadIdx.x + (size_t)blockIdx.x * blockDim.x; e1 < N; e1 += (size_t)blockDim.x * gridDim.x){
double e1_contribution = 0.0;
// Start at e2 = e1+1: every e2 <= e1 failed the old `if(e1 < e2)`
// test anyway, so this removes the per-iteration branch and skips
// the wasted iterations entirely.
for(size_t e2 = e1 + 1; e2 < N; e2++){
const double d = arrayD[e2 - 1];
const double i = arrayI[e2];
const double f = arrayFy[e2];
const double matrixelem = matrix[e2 * N + e1];
const double fid = f * i * d;
e1_contribution += fid * matrixelem;
}
resulty[e1] = e1_contribution;
}
}
}
As a further optimization, I cache the arrays in shared memory, which increases the speedup to 70× compared to the CPU version.
/*
matrix size = N x N
length of arrayI = N
length of arrayD = N-1
length of arrayF, result = N * Y
*/
/*
 Same computation as kernel1, but arrayI/arrayF/arrayD are staged through
 shared memory in batches of 256 e2-values. Launch like kernel1: grid.x/
 block.x stride over e1, grid.y strides over y. Overwrites `result` on the
 first batch (no pre-zeroing required), then accumulates per batch.
 Static shared memory: 3*256+1 doubles (~6 KB) per block.
*/
__global__
void kernel2(double* __restrict__ result,
const double* __restrict__ matrix,
const double* __restrict__ arrayI,
const double* __restrict__ arrayF,
const double* __restrict__ arrayD,
int N,
int Y){
constexpr int batchsize = 256;
// For the first batch smemD is staged shifted by one (smemD[k] = arrayD[k-1],
// i.e. the d belonging to e2 = k), hence the extra slot. smemD[0] is never
// read in batch 0 because e2 = 0 cannot satisfy e1 < e2.
__shared__ double smemD[batchsize+1];
__shared__ double smemi[batchsize];
__shared__ double smemF[batchsize];
const int batchiters = (N + batchsize - 1) / batchsize;
for(int y = blockIdx.y; y < Y; y += gridDim.y){
const double* arrayFy = arrayF + y*N;
double* resulty = result + y*N;
for(int iter = 0; iter < batchiters; iter++){
const int batchstart = iter * batchsize;
// Number of valid e2 values in this batch (the last batch may be short).
const int batchlen = min(batchsize, N - batchstart);
// Cooperative load of this batch of arrayI/arrayFy/arrayD into shared memory.
for(int k = threadIdx.x; k < batchsize; k += blockDim.x){
const int index = batchstart + k;
if(index < N){
smemi[k] = arrayI[index];
smemF[k] = arrayFy[index];
}
if(iter == 0){
if(index < N-1)
smemD[k+1] = arrayD[index]; // shifted: d for e2 = index+1
}else{
if(index < N)
smemD[k] = arrayD[index-1]; // unshifted: d for e2 = index
}
}
__syncthreads();
for(size_t e1 = threadIdx.x + (size_t)blockIdx.x * blockDim.x; e1 < N; e1 += (size_t)blockDim.x * gridDim.x){
double e1_contribution = 0.0;
// Only e2 > e1 contributes; fold both the triangular condition and
// the e2 < N bound into the loop limits instead of branching per
// element. (For iter == 0 this makes `first` >= 1, so the
// unwritten smemD[0] is never read.)
const int first = (e1 + 1 > (size_t)batchstart) ? (int)(e1 + 1 - batchstart) : 0;
for(int e2_iter = first; e2_iter < batchlen; e2_iter++){
const size_t e2 = batchstart + e2_iter;
const double fid = smemF[e2_iter] * smemi[e2_iter] * smemD[e2_iter];
e1_contribution += fid * matrix[e2 * N + e1];
}
// Overwrite on the first batch so callers need not pre-zero
// `result` (matches kernel1's semantics); accumulate afterwards.
// Each e1 is owned by exactly one thread, so this is race-free.
if(iter == 0){
resulty[e1] = e1_contribution;
}else{
resulty[e1] += e1_contribution;
}
}
__syncthreads(); //important!!! don't start next batch before all threads are done reading shared memory
}
}
}
Could you suggest whether it is possible to optimize this kernel further?