How to optimize this kernel

Hi,
I am trying to optimize a kernel in my program which performs the following operation.
Given a triangular matrix, for each matrix column I need to perform elementwise multiplication with three arrays, then perform a sum reduction to get the result for this column.

Host code:

/*
    matrix size = N x N
    length of arrayI = N
    length of arrayD = N-1
    length of arrayF, result = N * Y

    in my usecase, N,Y are approx. 2000
*/
/*
 * CPU reference implementation.
 *
 * For each of the Y columns and each row index e1, accumulates into result:
 *     result[y*N + e1] += sum over e2 in (e1, N) of
 *         arrayF[y*N + e2] * arrayI[e2] * arrayD[e2 - 1] * matrix[e2*N + e1]
 *
 * i.e. only the strict lower triangle of `matrix` (e2 > e1) contributes.
 * NOTE: accumulates with `+=`, so `result` must be initialized by the caller
 * (typically zeroed) before the call.
 *
 * result  : N * Y doubles (output, accumulated into)
 * matrix  : N x N doubles (row-major, row index e2)
 * arrayI  : N doubles
 * arrayF  : N * Y doubles (one length-N slice per column y)
 * arrayD  : N-1 doubles
 */
void cpu(double* result,
        const double* matrix,
        const double* arrayI,
        const double* arrayF,
        const double* arrayD,
        int N,
		int Y){

	for(int y = 0; y < Y; y++){

		const double* arrayFy = arrayF + y*N;
		double* resulty = result + y*N;

		for(int e2 = 1; e2 < N; e2++){
			// d, i and f depend only on e2 — hoist the product out of the
			// e1 loop. Multiplication order (f * i) * d matches the original,
			// so results are bit-identical.
			const double fid = arrayFy[e2] * arrayI[e2] * arrayD[e2 - 1];
			const double* matrixrow = matrix + (size_t)e2 * N;

			for(int e1 = 0; e1 < e2; e1++){
				resulty[e1] += fid * matrixrow[e1];
			}
		}
	}
}

I came up with the following kernel, which gives me a speedup of 62× over the CPU version on a Titan Xp.

/*
    matrix size = N x N
    length of arrayI = N
    length of arrayD = N-1
    length of arrayF, result = N * Y
*/
/*
 * For each column y and each row index e1 computes
 *     result[y*N + e1] = sum over e2 in (e1, N) of
 *         arrayF[y*N + e2] * arrayI[e2] * arrayD[e2 - 1] * matrix[e2*N + e1]
 * OVERWRITES result — no zero-initialization required by the caller.
 *
 * Launch layout: 2D grid. blockIdx.x/blockDim.x grid-stride over e1,
 * blockIdx.y grid-strides over the Y independent columns. No shared memory.
 *
 * matrix  : N x N, arrayI : N, arrayD : N-1, arrayF/result : N * Y
 */
__global__
void kernel1(double* result,
        const double* matrix,
        const double* arrayI,
        const double* arrayF,
        const double* arrayD,
        int N,
   		int Y){

	// grid-stride loop over the Y independent columns
	for(int y = blockIdx.y; y < Y; y += gridDim.y){

		const double* arrayFy = arrayF + y*N;
		double* resulty = result + y*N;

		// grid-stride loop over e1; each thread reduces its e1 in a register
		// and writes the finished sum to global memory once
		for(size_t e1 = threadIdx.x + blockIdx.x * blockDim.x; e1 < N; e1 += blockDim.x * gridDim.x){
			double e1_contribution = 0.0;

			// Only e2 > e1 contributes (strict lower triangle), so start the
			// loop at e1 + 1 instead of iterating all of [1, N) and testing
			// `if(e1 < e2)` every time: same arithmetic, same result, but no
			// per-iteration branch and no dead iterations for large e1.
			for(size_t e2 = e1 + 1; e2 < (size_t)N; e2++){
				const double d = arrayD[e2 - 1];
				const double i = arrayI[e2];
				const double f = arrayFy[e2];
				const double matrixelem = matrix[e2 * N + e1];
				const double fid = f * i * d;

				e1_contribution += fid * matrixelem;
			}

			resulty[e1] = e1_contribution;
		}
	}
}

As a further optimization, I cache the arrays in shared memory, which increases the speedup to 70× compared to the CPU version.

/*
    matrix size = N x N
    length of arrayI = N
    length of arrayD = N-1
    length of arrayF, result = N * Y
*/
/*
 * Shared-memory-tiled variant of kernel1: the per-e2 inputs (arrayI, arrayFy,
 * arrayD) are staged into shared memory in batches of `batchsize` so all
 * threads of the block read them from shared memory instead of global memory.
 *
 * Semantics match kernel1: result is OVERWRITTEN (the first batch assigns,
 * later batches accumulate), so callers no longer need to zero-initialize
 * `result` — this fixes the hidden precondition the previous `+=`-only
 * version had, and makes kernel2 interchangeable with kernel1.
 *
 * Launch layout: 2D grid. blockIdx.x/blockDim.x grid-stride over e1,
 * blockIdx.y grid-strides over the Y columns.
 * Static shared memory: (3*batchsize + 1) doubles per block.
 *
 * matrix  : N x N, arrayI : N, arrayD : N-1, arrayF/result : N * Y
 */
__global__
void kernel2(double* result,
        const double* matrix,
        const double* arrayI,
        const double* arrayF,
        const double* arrayD,
        int N,
   		int Y){

	constexpr int batchsize = 256;

	// +1 slot: the iter==0 pass stores arrayD shifted up by one (see below)
	__shared__ double smemD[batchsize+1];
	__shared__ double smemi[batchsize];
	__shared__ double smemF[batchsize];

	const int batchiters = (N + batchsize - 1) / batchsize;  // ceil(N / batchsize)

	for(int y = blockIdx.y; y < Y; y += gridDim.y){

		const double* arrayFy = arrayF + y*N;
		double* resulty = result + y*N;

		for(int iter = 0; iter < batchiters; iter++){

			// Cooperative load of batch `iter` into shared memory.
			// Invariant established here: smemD[e2_iter] == arrayD[e2 - 1]
			// for e2 = iter*batchsize + e2_iter.
			for(int i = threadIdx.x; i < batchsize; i += blockDim.x){
				const int index = iter * batchsize + i;
				if(index < N){
					smemi[i] = arrayI[index];
					smemF[i] = arrayFy[index];
				}
				if(iter == 0){
					// first batch: shift by one so slot e2 holds arrayD[e2-1];
					// smemD[0] is never read because e1 < e2 fails for e2 == 0
					if(index < N-1)
						smemD[i+1] = arrayD[index];
				}else{
					if(index < N)
						smemD[i] = arrayD[index-1];
				}
			}

			__syncthreads();  // tiles must be fully written before any thread reads them

			// grid-stride loop over e1; each thread reduces this batch's
			// contribution for its e1 in a register
			for(size_t e1 = threadIdx.x + blockIdx.x * blockDim.x; e1 < N; e1 += blockDim.x * gridDim.x){
				double e1_contribution = 0.0;

				for(size_t e2_iter = 0; e2_iter < batchsize; e2_iter++){
					const size_t e2 = iter * batchsize + e2_iter;
					if(e2 < N && e1 < e2){  // strict lower triangle only
						const double d = smemD[e2_iter];
						const double i = smemi[e2_iter];
						const double f = smemF[e2_iter];
						const double matrixelem = matrix[e2 * N + e1];
						const double fid = f * i * d;

						e1_contribution += fid * matrixelem;
					}
				}

				// First batch overwrites, later batches accumulate: gives
				// kernel1's overwrite semantics without requiring the caller
				// to pre-zero `result`.
				if(iter == 0){
					resulty[e1] = e1_contribution;
				}else{
					resulty[e1] += e1_contribution;
				}
			}
			__syncthreads(); // don't refill the shared tiles while threads still read them
		}
	}
}

Could you suggest whether it is possible to optimize this kernel further?

Run your code through the profiler.

Use the NVIDIA Visual Profiler (nvvp), or the CUDA profiling integration in Visual Studio or Eclipse.
There’s a guided optimization that highlights only the apparent problems with the kernel without presenting too much irrelevant information.

Christian