In principle, it should be sufficient to transpose your coefficent matrix, and update the address calculations accordingly. But the benefits really depend on the maximum degree and coefficient datatype.
I wrote some toy example (see below) which just computes the sum of coefficients for each polynomial.
Approach 1 uses your data layout. Approach 2 uses a transposed data layout. Approach 3 uses your data layout, but uses 8 threads per polynomial. Approach 4 uses a transposed data layout, but uses 8 threads per polynomial.
3 and 4 could give you an idea how one could use multiple threads per polynomial, but this requires quite a refactoring in general (and may not even prove beneficial)
Kernel timings on a rtx 3090 reported by nsight compute (approach 1,2,3,4):
float coefficient, degree 10
Duration usecond 113.18
Duration usecond 112.51
Duration usecond 73.41
Duration usecond 96.86
float coefficient, degree 32
Duration usecond 477.12
Duration usecond 352
Duration usecond 157.89
Duration usecond 285.54
thrust::complex<double>
coefficient, degree 10
Duration usecond 328.45
Duration usecond 256.80
Duration usecond 679.90
Duration usecond 679.30
thrust::complex<double>
coefficient, degree 32
Duration msecond 1.25
Duration usecond 788.13
Duration msecond 1.35
Duration msecond 1.35
The smaller the element size and the access stride, the closer the performance of non-transposed vs transposed.
Approach 3 works nice on floats, since 8-thread float sum reduction is hardware accelerated on 3090. The same applies to approach 4, but its memory access is worse (because using multiple threads counters the effect of data transposition)
For your use case with degree 10 and double-complex type, transposition gives ~25% speedup on this example.
//nvcc -O3 -arch=sm_86 main.cu -o main
#include <vector>
#include <algorithm>
#include <iostream>
#include <cassert>
#include <thrust/complex.h>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;
using CoeffT = thrust::complex<double>;
//compute sum of coefficients per polynomial
__global__
void coeffSumKernel(CoeffT* __restrict__ output, const CoeffT* __restrict__ coeffs, int N, int maxDegree){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < N){
const CoeffT* myCoeffs = coeffs + tid * maxDegree;
CoeffT result = 0;
for(int i = 0; i < maxDegree; i++){
result += myCoeffs[i];
}
output[tid] = result;
}
}
//compute sum of coefficients per polynomial, coeffs are transposed
__global__
void coeffSumKernel2(CoeffT* __restrict__ output, const CoeffT* __restrict__ coeffs, int N, int maxDegree){
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < N){
const CoeffT* myCoeffs = coeffs + tid;
CoeffT result = 0;
for(int i = 0; i < maxDegree; i++){
result += myCoeffs[i * N];
}
output[tid] = result;
}
}
//compute sum of coefficients per polynomial, use groupsize threads (1,2,4,8,16,or 32) per polynomial
template<int groupsize>
__global__
void coeffSumKernel3(CoeffT* __restrict__ output, const CoeffT* __restrict__ coeffs, int N, int maxDegree){
auto group = cg::tiled_partition<groupsize>(cg::this_thread_block());
const int groupId = (threadIdx.x + blockIdx.x * blockDim.x) / groupsize;
if(groupId < N){
const CoeffT* myCoeffs = coeffs + groupId * maxDegree;
CoeffT result = 0;
const int numIters = (maxDegree + groupsize - 1) / groupsize;
for(int iter = 0; iter < numIters; iter++){
const int index = iter * groupsize + group.thread_rank();
CoeffT val = index < maxDegree ? myCoeffs[index] : 0;
result += cg::reduce(group, val, cg::plus<CoeffT>());
}
if(group.thread_rank() == 0){
output[groupId] = result;
}
}
}
//compute sum of coefficients per polynomial, coeffs are transposed, use groupsize threads (1,2,4,8,16,or 32) per polynomial
template<int groupsize>
__global__
void coeffSumKernel4(CoeffT* __restrict__ output, const CoeffT* __restrict__ coeffs, int N, int maxDegree){
auto group = cg::tiled_partition<groupsize>(cg::this_thread_block());
const int groupId = (threadIdx.x + blockIdx.x * blockDim.x) / groupsize;
if(groupId < N){
const CoeffT* myCoeffs = coeffs + groupId;
CoeffT result = 0;
const int numIters = (maxDegree + groupsize - 1) / groupsize;
for(int iter = 0; iter < numIters; iter++){
const int index = iter * groupsize + group.thread_rank();
CoeffT val = index < maxDegree ? myCoeffs[index * N] : 0;
result += cg::reduce(group, val, cg::plus<CoeffT>());
}
if(group.thread_rank() == 0){
output[groupId] = result;
}
}
}
int main(){
const int N = 1000000;
const int maxDegree = 10;
const int maxCoeffs = N * maxDegree;
//Approach 1, this is your current data layout
std::vector<CoeffT> coeffs1(maxCoeffs);
for(int i = 0; i < N; i++){
for(int j = 0; j < maxDegree; j++){
coeffs1[i * maxDegree + j] = i;
}
}
CoeffT* d_coeffs1;
CoeffT* d_result1;
cudaMalloc(&d_coeffs1, sizeof(CoeffT) * maxCoeffs);
cudaMalloc(&d_result1, sizeof(CoeffT) * N);
cudaMemcpy(d_coeffs1, coeffs1.data(), sizeof(CoeffT) * maxCoeffs, cudaMemcpyHostToDevice);
coeffSumKernel<<<(maxCoeffs + 127)/128, 128>>>(d_result1, d_coeffs1, N, maxDegree);
std::vector<CoeffT> result1(N);
cudaMemcpy(result1.data(), d_result1, sizeof(CoeffT) * N, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
// for(int i = 0; i < N; i++){
// if(result1[i] != i * maxDegree){
// printf("%f %d\n",result1[i], i * maxDegree);
// }
// assert(result1[i] == i * maxDegree);
// }
cudaFree(d_coeffs1);
cudaFree(d_result1);
//Approach 2, transpose the coefficient matrix. the coefficients of polynomial i are stored in the i-th column
std::vector<CoeffT> coeffs2(maxCoeffs);
for(int i = 0; i < N; i++){
for(int j = 0; j < maxDegree; j++){
coeffs2[i + j * N] = i;
}
}
CoeffT* d_coeffs2;
CoeffT* d_result2;
cudaMalloc(&d_coeffs2, sizeof(CoeffT) * maxCoeffs);
cudaMalloc(&d_result2, sizeof(CoeffT) * N);
cudaMemcpy(d_coeffs2, coeffs2.data(), sizeof(CoeffT) * maxCoeffs, cudaMemcpyHostToDevice);
coeffSumKernel2<<<(maxCoeffs + 127)/128, 128>>>(d_result2, d_coeffs2, N, maxDegree);
std::vector<CoeffT> result2(N);
cudaMemcpy(result2.data(), d_result2, sizeof(CoeffT) * N, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
//assert(result2 == result1);
cudaFree(d_coeffs2);
cudaFree(d_result2);
//Approach 3, your data layout, use multiple threads per polynomial
constexpr int numThreadsPerPoly3 = 8;
std::vector<CoeffT> coeffs3(maxCoeffs);
for(int i = 0; i < N; i++){
for(int j = 0; j < maxDegree; j++){
coeffs3[i * maxDegree + j] = i;
}
}
CoeffT* d_coeffs3;
CoeffT* d_result3;
cudaMalloc(&d_coeffs3, sizeof(CoeffT) * maxCoeffs);
cudaMalloc(&d_result3, sizeof(CoeffT) * N);
cudaMemcpy(d_coeffs3, coeffs3.data(), sizeof(CoeffT) * maxCoeffs, cudaMemcpyHostToDevice);
coeffSumKernel3<numThreadsPerPoly3><<<((N * numThreadsPerPoly3) + 127)/128, 128>>>(d_result3, d_coeffs3, N, maxDegree);
std::vector<CoeffT> result3(N);
cudaMemcpy(result3.data(), d_result3, sizeof(CoeffT) * N, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
//assert(result3 == result1);
cudaFree(d_coeffs3);
cudaFree(d_result3);
//Approach 4, transpose the coefficient matrix. use multiple threads per polynomial
constexpr int numThreadsPerPoly4 = 8;
std::vector<CoeffT> coeffs4(maxCoeffs);
for(int i = 0; i < N; i++){
for(int j = 0; j < maxDegree; j++){
coeffs4[i + j * N] = i;
}
}
CoeffT* d_coeffs4;
CoeffT* d_result4;
cudaMalloc(&d_coeffs4, sizeof(CoeffT) * maxCoeffs);
cudaMalloc(&d_result4, sizeof(CoeffT) * N);
cudaMemcpy(d_coeffs4, coeffs4.data(), sizeof(CoeffT) * maxCoeffs, cudaMemcpyHostToDevice);
coeffSumKernel4<numThreadsPerPoly4><<<((N * numThreadsPerPoly4) + 127)/128, 128>>>(d_result4, d_coeffs4, N, maxDegree);
std::vector<CoeffT> result4(N);
cudaMemcpy(result4.data(), d_result4, sizeof(CoeffT) * N, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
//assert(result4 == result1);
cudaFree(d_coeffs4);
cudaFree(d_result4);
}