Hello!

I’m completely new to CUDA programming. I’ve ported an algorithm for finding all (complex) roots of polynomials which was originally written in C, to CUDA. I’m interested in finding ways of improving the performance of the kernel.

Here is the kernel function and the function which calls the kernel:

```
// Kernel: find all complex roots of `size` polynomials of degree `deg`.
// Layout (flattened, per polynomial idx):
//   poly/alpha/points/hull hold deg+1 entries each -> offset idx*(deg+1)
//   roots/conv hold deg entries each               -> offset idx*deg
// Launched as a 1-D grid; a grid-stride loop makes any launch config valid.
__global__ void ehrlich_aberth_kernel(std::int64_t size, std::int64_t deg,
                                      const thrust::complex<double> *poly,
                                      thrust::complex<double> *roots, double *alpha, bool *conv,
                                      point *points, point *hull) {
  // Maximum Ehrlich-Aberth iterations per polynomial.
  const std::int64_t itmax = 50;
  const std::int64_t stride = static_cast<std::int64_t>(blockDim.x) * gridDim.x;
  // Grid-stride loop; see http://alexminnaar.com/2019/08/02/grid-stride-loops.html
  for (std::int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) {
    // Offsets into coefficient-sized (deg+1) and root-sized (deg) arrays.
    const std::int64_t coef_off = idx * (deg + 1);
    const std::int64_t root_off = idx * deg;
    ehrlich_aberth(poly + coef_off, roots + root_off, deg, itmax, alpha + coef_off,
                   conv + root_off, points + coef_off, hull + coef_off);
  }
}
// Function which calls the CUDA kernel
inline void apply_ehrlich_aberth(cudaStream_t stream, void **buffers, const char *opaque,
std::size_t opaque_len) {
const EhrlichAberthDescriptor &d =
*UnpackDescriptor<EhrlichAberthDescriptor>(opaque, opaque_len);
const std::int64_t size = d.size;
const std::int64_t deg = d.deg;
const thrust::complex<double> *poly =
reinterpret_cast<const thrust::complex<double> *>(buffers[0]);
thrust::complex<double> *roots = reinterpret_cast<thrust::complex<double> *>(buffers[1]);
const int block_dim = 256;
const int grid_dim = std::min<int>(1024, (size + block_dim - 1) / block_dim);
// Preallocate memory for temporary arrays used within the kernel
// allocating these arrays within the kernel with `new` results in a an illegal memory access
// error for some reason I don't understand
double *alpha;
bool *conv;
point *points;
point *hull;
cudaMalloc(&alpha, size * (deg + 1) * sizeof(double));
cudaMalloc(&conv, size * deg * sizeof(bool));
cudaMalloc(&points, size * (deg + 1) * sizeof(point));
cudaMalloc(&hull, size * (deg + 1) * sizeof(point));
ehrlich_aberth_kernel<<<grid_dim, block_dim, 0, stream>>>(size, deg, poly, roots, alpha, conv,
points, hull);
// free memory
cudaFree(alpha);
cudaFree(conv);
cudaFree(points);
cudaFree(hull);
cudaError_t cudaerr = cudaDeviceSynchronize();
if (cudaerr != cudaSuccess)
printf("kernel launch failed with error \"%s\".\n", cudaGetErrorString(cudaerr));
}
} // namespace
```

The rest of the code is here.

The `ehrlich_aberth` function takes a vector of polynomial coefficients of shape n_polynomials * (deg + 1), where deg is the degree of the polynomial, and it returns the roots of shape n_polynomials * deg. There's a test function `test_cuda.cc.cu` and a CPU version `test_cpu.cc`.

For 1M polynomials, the kernel call takes about ~60ms using the RTX 3090 card. This is at least an order of magnitude slower than I expected, because a paper implementing a similar complex polynomial solver claims a performance of ~10ms per 1M polynomials — and that's with a much older GPU (Tesla C2075). The same code compiled for the CPU with 1M polynomials takes ~7 seconds.

Here is some output from NSight compute:

**I’m wondering if there’s any low hanging fruit for optimizing this code that’s obvious to CUDA experts?**

Thanks!

EDIT: I was just looking at the specs for the Tesla C2075, and its FP64 theoretical performance is 514 GFLOPS, while for the RTX 3090 it's 556 GFLOPS. I had naively expected that the RTX 3090 would be significantly faster in double precision as well as in single precision.