Hi, I ran into something interesting when converting a simulation from cuComplex.h to thrust/complex.h. With cuComplex, I had overloaded the basic math operations myself for mixed double/complex arithmetic, which shouldn't be necessary with thrust. Or so I thought.
It turns out that when you divide a real double-precision number by a complex double, the calculation is a bit inefficient, and I think I know what’s happening.
It seems to first promote the double to a complex number (appending a zero imaginary part) and then perform a full complex/complex division. This can be done more efficiently, in a way that saves a division. I pasted a simple program below that I wrote to isolate and test this.
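To spell out the algebra (just the textbook identity, nothing thrust-specific):

$$\frac{a}{c + di} = \frac{a(c - di)}{c^2 + d^2} = tc - td\,i, \qquad t = \frac{a}{c^2 + d^2},$$

so the real-by-complex case needs only a single real division, whereas a general $(a+bi)/(c+di)$ routine may spend more (e.g., one division per component, or extra scaling steps, depending on how it's written).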
Long story short: if I write a function by hand that does the operation without the extra divide, kernel launches doing just this division average 6.22 ms according to Nsight Compute. With thrust's division operator, they take 10.97 ms on average.
So one can simply overload the / operator for a bit of a speedup:
__device__ thrust::complex<double> operator/(double a, thrust::complex<double> b) {
    double divByDenominator = a / (b.real() * b.real() + b.imag() * b.imag());
    return thrust::complex<double>(b.real() * divByDenominator, -b.imag() * divByDenominator);
}
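(Since this is a non-template function matching (double, thrust::complex&lt;double&gt;) exactly, my understanding is that overload resolution prefers it over thrust's own templated operator, so existing x / y expressions pick it up without any other changes. One caveat: computing b.real() * b.real() + b.imag() * b.imag() directly can overflow or underflow for extreme magnitudes of b, so if thrust's complex/complex path does extra scaling for robustness, the slowdown may be the price of that.)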
I tried this with floats instead of doubles, and there it doesn't seem to make a measurable difference. This was on an RTX 2080 Super with CUDA 11.7. If it turns out to be the case in general and not just something on my end, maybe it's worth putting such an overload explicitly into the library, or maybe the compiler is just missing an optimization it shouldn't?
The code I used for testing is here:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/complex.h>
#define TESTSIZE (64 * 1048576)
#define THREADS_PER_BLOCK 128
#define NLAUNCHES 5
//divide the arrays using thrust standard operator
__global__ void divideWithThrust(double* x, thrust::complex<double>* y, thrust::complex<double>* z) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    z[i] = x[i] / y[i];
}

//divide the arrays by hand
__global__ void divideDZ(double* x, thrust::complex<double>* y, thrust::complex<double>* z) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    double divByDenominator = x[i] / (y[i].real() * y[i].real() + y[i].imag() * y[i].imag());
    z[i] = thrust::complex<double>(y[i].real() * divByDenominator, -y[i].imag() * divByDenominator);
}

//divide the arrays by explicitly turning the double into a complex double
__global__ void divideDZupcast(double* x, thrust::complex<double>* y, thrust::complex<double>* z) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    z[i] = thrust::complex<double>(x[i], 0) / y[i];
}

//float math for comparison
__global__ void divideWithThrustFloat(float* x, thrust::complex<float>* y, thrust::complex<float>* z) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    z[i] = x[i] / y[i];
}

//float by hand for comparison
__global__ void divideFC(float* x, thrust::complex<float>* y, thrust::complex<float>* z) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    float divByDenominator = x[i] / (y[i].real() * y[i].real() + y[i].imag() * y[i].imag());
    z[i] = thrust::complex<float>(y[i].real() * divByDenominator, -y[i].imag() * divByDenominator);
}

//fill arrays
__global__ void initArrays(double* x, thrust::complex<double>* y) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    x[i] = sin(0.1 * i);
    y[i] = thrust::complex<double>(cos(0.2 * i), sin(0.5 * i));
}

__global__ void initArraysFloat(float* x, thrust::complex<float>* y) {
    unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
    x[i] = sin(0.1 * i);
    y[i] = thrust::complex<float>(cos(0.2 * i), sin(0.5 * i));
}
int main()
{
    //first check with doubles
    double *x;
    thrust::complex<double> *y, *z;
    cudaMalloc(&x, TESTSIZE * sizeof(double));
    cudaMalloc(&y, TESTSIZE * sizeof(thrust::complex<double>));
    cudaMalloc(&z, TESTSIZE * sizeof(thrust::complex<double>));

    //divide by hand
    initArrays<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(x, y);
    for (int i = 0; i < NLAUNCHES; i++) {
        divideDZ<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(x, y, z);
    }

    //divide with thrust
    initArrays<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(x, y);
    for (int i = 0; i < NLAUNCHES; i++) {
        divideWithThrust<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(x, y, z);
    }

    //divide by turning the double into a complex double explicitly
    initArrays<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(x, y);
    for (int i = 0; i < NLAUNCHES; i++) {
        divideDZupcast<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(x, y, z);
    }
    cudaFree(x);
    cudaFree(y);
    cudaFree(z);

    //compare float division
    float *xf;
    thrust::complex<float> *yf, *zf;
    cudaMalloc(&xf, TESTSIZE * sizeof(float));
    cudaMalloc(&yf, TESTSIZE * sizeof(thrust::complex<float>));
    cudaMalloc(&zf, TESTSIZE * sizeof(thrust::complex<float>));

    initArraysFloat<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(xf, yf);
    for (int i = 0; i < NLAUNCHES; i++) {
        divideFC<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(xf, yf, zf);
    }
    initArraysFloat<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(xf, yf);
    for (int i = 0; i < NLAUNCHES; i++) {
        divideWithThrustFloat<<<TESTSIZE / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(xf, yf, zf);
    }
    cudaFree(xf);
    cudaFree(yf);
    cudaFree(zf);
    return 0;
}
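If anyone wants to double-check that the two code paths agree numerically, here's a rough sketch of a host-side comparison that could be bolted on before the cudaFree calls. It assumes a second output buffer filled by divideDZ while z is filled by divideWithThrust; checkAgreement is just an illustrative helper name, not part of the program above:

#include <cstdio>
#include <vector>

// Hypothetical helper: copy both device results to the host and compare
// them with a relative tolerance (the two formulas round differently,
// so bitwise equality would be too strict).
bool checkAgreement(thrust::complex<double>* dA, thrust::complex<double>* dB, size_t n) {
    std::vector<thrust::complex<double>> a(n), b(n);
    cudaMemcpy(a.data(), dA, n * sizeof(thrust::complex<double>), cudaMemcpyDeviceToHost);
    cudaMemcpy(b.data(), dB, n * sizeof(thrust::complex<double>), cudaMemcpyDeviceToHost);
    for (size_t i = 0; i < n; i++) {
        if (thrust::abs(a[i] - b[i]) > 1e-12 * thrust::abs(a[i])) {
            printf("mismatch at element %zu\n", i);
            return false;
        }
    }
    return true;
}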