With or without ‘-cl-mad-enable’, the results are approximately the same (at least as compared to the OpenCL–CUDA difference).

As for the kernels - here’s an example (it is the template, hence all these ${…}):

```
// Scale a complex value by a real factor (component-wise multiply).
${p.complex.name} complex_mul_scalar(${p.complex.name} a, ${p.scalar.name} b)
{
    ${p.scalar.name} re = a.x * b;
    ${p.scalar.name} im = a.y * b;
    return ${p.complex.ctr}(re, im);
}
// Complex product a * b:
//   Re = a.x*b.x - a.y*b.y,  Im = a.x*b.y + a.y*b.x,
// computed with mad() so the cross terms use fused multiply-add,
// exactly as in the original formulation.
${p.complex.name} complex_mul(${p.complex.name} a, ${p.complex.name} b)
{
    ${p.scalar.name} re = mad(-a.y, b.y, a.x * b.x);
    ${p.scalar.name} im = mad(a.y, b.x, a.x * b.y);
    return ${p.complex.ctr}(re, im);
}
// Squared modulus |a|^2 = Re(a)^2 + Im(a)^2 (no square root taken).
${p.scalar.name} squared_abs(${p.complex.name} a)
{
    ${p.scalar.name} re_sq = a.x * a.x;
    ${p.scalar.name} im_sq = a.y * a.y;
    return re_sq + im_sq;
}
// Complex exponential: exp(a.x + i*a.y) = exp(a.x) * (cos a.y + i sin a.y).
// native_cos/native_sin trade accuracy for speed, matching the original.
${p.complex.name} cexp(${p.complex.name} a)
{
    ${p.scalar.name} magnitude = exp(a.x);
    return ${p.complex.ctr}(
        magnitude * native_cos(a.y),
        magnitude * native_sin(a.y));
}
// Reads one texel of a 3D image whose integer pixel data holds raw float
// bits, and reinterprets those 32 bits as a float.
// i, j, k are unnormalized integer texel coordinates.
float get_float_from_image(read_only image3d_t image, int i, int j, int k)
{
    // Nearest-neighbour lookup with unnormalized coordinates; CLAMP keeps
    // out-of-range coordinates well-defined. Statically-declared samplers
    // should be const per the OpenCL spec.
    const sampler_t sampler = CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP |
        CLK_NORMALIZED_COORDS_FALSE;
    uint4 image_data = read_imageui(image, sampler,
        (int4)(i, j, k, 0));
    // as_float() is OpenCL's bit-cast builtin: it reinterprets the stored
    // 32-bit pattern as an IEEE float without the pointer-cast aliasing
    // hazard of the previous *((float*)&image_data).
    return as_float(image_data.x);
}
// Declares the per-work-item 3D global IDs (i, j, k) and the flat array
// index. Grid extents along x and y are powers of two, so the linear
// index  k*nvx*nvy + j*nvx + i  is built with shifts:
//   index = (k << (nvx_pow + nvy_pow)) + (j << nvx_pow) + i.
#define DEFINE_INDEXES int i = get_global_id(0), j = get_global_id(1), k = get_global_id(2), index = (k << ${c.nvx_pow + c.nvy_pow}) + (j << ${c.nvx_pow}) + i
// Propagates state vector in x-space for evolution calculation
// Elementwise kernel: each work-item advances one grid point of the
// two-component state (aa, bb) by a time step dt of the x-space
// (nonlinear + potential) part, using a fixed-point iteration toward the
// midpoint (c.itmax passes, unrolled by the Mako %for).
__kernel void propagateXSpaceTwoComponent(__global ${p.complex.name} *aa,
__global ${p.complex.name} *bb, ${p.scalar.name} dt,
read_only image3d_t potentials)
{
DEFINE_INDEXES;
// External potential at this grid point; k % nvz suggests the potential
// image is replicated along an extra (ensemble?) dimension — TODO confirm.
${p.scalar.name} V = get_float_from_image(potentials, i, j, k % ${c.nvz});
${p.complex.name} a = aa[index];
${p.complex.name} b = bb[index];
//store initial x-space field
${p.complex.name} a0 = a;
${p.complex.name} b0 = b;
// pa/pb: per-component log-derivatives; da/db: their exponentiated
// half-step propagation factors.
${p.complex.name} pa, pb, da = ${p.complex.ctr}(0, 0), db = ${p.complex.ctr}(0, 0);
${p.scalar.name} n_a, n_b;
//iterate to midpoint solution
%for iter in range(c.itmax):
// Densities |a|^2, |b|^2 at the current iterate.
n_a = squared_abs(a);
n_b = squared_abs(b);
// TODO: there must be no minus sign before imaginary part,
// but without it the whole thing diverges
// Real part: loss terms (l111, l12 / l22, l12); imaginary part: potential
// plus density-dependent terms (g11, g12, g22) — presumably mean-field
// interaction — and, for b, the detuning offset.
pa = ${p.complex.ctr}(
-(${c.l111} * n_a * n_a + ${c.l12} * n_b) / 2,
-(-V - ${c.g11} * n_a - ${c.g12} * n_b));
pb = ${p.complex.ctr}(
-(${c.l22} * n_b + ${c.l12} * n_a) / 2,
-(-V - ${c.g22} * n_b - ${c.g12} * n_a + ${c.detuning}));
// calculate midpoint log derivative and exponentiate
da = cexp(complex_mul_scalar(pa, (dt / 2)));
db = cexp(complex_mul_scalar(pb, (dt / 2)));
//propagate to midpoint using log derivative
a = complex_mul(a0, da);
b = complex_mul(b0, db);
%endfor
//propagate to endpoint using log derivative
// Applying the half-step factors da, db a second time takes the stored
// initial field through the full dt step (a0 * da^2, b0 * db^2).
aa[index] = complex_mul(a, da);
bb[index] = complex_mul(b, db);
}
```

It is part of this program, in the file evolution.py; it was ported to OpenCL a few days ago. As you can see, there is no optimization magic at all — just an elementwise kernel. I tried to get rid of reading from the image (used an in-place calculation instead), but it did not change the performance.

Another example is my pyfft module, which has performance tests for both OpenCL and CUDA and uses the same kernels for both. They are pretty big, and located in the file pyfft/kernel.mako (it is a template too).

And, according to the profiler, Python overhead does not seem to play any significant role. PyCUDA and PyOpenCL are very thin and fast wrappers around the corresponding APIs.