Thanks so much txbob. Ok, so I submitted a really simple one here:

https://developer.nvidia.com/nvidia_bug/1951805

The OS is Mac. Now I need to add the test kernel as a comment, but can’t do it. I think that’s where the error comes from. Can you add this to it as a comment please? Thank you.

#if defined(cl_amd_fp64)

#pragma OPENCL EXTENSION cl_amd_fp64 : enable

#endif

#if defined(cl_khr_fp64)

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

#endif

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable

typedef long intPrec;

typedef uint atomi;

typedef double real_t;

typedef float real_bucket_t;

typedef double2 real2;

typedef double3 real3;

typedef double4 real4;

typedef float4 real4_bucket;

#define EPS (DBL_EPSILON)

#define TLOW (DBL_MIN)

#define TMAX (DBL_MAX)

typedef long int int64;

typedef unsigned long int uint64;

#define EPS6 ((1e-6))

#define NTHREADS 256u

#define THREADS_PER_WARP 32u

#define NWARPS (NTHREADS / THREADS_PER_WARP)

#define COLORMAP_LENGTH 256u

#define COLORMAP_LENGTH_MINUS_1 255u

#define DE_THRESH 100u

#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))

#define SQR(x) ((x) * (x))

#define CUBE(x) ((x) * (x) * (x))

#define M_2PI (M_PI * 2)

#define M_3PI (M_PI * 3)

#define SQRT5 2.2360679774997896964091736687313

#define M_PHI 1.61803398874989484820458683436563

#define DEG_2_RAD (M_PI / 180)

//Index in each dimension of a thread within a block.

#define THREAD_ID_X (get_local_id(0))

#define THREAD_ID_Y (get_local_id(1))

#define THREAD_ID_Z (get_local_id(2))

//Index in each dimension of a block within a grid.

#define BLOCK_ID_X (get_group_id(0))

#define BLOCK_ID_Y (get_group_id(1))

#define BLOCK_ID_Z (get_group_id(2))

//Absolute index in each dimension of a thread within a grid.

#define GLOBAL_ID_X (get_global_id(0))

#define GLOBAL_ID_Y (get_global_id(1))

#define GLOBAL_ID_Z (get_global_id(2))

//Dimensions of a block.

#define BLOCK_SIZE_X (get_local_size(0))

#define BLOCK_SIZE_Y (get_local_size(1))

#define BLOCK_SIZE_Z (get_local_size(2))

//Dimensions of a grid, in terms of blocks.

#define GRID_SIZE_X (get_num_groups(0))

#define GRID_SIZE_Y (get_num_groups(1))

#define GRID_SIZE_Z (get_num_groups(2))

//Dimensions of a grid, in terms of threads.

#define GLOBAL_SIZE_X (get_global_size(0))

#define GLOBAL_SIZE_Y (get_global_size(1))

#define GLOBAL_SIZE_Z (get_global_size(2))

#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)

#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)

#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)

#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)

enum { MWC64X_A = 4294883355u };

inline uint MwcNext(uint2* s)

{

uint res = (*s).x ^ (*s).y;

uint hi = mul_hi((*s).x, MWC64X_A);

(*s).x = (*s).x * MWC64X_A + (*s).y;

(*s).y = hi + ((*s).x < (*s).y);

return res;

}

inline uint MwcNextRange(uint2* s, uint val)

{

return (val == 0) ? MwcNext(s) : (MwcNext(s) % val);

}

inline real_t MwcNext01(uint2* s)

{

return MwcNext(s) * (1.0 / 4294967296.0);

}

inline real_t MwcNextNeg1Pos1(uint2* s)

{

real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;

return -1.0 + (f * 2.0);

}

inline real_t MwcNext0505(uint2* s)

{

real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;

return -0.5 + f;

}

typedef struct **attribute** ((aligned (16))) _Point

{

real_t m_X;

real_t m_Y;

real_t m_Z;

real_t m_ColorX;

uint m_LastXfUsed;

} Point;

typedef struct **attribute** ((aligned (16))) _XformCL

{

real_t m_A, m_B, m_C, m_D, m_E, m_F;

real_t m_VariationWeights[8];

real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;

real_t m_DirectColor;

real_t m_ColorSpeedCache;

real_t m_OneMinusColorCache;

real_t m_Opacity;

} XformCL;

typedef struct **attribute** ((aligned (16))) _EmberCL

{

real_t m_CamZPos;

real_t m_CamPerspective;

real_t m_CamYaw;

real_t m_CamPitch;

real_t m_CamDepthBlur;

real_t m_BlurCoef;

real_t m_C00;

real_t m_C01;

real_t m_C02;

real_t m_C10;

real_t m_C11;

real_t m_C12;

real_t m_C20;

real_t m_C21;

real_t m_C22;

real_t m_CenterX, m_CenterY;

real_t m_RotA, m_RotB, m_RotD, m_RotE;

} EmberCL;

typedef union

{

uchar3 m_Uchar3;

uchar m_Uchars[3];

} uchar3uchars;

typedef union

{

uchar4 m_Uchar4;

uchar m_Uchars[4];

} uchar4uchars;

typedef union

{

uint4 m_Uint4;

uint m_Uints[4];

} uint4uints;

typedef union

{

float4 m_Float4;

float m_Floats[4];

} float4floats;

typedef union

{

real4 m_Real4;

real_t m_Reals[4];

} real4reals;

typedef union

{

real4_bucket m_Real4;

real_bucket_t m_Reals[4];

} real4reals_bucket;

typedef struct **attribute** ((aligned (16))) _CarToRasCL

{

real_t m_PixPerImageUnitW, m_RasLlX;

uint m_RasWidth;

real_t m_PixPerImageUnitH, m_RasLlY;

real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;

} CarToRasCL;

typedef struct **attribute** ((aligned (16))) _VariationState

{

} VariationState;

void Xform0(__constant XformCL* xform, __constant real_t* parVars, __global real_t* globalShared, Point* inPoint, Point* outPoint, uint2* mwc, VariationState* varState)

{

real_t transX, transY, transZ;

real4 vIn, vOut = 0.0;

real_t tempColor = outPoint->m_ColorX = xform->m_ColorSpeedCache + (xform->m_OneMinusColorCache * inPoint->m_ColorX);

transX = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;

transY = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;

transZ = inPoint->m_Z;

outPoint->m_X = 0;

outPoint->m_Y = 0;

outPoint->m_Z = 0;

//Apply each of the 1 regular variations in this xform.

vIn.x = transX;

vIn.y = transY;

vIn.z = transZ;

//cosine.

{

real_t a = vIn.x * M_PI;

real_t nx = cos(a) * cosh(vIn.y);

real_t ny = -sin(a) * sinh(vIn.y);

vOut.x = xform->m_VariationWeights[0] * nx;

vOut.y = xform->m_VariationWeights[0] * ny;

vOut.z = 0;

}

outPoint->m_X += vOut.x;

outPoint->m_Y += vOut.y;

outPoint->m_Z += vOut.z;

outPoint->m_ColorX = tempColor + xform->m_DirectColor * (outPoint->m_ColorX - tempColor);

if (isnan(outPoint->m_ColorX))

outPoint->m_ColorX = 0.0;

}

__kernel void IterateKernel(

uint iterCount,

uint fuseCount,

__global uint2* seeds,

__constant EmberCL* ember,

__constant XformCL* xforms,

__constant real_t* parVars,

__global real_t* globalShared,

__global uchar* xformDistributions,

__constant CarToRasCL* carToRas,

__global real4reals_bucket* histogram,

uint histSize,

__read_only image2d_t palette,

__global Point* points

)

{

bool fuse, ok;

uint threadIndex = INDEX_IN_BLOCK_2D;

uint pointsIndex = INDEX_IN_GRID_2D;

uint i, itersToDo;

uint consec = 0;

uint histIndex;

real_t p00, p01;

Point firstPoint, secondPoint, tempPoint;

uint2 mwc = seeds[pointsIndex];

float4 palColor1;

int2 iPaletteCoord;

const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |

CLK_ADDRESS_CLAMP_TO_EDGE |

CLK_FILTER_NEAREST;

uint threadXY = (THREAD_ID_X + THREAD_ID_Y);

uint threadXDivRows = (THREAD_ID_X / NWARPS);

uint threadsMinus1 = NTHREADS - 1;

VariationState varState;

__local Point swap[NTHREADS];

__local uint xfsel[NWARPS];

if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)

xfsel[THREAD_ID_X] = MwcNext(&mwc) & 16383;

barrier(CLK_LOCAL_MEM_FENCE);

Xform0(&(xforms[0]), parVars, globalShared, &firstPoint, &secondPoint, &mwc, &varState);

}