Thanks so much txbob. Ok, so I submitted a really simple one here:
https://developer.nvidia.com/nvidia_bug/1951805
The OS is Mac. Now I need to add the test kernel as a comment, but can’t do it. I think that’s where the error comes from. Can you add this to it as a comment please? Thank you.
#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
typedef long intPrec;
typedef uint atomi;
typedef double real_t;
typedef float real_bucket_t;
typedef double2 real2;
typedef double3 real3;
typedef double4 real4;
typedef float4 real4_bucket;
#define EPS (DBL_EPSILON)
#define TLOW (DBL_MIN)
#define TMAX (DBL_MAX)
typedef long int int64;
typedef unsigned long int uint64;
#define EPS6 ((1e-6))
#define NTHREADS 256u
#define THREADS_PER_WARP 32u
#define NWARPS (NTHREADS / THREADS_PER_WARP)
#define COLORMAP_LENGTH 256u
#define COLORMAP_LENGTH_MINUS_1 255u
#define DE_THRESH 100u
#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))
#define SQR(x) ((x) * (x))
#define CUBE(x) ((x) * (x) * (x))
#define M_2PI (M_PI * 2)
#define M_3PI (M_PI * 3)
#define SQRT5 2.2360679774997896964091736687313
#define M_PHI 1.61803398874989484820458683436563
#define DEG_2_RAD (M_PI / 180)
//Index in each dimension of a thread within a block.
#define THREAD_ID_X (get_local_id(0))
#define THREAD_ID_Y (get_local_id(1))
#define THREAD_ID_Z (get_local_id(2))
//Index in each dimension of a block within a grid.
#define BLOCK_ID_X (get_group_id(0))
#define BLOCK_ID_Y (get_group_id(1))
#define BLOCK_ID_Z (get_group_id(2))
//Absolute index in each dimension of a thread within a grid.
#define GLOBAL_ID_X (get_global_id(0))
#define GLOBAL_ID_Y (get_global_id(1))
#define GLOBAL_ID_Z (get_global_id(2))
//Dimensions of a block.
#define BLOCK_SIZE_X (get_local_size(0))
#define BLOCK_SIZE_Y (get_local_size(1))
#define BLOCK_SIZE_Z (get_local_size(2))
//Dimensions of a grid, in terms of blocks.
#define GRID_SIZE_X (get_num_groups(0))
#define GRID_SIZE_Y (get_num_groups(1))
#define GRID_SIZE_Z (get_num_groups(2))
//Dimensions of a grid, in terms of threads.
#define GLOBAL_SIZE_X (get_global_size(0))
#define GLOBAL_SIZE_Y (get_global_size(1))
#define GLOBAL_SIZE_Z (get_global_size(2))
#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)
#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)
#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)
#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)
enum { MWC64X_A = 4294883355u };
inline uint MwcNext(uint2* s)
{
uint res = (*s).x ^ (*s).y;
uint hi = mul_hi((*s).x, MWC64X_A);
(*s).x = (*s).x * MWC64X_A + (*s).y;
(*s).y = hi + ((*s).x < (*s).y);
return res;
}
inline uint MwcNextRange(uint2* s, uint val)
{
return (val == 0) ? MwcNext(s) : (MwcNext(s) % val);
}
inline real_t MwcNext01(uint2* s)
{
return MwcNext(s) * (1.0 / 4294967296.0);
}
inline real_t MwcNextNeg1Pos1(uint2* s)
{
real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;
return -1.0 + (f * 2.0);
}
inline real_t MwcNext0505(uint2* s)
{
real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;
return -0.5 + f;
}
typedef struct attribute ((aligned (16))) _Point
{
real_t m_X;
real_t m_Y;
real_t m_Z;
real_t m_ColorX;
uint m_LastXfUsed;
} Point;
typedef struct attribute ((aligned (16))) _XformCL
{
real_t m_A, m_B, m_C, m_D, m_E, m_F;
real_t m_VariationWeights[8];
real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;
real_t m_DirectColor;
real_t m_ColorSpeedCache;
real_t m_OneMinusColorCache;
real_t m_Opacity;
} XformCL;
typedef struct attribute ((aligned (16))) _EmberCL
{
real_t m_CamZPos;
real_t m_CamPerspective;
real_t m_CamYaw;
real_t m_CamPitch;
real_t m_CamDepthBlur;
real_t m_BlurCoef;
real_t m_C00;
real_t m_C01;
real_t m_C02;
real_t m_C10;
real_t m_C11;
real_t m_C12;
real_t m_C20;
real_t m_C21;
real_t m_C22;
real_t m_CenterX, m_CenterY;
real_t m_RotA, m_RotB, m_RotD, m_RotE;
} EmberCL;
typedef union
{
uchar3 m_Uchar3;
uchar m_Uchars[3];
} uchar3uchars;
typedef union
{
uchar4 m_Uchar4;
uchar m_Uchars[4];
} uchar4uchars;
typedef union
{
uint4 m_Uint4;
uint m_Uints[4];
} uint4uints;
typedef union
{
float4 m_Float4;
float m_Floats[4];
} float4floats;
typedef union
{
real4 m_Real4;
real_t m_Reals[4];
} real4reals;
typedef union
{
real4_bucket m_Real4;
real_bucket_t m_Reals[4];
} real4reals_bucket;
typedef struct attribute ((aligned (16))) _CarToRasCL
{
real_t m_PixPerImageUnitW, m_RasLlX;
uint m_RasWidth;
real_t m_PixPerImageUnitH, m_RasLlY;
real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;
} CarToRasCL;
typedef struct attribute ((aligned (16))) _VariationState
{
} VariationState;
void Xform0(__constant XformCL* xform, __constant real_t* parVars, __global real_t* globalShared, Point* inPoint, Point* outPoint, uint2* mwc, VariationState* varState)
{
real_t transX, transY, transZ;
real4 vIn, vOut = 0.0;
real_t tempColor = outPoint->m_ColorX = xform->m_ColorSpeedCache + (xform->m_OneMinusColorCache * inPoint->m_ColorX);
transX = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;
transY = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;
transZ = inPoint->m_Z;
outPoint->m_X = 0;
outPoint->m_Y = 0;
outPoint->m_Z = 0;
//Apply each of the 1 regular variations in this xform.
vIn.x = transX;
vIn.y = transY;
vIn.z = transZ;
//cosine.
{
real_t a = vIn.x * M_PI;
real_t nx = cos(a) * cosh(vIn.y);
real_t ny = -sin(a) * sinh(vIn.y);
vOut.x = xform->m_VariationWeights[0] * nx;
vOut.y = xform->m_VariationWeights[0] * ny;
vOut.z = 0;
}
outPoint->m_X += vOut.x;
outPoint->m_Y += vOut.y;
outPoint->m_Z += vOut.z;
outPoint->m_ColorX = tempColor + xform->m_DirectColor * (outPoint->m_ColorX - tempColor);
if (isnan(outPoint->m_ColorX))
outPoint->m_ColorX = 0.0;
}
__kernel void IterateKernel(
uint iterCount,
uint fuseCount,
__global uint2* seeds,
__constant EmberCL* ember,
__constant XformCL* xforms,
__constant real_t* parVars,
__global real_t* globalShared,
__global uchar* xformDistributions,
__constant CarToRasCL* carToRas,
__global real4reals_bucket* histogram,
uint histSize,
__read_only image2d_t palette,
__global Point* points
)
{
bool fuse, ok;
uint threadIndex = INDEX_IN_BLOCK_2D;
uint pointsIndex = INDEX_IN_GRID_2D;
uint i, itersToDo;
uint consec = 0;
uint histIndex;
real_t p00, p01;
Point firstPoint, secondPoint, tempPoint;
uint2 mwc = seeds[pointsIndex];
float4 palColor1;
int2 iPaletteCoord;
const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST;
uint threadXY = (THREAD_ID_X + THREAD_ID_Y);
uint threadXDivRows = (THREAD_ID_X / NWARPS);
uint threadsMinus1 = NTHREADS - 1;
VariationState varState;
__local Point swap[NTHREADS];
__local uint xfsel[NWARPS];
if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)
xfsel[THREAD_ID_X] = MwcNext(&mwc) & 16383;
barrier(CLK_LOCAL_MEM_FENCE);
Xform0(&(xforms[0]), parVars, globalShared, &firstPoint, &secondPoint, &mwc, &varState);
}