Possible compiler bug in Mac NVIDIA Web Driver 346.03.15f08 on OSX 10.11.6

DariusUndertow · June 27, 2017, 12:21am

A tester of mine is getting a compilation error with this kernel when the typedef real_t is defined as double, however it compiles fine when it’s defined as float.

The error is on this line:

real_t a = vIn.x * M_PI;

and it says:

ptxas error : Program using constant pointers passed as entry function parameter cannot use cvta.const

This is very weird because it compiles fine on Windows and Linux, and used to compile fine on Mac until he upgraded to the version listed above.

Please see the kernel below. If this is the wrong place to submit these bugs, can you please direct me to the correct place? Thank you.

#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
typedef long intPrec;
typedef uint atomi;
typedef double real_t;
typedef float real_bucket_t;
typedef double2 real2;
typedef double3 real3;
typedef double4 real4;
typedef float4 real4_bucket;
#define EPS (DBL_EPSILON)
#define TLOW (DBL_MIN)
#define TMAX (DBL_MAX)
typedef long int int64;
typedef unsigned long int uint64;

#define EPS6 ((1e-6))

//The number of threads per block used in the iteration function. Don’t change
//it lightly; the block size is hard coded to be exactly 32 x 8.
#define NTHREADS 256u
#define THREADS_PER_WARP 32u
#define NWARPS (NTHREADS / THREADS_PER_WARP)
#define COLORMAP_LENGTH 256u
#define COLORMAP_LENGTH_MINUS_1 255u
#define DE_THRESH 100u
#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))
#define SQR(x) ((x) * (x))
#define CUBE(x) ((x) * (x) * (x))
#define M_2PI (M_PI * 2)
#define M_3PI (M_PI * 3)
#define SQRT5 2.2360679774997896964091736687313
#define M_PHI 1.61803398874989484820458683436563
#define DEG_2_RAD (M_PI / 180)

//Index in each dimension of a thread within a block.
#define THREAD_ID_X (get_local_id(0))
#define THREAD_ID_Y (get_local_id(1))
#define THREAD_ID_Z (get_local_id(2))

//Index in each dimension of a block within a grid.
#define BLOCK_ID_X (get_group_id(0))
#define BLOCK_ID_Y (get_group_id(1))
#define BLOCK_ID_Z (get_group_id(2))

//Absolute index in each dimension of a thread within a grid.
#define GLOBAL_ID_X (get_global_id(0))
#define GLOBAL_ID_Y (get_global_id(1))
#define GLOBAL_ID_Z (get_global_id(2))

//Dimensions of a block.
#define BLOCK_SIZE_X (get_local_size(0))
#define BLOCK_SIZE_Y (get_local_size(1))
#define BLOCK_SIZE_Z (get_local_size(2))

//Dimensions of a grid, in terms of blocks.
#define GRID_SIZE_X (get_num_groups(0))
#define GRID_SIZE_Y (get_num_groups(1))
#define GRID_SIZE_Z (get_num_groups(2))

//Dimensions of a grid, in terms of threads.
#define GLOBAL_SIZE_X (get_global_size(0))
#define GLOBAL_SIZE_Y (get_global_size(1))
#define GLOBAL_SIZE_Z (get_global_size(2))

#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)
#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)

#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)
#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)

enum { MWC64X_A = 4294883355u };

inline uint MwcNext(uint2* s)
{
uint res = (*s).x ^ (*s).y;
uint hi = mul_hi((*s).x, MWC64X_A);
(*s).x = (*s).x * MWC64X_A + (*s).y;
(*s).y = hi + ((*s).x < (*s).y);
return res;
}

inline uint MwcNextRange(uint2* s, uint val)
{
return (val == 0) ? MwcNext(s) : (MwcNext(s) % val);
}

inline real_t MwcNext01(uint2* s)
{
return MwcNext(s) * (1.0 / 4294967296.0);
}

inline real_t MwcNextNeg1Pos1(uint2* s)
{
real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;
return -1.0 + (f * 2.0);
}

inline real_t MwcNext0505(uint2* s)
{
real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;
return -0.5 + f;
}

typedef struct attribute ((aligned (16))) _Point
{
real_t m_X;
real_t m_Y;
real_t m_Z;
real_t m_ColorX;
uint m_LastXfUsed;
} Point;

typedef struct attribute ((aligned (16))) _XformCL
{
real_t m_A, m_B, m_C, m_D, m_E, m_F;
real_t m_VariationWeights[8];
real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;
real_t m_DirectColor;
real_t m_ColorSpeedCache;
real_t m_OneMinusColorCache;
real_t m_Opacity;
} XformCL;

typedef struct attribute ((aligned (16))) _EmberCL
{
real_t m_CamZPos;
real_t m_CamPerspective;
real_t m_CamYaw;
real_t m_CamPitch;
real_t m_CamDepthBlur;
real_t m_BlurCoef;
real_t m_C00;
real_t m_C01;
real_t m_C02;
real_t m_C10;
real_t m_C11;
real_t m_C12;
real_t m_C20;
real_t m_C21;
real_t m_C22;
real_t m_CenterX, m_CenterY;
real_t m_RotA, m_RotB, m_RotD, m_RotE;
} EmberCL;

typedef union
{
uchar3 m_Uchar3;
uchar m_Uchars[3];
} uchar3uchars;

typedef union
{
uchar4 m_Uchar4;
uchar m_Uchars[4];
} uchar4uchars;

typedef union
{
uint4 m_Uint4;
uint m_Uints[4];
} uint4uints;

typedef union
{
float4 m_Float4;
float m_Floats[4];
} float4floats;

typedef union
{
real4 m_Real4;
real_t m_Reals[4];
} real4reals;

typedef union
{
real4_bucket m_Real4;
real_bucket_t m_Reals[4];
} real4reals_bucket;

typedef struct attribute ((aligned (16))) _CarToRasCL
{
real_t m_PixPerImageUnitW, m_RasLlX;
uint m_RasWidth;
real_t m_PixPerImageUnitH, m_RasLlY;
real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;
} CarToRasCL;

inline void CarToRasConvertPointToSingle(__constant CarToRasCL* carToRas, Point* point, uint* singleBufferIndex)
{
*singleBufferIndex = (uint)(carToRas->m_PixPerImageUnitW * point->m_X - carToRas->m_RasLlX) + (carToRas->m_RasWidth * (uint)(carToRas->m_PixPerImageUnitH * point->m_Y - carToRas->m_RasLlY));
}

inline bool CarToRasInBounds(__constant CarToRasCL* carToRas, Point* point)
{
return point->m_X >= carToRas->m_CarLlX &&
point->m_X < carToRas->m_CarUrX &&
point->m_Y < carToRas->m_CarUrY &&
point->m_Y >= carToRas->m_CarLlY;
}

typedef struct attribute ((aligned (16))) _VariationState
{
} VariationState;

void Xform0(__constant XformCL* xform, __constant real_t* parVars, __global real_t* globalShared, Point* inPoint, Point* outPoint, uint2* mwc, VariationState* varState)
{
real_t transX, transY, transZ;
real4 vIn, vOut = 0.0;
real_t tempColor = outPoint->m_ColorX = xform->m_ColorSpeedCache + (xform->m_OneMinusColorCache * inPoint->m_ColorX);

transX = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;
transY = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;
transZ = inPoint->m_Z;

outPoint->m_X = 0;
outPoint->m_Y = 0;
outPoint->m_Z = 0;

//Apply each of the 1 regular variations in this xform.

vIn.x = transX;
vIn.y = transY;
vIn.z = transZ;

//cosine.
{
	real_t a = vIn.x * M_PI;
	real_t nx = cos(a) * cosh(vIn.y);
	real_t ny = -sin(a) * sinh(vIn.y);

	vOut.x = xform->m_VariationWeights[0] * nx;
	vOut.y = xform->m_VariationWeights[0] * ny;
	vOut.z = 0;
}

outPoint->m_X += vOut.x;
outPoint->m_Y += vOut.y;
outPoint->m_Z += vOut.z;
outPoint->m_ColorX = tempColor + xform->m_DirectColor * (outPoint->m_ColorX - tempColor);

if (isnan(outPoint->m_ColorX))
	outPoint->m_ColorX = 0.0;

}

__kernel void IterateKernel(
uint iterCount,
uint fuseCount,
__global uint2* seeds,
__constant EmberCL* ember,
__constant XformCL* xforms,
__constant real_t* parVars,
__global real_t* globalShared,
__global uchar* xformDistributions,
__constant CarToRasCL* carToRas,
__global real4reals_bucket* histogram,
uint histSize,
__read_only image2d_t palette,
__global Point* points
)
{
bool fuse, ok;
uint threadIndex = INDEX_IN_BLOCK_2D;
uint pointsIndex = INDEX_IN_GRID_2D;
uint i, itersToDo;
uint consec = 0;
uint histIndex;
real_t p00, p01;
Point firstPoint, secondPoint, tempPoint;
uint2 mwc = seeds[pointsIndex];
float4 palColor1;
int2 iPaletteCoord;
const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST;
uint threadXY = (THREAD_ID_X + THREAD_ID_Y);
uint threadXDivRows = (THREAD_ID_X / NWARPS);
uint threadsMinus1 = NTHREADS - 1;
VariationState varState;

__local Point swap[NTHREADS];
__local uint xfsel[NWARPS];

iPaletteCoord.y = 0;

if (fuseCount > 0)
{
	fuse = true;
	itersToDo = fuseCount;
	firstPoint.m_X = MwcNextNeg1Pos1(&mwc);
	firstPoint.m_X = MwcNextNeg1Pos1(&mwc);
	firstPoint.m_Y = MwcNextNeg1Pos1(&mwc);
	firstPoint.m_Z = 0.0;
	firstPoint.m_ColorX = MwcNext01(&mwc);
	firstPoint.m_LastXfUsed = 0 - 1;
}
else
{
	fuse = false;
	itersToDo = iterCount;
	firstPoint = points[pointsIndex];
}

if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)
	xfsel[THREAD_ID_X] = MwcNext(&mwc) & 16383;

barrier(CLK_LOCAL_MEM_FENCE);

for (i = 0; i < itersToDo; i++)
{
	consec = 0;

	do
	{
		secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];

		switch (secondPoint.m_LastXfUsed)
		{
			case 0:
			{
				Xform0(&(xforms[0]), parVars, globalShared, &firstPoint, &secondPoint, &mwc, &varState);
				break;
			}
		}

		ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);

		if (!ok)
		{
			firstPoint.m_X = MwcNextNeg1Pos1(&mwc);
			firstPoint.m_Y = MwcNextNeg1Pos1(&mwc);
			firstPoint.m_Z = 0.0;
			firstPoint.m_ColorX = secondPoint.m_ColorX;
			consec++;
		}
	}
	while (!ok && consec < 5);

	if (!ok)
	{
		secondPoint.m_X = MwcNextNeg1Pos1(&mwc);
		secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);
		secondPoint.m_Z = 0.0;
	}

	uint swr = threadXY + ((i & 1u) * threadXDivRows);
	uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;

	swap[sw] = secondPoint;

	if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)
		xfsel[THREAD_ID_X] = MwcNext(&mwc) & 16383;

	barrier(CLK_LOCAL_MEM_FENCE);
	firstPoint = swap[threadIndex];

	if (fuse)
	{
		if (i >= fuseCount - 1)
		{
			i = 0;
			fuse = false;
			itersToDo = iterCount;
			barrier(CLK_LOCAL_MEM_FENCE);
		}

		continue;
	}

	p00 = secondPoint.m_X - ember->m_CenterX;
	p01 = secondPoint.m_Y - ember->m_CenterY;
	tempPoint.m_X = (p00 * ember->m_RotA) + (p01 * ember->m_RotB) + ember->m_CenterX;
	tempPoint.m_Y = (p00 * ember->m_RotD) + (p01 * ember->m_RotE) + ember->m_CenterY;

	if (CarToRasInBounds(carToRas, &tempPoint))
	{
		CarToRasConvertPointToSingle(carToRas, &tempPoint, &histIndex);

		if (histIndex < histSize)
		{
			real_t colorIndexFrac;
			real_t colorIndex = secondPoint.m_ColorX * COLORMAP_LENGTH_MINUS_1;
			int intColorIndex = (int)colorIndex;
			float4 palColor2;

			if (intColorIndex < 0)
			{
				intColorIndex = 0;
				colorIndexFrac = 0;
			}
			else if (intColorIndex >= COLORMAP_LENGTH_MINUS_1)
			{
				intColorIndex = COLORMAP_LENGTH_MINUS_1 - 1;
				colorIndexFrac = 1.0;
			}
			else
			{
				colorIndexFrac = colorIndex - (real_t)intColorIndex;
			}

			iPaletteCoord.x = intColorIndex;
			palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);
			iPaletteCoord.x += 1;
			palColor2 = read_imagef(palette, paletteSampler, iPaletteCoord);
			palColor1 = (palColor1 * (1.0f - (float)colorIndexFrac)) + (palColor2 * (float)colorIndexFrac);
			histogram[histIndex].m_Real4 += (palColor1 * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_Opacity);
		}
	}

	barrier(CLK_GLOBAL_MEM_FENCE);
}

points[pointsIndex] = firstPoint;
seeds[pointsIndex] = mwc;
barrier(CLK_GLOBAL_MEM_FENCE);

}

Robert_Crovella · June 27, 2017, 2:06am

I would suggest registering as a developer at developer.nvidia.com

Then, once your registration is complete, you can file a bug there. There is a bug submission link in the registered developer area.

DariusUndertow · June 27, 2017, 11:42pm

Thanks txbob, but now when I submit the bug there, I get an ajax error saying:

An AJAX HTTP error occurred.
HTTP Result Code: 403
Debugging information follows.
Path: /system/ajax
StatusText: Forbidden
ResponseText:
403 - Forbidden
403 - Forbidden

Is there a problem with their bug reporting system?