I have the following nasty piece of code — a C string holding OpenCL source that is built up from heavily nested macros:
/*
 * OpenCL program source for an ALU-throughput micro-benchmark.
 *
 * Structure of the embedded source:
 *  - CALC_MAD_16 / CALC_ADD_16 / CALC_MUL_16 each emit 16 dependent
 *    arithmetic statements on two accumulators.
 *  - The _256 variants expand the _16 macro 16 times, and the _4096
 *    variants expand the _256 macro 16 times, so one instantiation of a
 *    _4096 macro yields 4096 statements.
 *  - CALC_FUNC stamps out a __kernel that runs the chosen 4096-statement
 *    macro inside a 1024-iteration loop (~4M operations per work item),
 *    then stores the result into a __local variable so the work is not
 *    dead-code-eliminated.
 *
 * NOTE(review): the expansion size (4096 statements per loop body,
 * instantiated for nine kernels) is most likely what overflows the
 * OpenCL compiler's own stack at build time — a compiler capacity limit
 * rather than a semantic error in this source. The usual workaround is
 * to reduce the unroll factor (e.g. use the _256 macros and raise the
 * loop trip count to compensate) — TODO confirm against the target
 * toolchain.
 *
 * Fixes relative to the original:
 *  - Declared const: the pointer targets a string literal; writing
 *    through a non-const char * to a literal is undefined behavior, and
 *    clCreateProgramWithSource() takes const char ** anyway.
 *  - The source now ends in "\n": C (and OpenCL C) translation requires
 *    source text to end in a newline, and some front ends reject or
 *    mis-handle source that does not.
 */
const char *clSource =
/* fp64 support is opt-in via a -DDOUBLES build option. */
"#ifdef DOUBLES\n"
" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
""
/* 16 dependent multiply-adds; each macro body must stay on one
 * preprocessor line, hence the single trailing \n per definition. */
"#define CALC_MAD_16(a, b)"
" a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
" a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
" a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
" a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;\n"
""
"#define CALC_MAD_256(a, b)"
" CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)"
" CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)"
" CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)"
" CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)\n"
""
"#define CALC_MAD_4096(a, b)"
" CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)"
" CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)"
" CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)"
" CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)\n"
""
/* Same pyramid for pure additions. */
"#define CALC_ADD_16(a, b)"
" a = a+b; b = a+b; a = a+b; b = a+b;"
" a = a+b; b = a+b; a = a+b; b = a+b;"
" a = a+b; b = a+b; a = a+b; b = a+b;"
" a = a+b; b = a+b; a = a+b; b = a+b;\n"
""
"#define CALC_ADD_256(a, b)"
" CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)"
" CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)"
" CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)"
" CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)\n"
""
"#define CALC_ADD_4096(a, b)"
" CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)"
" CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)"
" CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)"
" CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)\n"
""
/* Same pyramid for pure multiplications. */
"#define CALC_MUL_16(a, b)"
" a = a*b; b = a*b; a = a*b; b = a*b;"
" a = a*b; b = a*b; a = a*b; b = a*b;"
" a = a*b; b = a*b; a = a*b; b = a*b;"
" a = a*b; b = a*b; a = a*b; b = a*b;\n"
""
"#define CALC_MUL_256(a, b)"
" CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)"
" CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)"
" CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)"
" CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)\n"
""
"#define CALC_MUL_4096(a, b)"
" CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)"
" CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)"
" CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)"
" CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)\n"
""
/* Kernel template: func = kernel name, type = element type,
 * macro = one of the _4096 bodies above. The final store to the
 * __local variable keeps the loop from being optimized away. */
"#define CALC_FUNC(func, type, macro)"
"__kernel void func() {"
" __local type a;"
" int index = get_global_id(0);"
" type val1 = (type)index;"
" type val2 = (type)2*(type)index;"
" for(int i = 0; i < 1024; i++){ macro(val1, val2) }"
" a = val1 + val2;"
"}\n"
""
"CALC_FUNC(calcFloatMAD, float, CALC_MAD_4096)\n"
"CALC_FUNC(calcFloatADD, float, CALC_ADD_4096)\n"
"CALC_FUNC(calcFloatMUL, float, CALC_MUL_4096)\n"
"\n"
"CALC_FUNC(calcIntMAD, int, CALC_MAD_4096)\n"
"CALC_FUNC(calcIntADD, int, CALC_ADD_4096)\n"
"CALC_FUNC(calcIntMUL, int, CALC_MUL_4096)\n"
"\n"
"#ifdef DOUBLES\n"
"CALC_FUNC(calcDoubleMAD, double, CALC_MAD_4096)\n"
"CALC_FUNC(calcDoubleADD, double, CALC_ADD_4096)\n"
"CALC_FUNC(calcDoubleMUL, double, CALC_MUL_4096)\n"
"#endif\n";
which can’t be built with CUDA SDK 3.0 and the latest official drivers. The problem is that the stack overflows during compilation. This happens when the CALC_FUNC macro is instantiated with an int type. Is this a compiler bug, or is there something wrong with my code?