nvcompiler.dll - stack overflow

So, I have the following nasty piece of code:

char *clSource =
"#ifdef DOUBLES\n"
"  #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
""
"#define CALC_MAD_16(a, b)"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;\n"
""
"#define CALC_MAD_256(a, b)"
"  CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)"
"  CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)"
"  CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)"
"  CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b) CALC_MAD_16(a, b)\n"
""
"#define CALC_MAD_4096(a, b)"
"  CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)"
"  CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)"
"  CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)"
"  CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b) CALC_MAD_256(a, b)\n"
""
"#define CALC_ADD_16(a, b)"
"  a = a+b; b = a+b; a = a+b; b = a+b;"
"  a = a+b; b = a+b; a = a+b; b = a+b;"
"  a = a+b; b = a+b; a = a+b; b = a+b;"
"  a = a+b; b = a+b; a = a+b; b = a+b;\n"
""
"#define CALC_ADD_256(a, b)"
"  CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)"
"  CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)"
"  CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)"
"  CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b) CALC_ADD_16(a, b)\n"
""
"#define CALC_ADD_4096(a, b)"
"  CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)"
"  CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)"
"  CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)"
"  CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b) CALC_ADD_256(a, b)\n"
""
"#define CALC_MUL_16(a, b)"
"  a = a*b; b = a*b; a = a*b; b = a*b;"
"  a = a*b; b = a*b; a = a*b; b = a*b;"
"  a = a*b; b = a*b; a = a*b; b = a*b;"
"  a = a*b; b = a*b; a = a*b; b = a*b;\n"
""
"#define CALC_MUL_256(a, b)"
"  CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)"
"  CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)"
"  CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)"
"  CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b) CALC_MUL_16(a, b)\n"
""
"#define CALC_MUL_4096(a, b)"
"  CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)"
"  CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)"
"  CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)"
"  CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b) CALC_MUL_256(a, b)\n"
""
"#define CALC_FUNC(func, type, macro)"
"__kernel void func() {"
"  __local type a;"
"  int index = get_global_id(0);"
"  type val1 = (type)index;"
"  type val2 = (type)2*(type)index;"
"  for(int i = 0; i < 1024; i++){ macro(val1, val2) }"
"  a = val1 + val2;"
"}\n"
""
"CALC_FUNC(calcFloatMAD, float, CALC_MAD_4096)\n"
"CALC_FUNC(calcFloatADD, float, CALC_ADD_4096)\n"
"CALC_FUNC(calcFloatMUL, float, CALC_MUL_4096)\n"
"\n"
"CALC_FUNC(calcIntMAD, int, CALC_MAD_4096)\n"
"CALC_FUNC(calcIntADD, int, CALC_ADD_4096)\n"
"CALC_FUNC(calcIntMUL, int, CALC_MUL_4096)\n"
"\n"
"#ifdef DOUBLES\n"
"CALC_FUNC(calcDoubleMAD, double, CALC_MAD_4096)\n"
"CALC_FUNC(calcDoubleADD, double, CALC_ADD_4096)\n"
"CALC_FUNC(calcDoubleMUL, double, CALC_MUL_4096)\n"
"#endif";

which can't be built with CUDA SDK 3.0 and the latest official drivers: nvcompiler.dll dies with a stack overflow. This happens whenever the CALC_FUNC macro is instantiated with int. Is this a compiler bug, or is there something wrong with my code?
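For reference, this is roughly how the source gets compiled on the host side (a minimal sketch, not my full program: context/device setup is omitted, buildSource is just an illustrative name, and the build options are empty here; -DDOUBLES would only be passed when the device reports cl_khr_fp64):

/* Minimal host-side sketch. The crash happens inside clBuildProgram(),
   i.e. in nvcompiler.dll, before any error code is ever returned. */
#include <CL/cl.h>
#include <stdio.h>

extern char *clSource;  /* the string above */

int buildSource(cl_context context, cl_device_id device)
{
    cl_int err;
    const char *srcs[1] = { clSource };
    cl_program program =
        clCreateProgramWithSource(context, 1, srcs, NULL, &err);
    if (err != CL_SUCCESS)
        return -1;

    err = clBuildProgram(program, 1, &device, "", NULL, NULL);
    if (err != CL_SUCCESS) {
        /* never reached on the affected driver: the process dies first */
        char log[16384];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              sizeof(log), log, NULL);
        fprintf(stderr, "build log:\n%s\n", log);
        return -1;
    }
    clReleaseProgram(program);
    return 0;
}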

Here is a shorter version with the same effect. CALC(CALC(CALC_ADD_16)) still expands to 16 × 16 × 16 = 4096 statements per loop iteration, so the compiler sees just as much straight-line code per iteration:

char *clSource =
"#define CALC_MAD_16"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;"
"  a = b*a + b; b = a*b + a; a = b*a + b; b = a*b + a;\n"
""
"#define CALC_ADD_16"
"  a = a+b; b = a+b; a = a+b; b = a+b;"
"  a = a+b; b = a+b; a = a+b; b = a+b;"
"  a = a+b; b = a+b; a = a+b; b = a+b;"
"  a = a+b; b = a+b; a = a+b; b = a+b;\n"
""
"#define CALC_MUL_16"
"  a = a*b; b = a*b; a = a*b; b = a*b;"
"  a = a*b; b = a*b; a = a*b; b = a*b;"
"  a = a*b; b = a*b; a = a*b; b = a*b;"
"  a = a*b; b = a*b; a = a*b; b = a*b;\n"
""
"#define CALC(macro)"
"  macro macro macro macro macro macro macro macro macro macro macro macro macro macro macro macro\n"
""
"#define CALC_FUNC(func, type, macro)"
"  __kernel void func() {"
"    __local type c;"
"    int index = get_global_id(0);"
"    type a = (type)index;"
"    type b = (type)2*(type)index;"
"    for(int i = 0; i < 64; i++){ macro }"
"    c = a + b;"
"  }\n"
""
"CALC_FUNC(calcIntADD, int, CALC(CALC(CALC_ADD_16)))\n"
"CALC_FUNC(calcIntMUL, int, CALC(CALC(CALC_MUL_16)))\n"
"CALC_FUNC(calcIntMAD, int, CALC(CALC(CALC_MAD_16)))\n";

So, to answer myself - the problem seems to be in the optimizer. With integers, it commits suicide, resulting either in a stack overflow or in an attempt to allocate all available address space (which is not an easy task on a 64-bit system, but it tries really hard, at a couple of gigabytes per second).
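For anyone who hits the same thing: the obvious thing to try (I haven't verified that it helps with this particular driver) is the standard -cl-opt-disable build option, which tells the compiler to skip optimizations entirely:

/* possible workaround (unverified): bypass the optimizer,
   which is where the crash appears to happen */
err = clBuildProgram(program, 1, &device, "-cl-opt-disable", NULL, NULL);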