Compiler Stuck using 100% CPU

I'm working on a rainbow table generator in CUDA and have hit a roadblock. Whenever I put my function call inside a for loop, what I assume is the compiler (be.exe) never finishes and gets stuck.

Relevant code:

/* Ask the compiler to keep the loop rolled (unroll factor 1), then call the
 * hash twice with fixed arguments; `index` ends up holding the last result. */
#pragma unroll 1
	for (i = 0; i < 2; ++i)
		index = plainToHash(1, 2, 3, 12);
/*
 * Hash a short message held in three 32-bit words and return 64 bits of the
 * result.
 *
 * The round structure, shift amounts and additive constants match the MD4
 * compression function (RFC 1320) applied to a single 64-byte block.
 *
 * x0, x1, x2  message words, stored into X[0..2] as-is.
 * size        message length in bytes. It selects where the 0x80 padding byte
 *             goes and is stored as a bit count in X[14]. Only three message
 *             words are supplied, so size is expected to be <= 12; bytes of
 *             x0..x2 at or beyond `size` are presumably zero (verify against
 *             the caller).
 *
 * Returns ((uint64)B << 32) | A, i.e. the first two state words after the
 * final feed-forward addition.
 */
__device__ uint64 plainToHash(uint32 x0, uint32 x1, uint32 x2, uint32 size)
{
	uint32 A, B, C, D;
	uint32 X[16];

	/* Initial chaining values. */
	A = 0x67452301; B = 0xEFCDAB89; C = 0x98BADCFE;  D = 0x10325476;

	/* Message words followed by zero padding. */
	X[0]  = x0; X[1]  = x1; X[2]  = x2; X[3]  = 0;
	X[4]  = 0;  X[5]  = 0;  X[6]  = 0;  X[7]  = 0;
	X[8]  = 0;  X[9]  = 0;  X[10] = 0;  X[11] = 0;
	X[12] = 0;  X[13] = 0;

	/* Message length in bits (low word); the high word stays zero. */
	X[14] = size<<3;
	X[15] = 0;

	/* Padding marker: a single 0x80 byte directly after the message.
	 * The subscript is required; assigning to the cast expression itself
	 * is not valid C/C++. Byte addressing of X assumes the little-endian
	 * layout that MD4 uses for its message words. */
	((uchar8 *)X)[size] = 0x80;

	/* Round 1 */
	R0(A,B,C,D,X[ 0], 3,0); R0(D,A,B,C,X[ 1], 7,0);
	R0(C,D,A,B,X[ 2],11,0); R0(B,C,D,A,X[ 3],19,0);
	R0(A,B,C,D,X[ 4], 3,0); R0(D,A,B,C,X[ 5], 7,0);
	R0(C,D,A,B,X[ 6],11,0); R0(B,C,D,A,X[ 7],19,0);
	R0(A,B,C,D,X[ 8], 3,0); R0(D,A,B,C,X[ 9], 7,0);
	R0(C,D,A,B,X[10],11,0); R0(B,C,D,A,X[11],19,0);
	R0(A,B,C,D,X[12], 3,0); R0(D,A,B,C,X[13], 7,0);
	R0(C,D,A,B,X[14],11,0); R0(B,C,D,A,X[15],19,0);

	/* Round 2 */
	R1(A,B,C,D,X[ 0], 3,0x5A827999); R1(D,A,B,C,X[ 4], 5,0x5A827999);
	R1(C,D,A,B,X[ 8], 9,0x5A827999); R1(B,C,D,A,X[12],13,0x5A827999);
	R1(A,B,C,D,X[ 1], 3,0x5A827999); R1(D,A,B,C,X[ 5], 5,0x5A827999);
	R1(C,D,A,B,X[ 9], 9,0x5A827999); R1(B,C,D,A,X[13],13,0x5A827999);
	R1(A,B,C,D,X[ 2], 3,0x5A827999); R1(D,A,B,C,X[ 6], 5,0x5A827999);
	R1(C,D,A,B,X[10], 9,0x5A827999); R1(B,C,D,A,X[14],13,0x5A827999);
	R1(A,B,C,D,X[ 3], 3,0x5A827999); R1(D,A,B,C,X[ 7], 5,0x5A827999);
	R1(C,D,A,B,X[11], 9,0x5A827999); R1(B,C,D,A,X[15],13,0x5A827999);

	/* Round 3 */
	R2(A,B,C,D,X[ 0], 3,0x6ED9EBA1); R2(D,A,B,C,X[ 8], 9,0x6ED9EBA1);
	R2(C,D,A,B,X[ 4],11,0x6ED9EBA1); R2(B,C,D,A,X[12],15,0x6ED9EBA1);
	R2(A,B,C,D,X[ 2], 3,0x6ED9EBA1); R2(D,A,B,C,X[10], 9,0x6ED9EBA1);
	R2(C,D,A,B,X[ 6],11,0x6ED9EBA1); R2(B,C,D,A,X[14],15,0x6ED9EBA1);
	R2(A,B,C,D,X[ 1], 3,0x6ED9EBA1); R2(D,A,B,C,X[ 9], 9,0x6ED9EBA1);
	R2(C,D,A,B,X[ 5],11,0x6ED9EBA1); R2(B,C,D,A,X[13],15,0x6ED9EBA1);
	R2(A,B,C,D,X[ 3], 3,0x6ED9EBA1); R2(D,A,B,C,X[11], 9,0x6ED9EBA1);
	R2(C,D,A,B,X[ 7],11,0x6ED9EBA1); R2(B,C,D,A,X[15],15,0x6ED9EBA1);

	/* Feed-forward: add the initial chaining values back in. */
	A += 0x67452301; B += 0xEFCDAB89; C += 0x98BADCFE; D += 0x10325476;

	return ((uint64)B <<32) | A;
}

Removing the loop makes it compile, and the pragma doesn’t help either.

What do R0, R1, and R2 do?

They are the macros involved in the Hash:

/* Bitwise mixing functions used by the three rounds.
 * F: per-bit select  -- takes the bit of c where b is 1, else the bit of d.
 * G: per-bit majority of b, c, d.
 * H: per-bit parity (XOR) of b, c, d. */
#define F(b,c,d)	((((c) ^ (d)) & (b)) ^ (d))
#define G(b,c,d)	(((b) & (c)) | ((b) & (d)) | ((c) & (d)))
#define H(b,c,d)	((b) ^ (c) ^ (d))

/* Rotate a 32-bit value left by n bits. n must be in 1..31: n == 0 would
 * shift right by 32, which is undefined for a 32-bit operand. */
#define ROTATE(a,n)	(((a) << (n)) | (((a) & 0xffffffff) >> (32 - (n))))

/* One round step: a = rotl(a + k + t + f(b,c,d), s), where f is F, G or H.
 * `a` is modified in place. Each body is wrapped in do { } while (0) so an
 * invocation followed by ';' is exactly one statement, and no macro ends
 * with a dangling line continuation. */
#define R0(a,b,c,d,k,s,t) do { \
	(a) += ((k) + (t) + F((b),(c),(d))); \
	(a) = ROTATE((a), (s)); } while (0)

#define R1(a,b,c,d,k,s,t) do { \
	(a) += ((k) + (t) + G((b),(c),(d))); \
	(a) = ROTATE((a), (s)); } while (0)

#define R2(a,b,c,d,k,s,t) do { \
	(a) += ((k) + (t) + H((b),(c),(d))); \
	(a) = ROTATE((a), (s)); } while (0)

I can’t reproduce the problem (Visual C++ 2008 Express on Vista), but once before I did have a kernel which took much much longer to compile than it should have. In that case I messed with the code somewhat and removed all intermediate files, and it went away. Could be an intermittent issue with the compiler and not with your particular code. It happened to me when looking into a potential bug with string constants, so it might be related to having a lot of constant literals in your code. Sorry if that’s not much help.

Here is the kernel. I am compiling this as a .cubin file with the following command line:

$(CUDA_BIN_PATH)\nvcc.exe --opencc-options -LIST:source=on  -ccbin "$(VCInstallDir)bin" -cubin -D_CONSOLE -D_MBCS -Xcompiler /EHsc,/W3,/nologo,/Wp64,/O2,/Zi,/MT --ptxas-options=-v -I"$(CUDA_INC_PATH)"; -I./ -I../../common/inc worker.cu -o $(IntDir)\worker.cubin

When I remove the “noinline” from the beginning of plainToHash() the compiler takes forever. Can anyone reproduce/explain this behavior?
worker.zip (1.54 KB)

bump

Did anyone else get similar results or am I just crazy?