int64 build error incorrect register class for operand 0

Hey guys,
I tried to compile SHA-512 code with CUDA v3.1, but I ran into a very strange compile error involving 64-bit integers.
Please take a look at the following code.

// Unsigned integer type used for the 64-bit words in the unions and functions below.
typedef unsigned long long int uint64_t;

// 64-byte buffer that can be addressed as bytes, 32-bit words or 64-bit words.
// All three members overlay the same storage.
typedef union
{
    uint8_t  b[64];   // byte view
    uint32_t d[16];   // 32-bit word view
    uint64_t ll[8];   // 64-bit word view
} Byte64;

// 128-byte buffer that can be addressed as bytes, 32-bit words or 64-bit words.
// All three members overlay the same storage.
typedef union
{
    uint8_t  b[128];  // byte view
    uint32_t d[32];   // 32-bit word view
    uint64_t ll[16];  // 64-bit word view
} Byte128;
// Hashing context: its only member is the 64-byte running hash value.
typedef struct
{
    Byte64 hash;   // eight 64-bit state words, accessed through hash.ll[0..7]
} SHA512_CTX;

// Reset a hashing context to its starting state.
//
// Stores the eight constants INIT_A0..INIT_A7 (defined elsewhere) into
// ctx->hash.ll[0..7]. Nothing else in the context is touched.
//
// The execution-space qualifier must be spelled __device__; a bare `device`
// is not a CUDA keyword and does not compile.
//
//   ctx  context to initialise; must point to valid storage.
__device__ void SHA512_Init(SHA512_CTX *ctx)
{
    ctx->hash.ll[0] = INIT_A0;
    ctx->hash.ll[1] = INIT_A1;
    ctx->hash.ll[2] = INIT_A2;
    ctx->hash.ll[3] = INIT_A3;
    ctx->hash.ll[4] = INIT_A4;
    ctx->hash.ll[5] = INIT_A5;
    ctx->hash.ll[6] = INIT_A6;
    ctx->hash.ll[7] = INIT_A7;
}

// Mix one 128-byte block into the running hash held in ctx.
//
// The eight state words are copied into locals a..h, a sequence of _RSHA512
// round macros is applied, and the locals are then added back into
// ctx->hash.ll[0..7].
//
// The execution-space qualifier must be spelled __device__; a bare `device`
// is not a CUDA keyword and does not compile.
//
//   ctx   context whose hash words are read and then updated in place.
//   b128  input block; NOTE that it is modified in place: each of the first
//         sixteen rounds overwrites b128->ll[i] with the next schedule word.
//
// _RSHA512, sigma0 and sigma1 are macros defined elsewhere.
//
// NOTE(review): only round indices 0-15 and 64-79 appear here. SHA-512 has
// 80 rounds, so indices 16-63 were presumably trimmed from this excerpt;
// confirm against the full source before relying on the output.
__device__ void SHA512_Block(SHA512_CTX *ctx, Byte128 *b128)
{
    // Working copies of the eight state words.
    uint64_t a=ctx->hash.ll[0];
    uint64_t b=ctx->hash.ll[1];
    uint64_t c=ctx->hash.ll[2];
    uint64_t d=ctx->hash.ll[3];
    uint64_t e=ctx->hash.ll[4];
    uint64_t f=ctx->hash.ll[5];
    uint64_t g=ctx->hash.ll[6];
    uint64_t h=ctx->hash.ll[7];
    // Presumably scratch temporaries referenced by the _RSHA512 macro (its
    // body is not visible here), so they are kept even though no statement
    // below names them directly.
    uint64_t t1=0,t2=0;

    // Round indices 0-15: the argument order rotates by one position each
    // round, and every round is followed by an in-place update of the word
    // it just consumed.
    _RSHA512(a,b,c,d,e,f,g,h,0,b128->ll[0]);b128->ll[0]+=sigma1(b128->ll[14])+b128->ll[9]+sigma0(b128->ll[1]);
    _RSHA512(h,a,b,c,d,e,f,g,1,b128->ll[1]);b128->ll[1]+=sigma1(b128->ll[15])+b128->ll[10]+sigma0(b128->ll[2]);
    _RSHA512(g,h,a,b,c,d,e,f,2,b128->ll[2]);b128->ll[2]+=sigma1(b128->ll[0])+b128->ll[11]+sigma0(b128->ll[3]);
    _RSHA512(f,g,h,a,b,c,d,e,3,b128->ll[3]);b128->ll[3]+=sigma1(b128->ll[1])+b128->ll[12]+sigma0(b128->ll[4]);
    _RSHA512(e,f,g,h,a,b,c,d,4,b128->ll[4]);b128->ll[4]+=sigma1(b128->ll[2])+b128->ll[13]+sigma0(b128->ll[5]);
    _RSHA512(d,e,f,g,h,a,b,c,5,b128->ll[5]);b128->ll[5]+=sigma1(b128->ll[3])+b128->ll[14]+sigma0(b128->ll[6]);
    _RSHA512(c,d,e,f,g,h,a,b,6,b128->ll[6]);b128->ll[6]+=sigma1(b128->ll[4])+b128->ll[15]+sigma0(b128->ll[7]);
    _RSHA512(b,c,d,e,f,g,h,a,7,b128->ll[7]);b128->ll[7]+=sigma1(b128->ll[5])+b128->ll[0]+sigma0(b128->ll[8]);
    _RSHA512(a,b,c,d,e,f,g,h,8,b128->ll[8]);b128->ll[8]+=sigma1(b128->ll[6])+b128->ll[1]+sigma0(b128->ll[9]);
    _RSHA512(h,a,b,c,d,e,f,g,9,b128->ll[9]);b128->ll[9]+=sigma1(b128->ll[7])+b128->ll[2]+sigma0(b128->ll[10]);
    _RSHA512(g,h,a,b,c,d,e,f,10,b128->ll[10]);b128->ll[10]+=sigma1(b128->ll[8])+b128->ll[3]+sigma0(b128->ll[11]);
    _RSHA512(f,g,h,a,b,c,d,e,11,b128->ll[11]);b128->ll[11]+=sigma1(b128->ll[9])+b128->ll[4]+sigma0(b128->ll[12]);
    _RSHA512(e,f,g,h,a,b,c,d,12,b128->ll[12]);b128->ll[12]+=sigma1(b128->ll[10])+b128->ll[5]+sigma0(b128->ll[13]);
    _RSHA512(d,e,f,g,h,a,b,c,13,b128->ll[13]);b128->ll[13]+=sigma1(b128->ll[11])+b128->ll[6]+sigma0(b128->ll[14]);
    _RSHA512(c,d,e,f,g,h,a,b,14,b128->ll[14]);b128->ll[14]+=sigma1(b128->ll[12])+b128->ll[7]+sigma0(b128->ll[15]);
    _RSHA512(b,c,d,e,f,g,h,a,15,b128->ll[15]);b128->ll[15]+=sigma1(b128->ll[13])+b128->ll[8]+sigma0(b128->ll[0]);

    // Round indices 64-79: same rotation, but the block words are only read.
    _RSHA512(a,b,c,d,e,f,g,h,64,b128->ll[0]);
    _RSHA512(h,a,b,c,d,e,f,g,65,b128->ll[1]);
    _RSHA512(g,h,a,b,c,d,e,f,66,b128->ll[2]);
    _RSHA512(f,g,h,a,b,c,d,e,67,b128->ll[3]);
    _RSHA512(e,f,g,h,a,b,c,d,68,b128->ll[4]);
    _RSHA512(d,e,f,g,h,a,b,c,69,b128->ll[5]);
    _RSHA512(c,d,e,f,g,h,a,b,70,b128->ll[6]);
    _RSHA512(b,c,d,e,f,g,h,a,71,b128->ll[7]);
    _RSHA512(a,b,c,d,e,f,g,h,72,b128->ll[8]);
    _RSHA512(h,a,b,c,d,e,f,g,73,b128->ll[9]);
    _RSHA512(g,h,a,b,c,d,e,f,74,b128->ll[10]);
    _RSHA512(f,g,h,a,b,c,d,e,75,b128->ll[11]);
    _RSHA512(e,f,g,h,a,b,c,d,76,b128->ll[12]);
    _RSHA512(d,e,f,g,h,a,b,c,77,b128->ll[13]);
    _RSHA512(c,d,e,f,g,h,a,b,78,b128->ll[14]);
    _RSHA512(b,c,d,e,f,g,h,a,79,b128->ll[15]);

    // Fold the working variables back into the context.
    ctx->hash.ll[0]+=a;
    ctx->hash.ll[1]+=b;
    ctx->hash.ll[2]+=c;
    ctx->hash.ll[3]+=d;
    ctx->hash.ll[4]+=e;
    ctx->hash.ll[5]+=f;
    ctx->hash.ll[6]+=g;
    ctx->hash.ll[7]+=h;
}

_RSHA512, sigma1 and sigma0 are SHA-512 macros.
Code snippet that calls the functions above:

    // Excerpt from the calling code: two independent contexts, each
    // initialised and then fed one block.
    // NOTE(review): judging by the names, these look like the inner/outer
    // contexts and padded keys of an HMAC construction; confirm against the
    // full kernel. How k_iPad / k_oPad are filled is not shown in this excerpt.
    SHA512_CTX i_Ctx;
Byte128 k_iPad;
SHA512_CTX o_Ctx;
Byte128 k_oPad;
    SHA512_Init(&i_Ctx);
    SHA512_Block(&i_Ctx,&k_iPad);// first call to SHA512_Block: no error on its own
    SHA512_Init(&o_Ctx);
    SHA512_Block(&o_Ctx,&k_oPad); // adding this second call is what triggers the compiler error

compile error:
1>Kernel.compute_10.cudafe1.gpu
1>Kernel.compute_10.cudafe2.gpu
1>### Assertion failure at line 1923 of ../../be/cg/cgemit.cxx:
1>### Compiler Error in file Kernel.compute_20.cpp3.i during Assembly phase:
1>### incorrect register class for operand 0
1>nvopencc ERROR: C:\CUDA\bin/../open64/lib//be.exe returned non-zero status 1

Does anybody know why this happens? I am really stuck on it.
Thanks!

The error message indicates an internal compiler error. I see you are building with the toolchain from CUDA 3.1. As the CUDA 3.2 final release is out, I would suggest you try that. If the problem persists, please attach a self-contained repro case (or send one in a PM via the forum), and I’d be happy to take a look.

Hi njuffa,
I have updated CUDA from v3.1 to v3.2, but the error persists:
it appears when compiling the code for sm_20.
It seems to be triggered by calling SHA512_Block more than once.
I am really stuck on this. Thank you for your kindness.


compile with cuda v3.2 error info:
1>Compiling with CUDA Build Rule...
1>"C:\CUDA\v3.2\bin\nvcc.exe" -gencode=arch=compute_10,code="sm_10,compute_10" -gencode=arch=compute_20,code="sm_20,compute_20" --machine 32 -ccbin "C:\Program Files\Microsoft Visual Studio 8\VC\bin" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -I"C:\CUDA\v3.2\include" -maxrregcount=32 --compile -o "Release/Kernel.cu.obj" Kernel.cu
1>Kernel.cu
1>tmpxft_00000e48_00000000-6_Kernel.compute_10.cudafe1.gpu
1>tmpxft_00000e48_00000000-10_Kernel.compute_10.cudafe2.gpu
1>Kernel.cu
1>tmpxft_00000e48_00000000-3_Kernel.compute_20.cudafe1.gpu
1>tmpxft_00000e48_00000000-14_Kernel.compute_20.cudafe2.gpu
1>Kernel.cu
1>Kernel.cu
1>### Assertion failure at line 1923 of ../../be/cg/cgemit.cxx:
1>### Compiler Error in file C:/DOCUME~1/ADMINI~1/LOCALS~1/Temp/tmpxft_00000e48_00000000-15_Kernel.compute_20.cpp3.i during Assembly phase:
1>### incorrect register class for operand 0
1>nvopencc ERROR: C:\CUDA\v3.2\bin/../open64/lib//be.exe returned non-zero status 1
1>Project : error PRJ0019: “Compiling with CUDA Build Rule…”


compile with cuda v3.0 error info:
Compiling with CUDA Build Rule...
"C:\CUDA\bin\nvcc.exe" -ccbin "d:\Program Files\Microsoft Visual Studio 8\VC\bin" -I"C:\CUDA\include" -I"./" -I"../../common/inc" -I"../../../shared/inc" -Xcompiler "/EHsc /W3 /nologo /O2 /Zi /MT " -maxrregcount=32 -gencode=arch=compute_10,code="sm_10,compute_10" -gencode=arch=compute_20,code="sm_20,compute_20" --compile -o "Release\Kernel.cu.obj" ".\Kernel.cu"
Kernel.cu
tmpxft_00000160_00000000-6_Kernel.compute_10.cudafe1.gpu
tmpxft_00000160_00000000-10_Kernel.compute_10.cudafe2.gpu
Kernel.cu
tmpxft_00000160_00000000-3_Kernel.compute_20.cudafe1.gpu
tmpxft_00000160_00000000-14_Kernel.compute_20.cudafe2.gpu
ptxas error : Entry function uses too much local data (0x4b4c bytes, 0x4000 max)
kernel_update.cu (14.8 KB)

I am unable to repro as the attached file causes compiler unhappiness at an earlier stage. If you could attach code that has the issues below fixed, I’d be happy to take another look. BTW, are you on WinXP32 or WinXP64?

kernel.cu

"C:[…]/kernel.cu(272): error: identifier “SHA512_DIGESTSIZE” is undefined

"C:[…]/kernel.cu(297): error: argument of type “uint64_t *” is incompatible with parameter of type “Byte128 *”

"C:[…]/kernel.cu(304): error: argument of type “uint64_t *” is incompatible with parameter of type “Byte128 *”

"C:[…]/kernel.cu(316): error: argument of type “uint64_t *” is incompatible with parameter of type “Byte128 *”

"C:[…]/kernel.cu(322): error: argument of type “uint64_t *” is incompatible with parameter of type “Byte128 *”

5 errors detected in the compilation of “kernel.compute_10.cpp1.ii”.

Hi njuffa, I have updated kernel.cu; please see the attached kernel_update.cu.

It is on WinXP32.

I am able to repro on Win32 (see below). I will follow up with our compiler team. Thank you for bringing this issue to our attention.

Compiling C:[…]kernel.cu

kernel.cu

kernel.compute_10.cudafe1.gpu

kernel.compute_10.cudafe2.gpu

kernel.cu

kernel.compute_20.cudafe1.gpu

kernel.compute_20.cudafe2.gpu

kernel.cu

kernel.cu

Assertion failure at line 2761 of ../../be/cg/NVISA/cgtarget.cxx:

Compiler Error in file kernel.compute_20.cpp3.i during Register Allocation phase:

ran out of registers in integer64

nvopencc ERROR: C:[…]/open64/lib//be.exe returned non-zero status 1

Is this a compiler bug?

Yes, this kind of assertion failure indicates an internal compiler error. If the compiler team identifies a workaround, I will let you know.

The compiler team suggests, as a workaround, adding the `__noinline__` attribute to the SHA512_Block() function, changing the code to:

__device__ __noinline__ void SHA512_Block(SHA512_CTX *ctx, Byte128 *b128)

This fixes the sm_20 compilation issue for me. Note that the above will result in a warning

kernel.cu(151): Warning: Pointer parameters must be inlined, so overriding noinline attribute on ‘_Z12SHA512_BlockP10SHA512_CTXP7Byte128’

when compiling for sm_1x. This warning can be ignored. To eliminate the warning, one could conditionally apply `__noinline__` based on the target architecture:

__device__
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
__noinline__
#endif
void SHA512_Block(SHA512_CTX *ctx, Byte128 *b128)