Consider the following 4 variants to intialize an array with a constant size defined via a define statement.
#define FAC 4
extern "C" __global__ void mykernel1 (void)
{
// This is not handled well by the compiler
double C[FAC*FAC] = {0.0};
}
extern "C" __global__ void mykernel2 (void)
{
double C[FAC*FAC];
#pragma unroll
for(int i=0; i< FAC*FAC; ++i)
C[i] = 0.0;
}
extern "C" __global__ void mykernel3 (void)
{
// Not generic
double C[FAC*FAC] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
}
extern "C" __global__ void mykernel4 (void)
{
// Missing one value.
double C[FAC*FAC] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0};
}
The assembler output from
nvcc --maxrregcount 63 -arch sm_20 --ptxas-options=“-v” --cubin bug.cu
cuobjdump --dump-sass bug.cubin > bug.ass
is
code for sm_20
Function : mykernel1
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00105d034800c002*/ IADD R1, R1, -0x80;
/*0010*/ /*0x001fdca5c8000000*/ STL.64 [R1], RZ;
/*0018*/ /*0x201fdca5c8000000*/ STL.64 [R1+0x8], RZ;
/*0020*/ /*0x401fdca5c8000000*/ STL.64 [R1+0x10], RZ;
/*0028*/ /*0x601fdca5c8000000*/ STL.64 [R1+0x18], RZ;
/*0030*/ /*0x801fdca5c8000000*/ STL.64 [R1+0x20], RZ;
/*0038*/ /*0xa01fdca5c8000000*/ STL.64 [R1+0x28], RZ;
/*0040*/ /*0xc01fdca5c8000000*/ STL.64 [R1+0x30], RZ;
/*0048*/ /*0xe01fdca5c8000000*/ STL.64 [R1+0x38], RZ;
/*0050*/ /*0x001fdca5c8000001*/ STL.64 [R1+0x40], RZ;
/*0058*/ /*0x201fdca5c8000001*/ STL.64 [R1+0x48], RZ;
/*0060*/ /*0x401fdca5c8000001*/ STL.64 [R1+0x50], RZ;
/*0068*/ /*0x601fdca5c8000001*/ STL.64 [R1+0x58], RZ;
/*0070*/ /*0x801fdca5c8000001*/ STL.64 [R1+0x60], RZ;
/*0078*/ /*0xa01fdca5c8000001*/ STL.64 [R1+0x68], RZ;
/*0080*/ /*0xc01fdca5c8000001*/ STL.64 [R1+0x70], RZ;
/*0088*/ /*0xe01fdca5c8000001*/ STL.64 [R1+0x78], RZ;
/*0090*/ /*0x00001de780000000*/ EXIT;
..........................
Function : mykernel2
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00001de780000000*/ EXIT;
..........................
Function : mykernel3
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00001de780000000*/ EXIT;
..........................
Function : mykernel4
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x00105d034800c002*/ IADD R1, R1, -0x80;
/*0010*/ /*0x001fdca5c8000000*/ STL.64 [R1], RZ;
/*0018*/ /*0x201fdca5c8000000*/ STL.64 [R1+0x8], RZ;
/*0020*/ /*0x401fdca5c8000000*/ STL.64 [R1+0x10], RZ;
/*0028*/ /*0x601fdca5c8000000*/ STL.64 [R1+0x18], RZ;
/*0030*/ /*0x801fdca5c8000000*/ STL.64 [R1+0x20], RZ;
/*0038*/ /*0xa01fdca5c8000000*/ STL.64 [R1+0x28], RZ;
/*0040*/ /*0xc01fdca5c8000000*/ STL.64 [R1+0x30], RZ;
/*0048*/ /*0xe01fdca5c8000000*/ STL.64 [R1+0x38], RZ;
/*0050*/ /*0x001fdca5c8000001*/ STL.64 [R1+0x40], RZ;
/*0058*/ /*0x201fdca5c8000001*/ STL.64 [R1+0x48], RZ;
/*0060*/ /*0x401fdca5c8000001*/ STL.64 [R1+0x50], RZ;
/*0068*/ /*0x601fdca5c8000001*/ STL.64 [R1+0x58], RZ;
/*0070*/ /*0x801fdca5c8000001*/ STL.64 [R1+0x60], RZ;
/*0078*/ /*0xa01fdca5c8000001*/ STL.64 [R1+0x68], RZ;
/*0080*/ /*0xc01fdca5c8000001*/ STL.64 [R1+0x70], RZ;
/*0088*/ /*0xe01fdca5c8000001*/ STL.64 [R1+0x78], RZ;
/*0090*/ /*0x00001de780000000*/ EXIT;
..........................
The needless usage of local memory kills the performance of my kernel completely.
Is this a known feature or bug? Thanks.
I think a similar question was raised in
local memory array initialization
BUG: Array initialization does not conform to C standard
some time ago.