I’m having issues with nvopencc not optimizing my memory reads when I put larger arrays inside my structs.
Consider the following.
[codebox]struct align(16) primitive_testpair {
unsigned int tf;
unsigned int p1[1];
unsigned int _pad1;
unsigned int _pad2;
unsigned int _e0;
unsigned int _e1;
unsigned int _e2;
unsigned int _e3;
}
[/codebox]
Loading this structure from global memory looks like this in the PTX output:
[codebox] ld.global.v4.u32 {%r4,%r5,%r6,%r7}, [%rd4+0]; //
st.local.u32 [__cuda___cuda_tp032+0], %r4; // id:6746 __cuda___cuda_tp032+0x0
st.local.u32 [__cuda___cuda_tp032+4], %r5; // id:6745 __cuda___cuda_tp032+0x4
st.local.u32 [__cuda___cuda_tp032+8], %r6; // id:6754 __cuda___cuda_tp032+0x8
st.local.u32 [__cuda___cuda_tp032+12], %r7; // id:6756 __cuda___cuda_tp032+0xc
ld.global.v4.u32 {%r8,%r9,%r10,%r11}, [%rd4+16]; //
st.local.u32 [__cuda___cuda_tp032+16], %r8; // id:6758 __cuda___cuda_tp032+0x10
st.local.u32 [__cuda___cuda_tp032+20], %r9; // id:6760 __cuda___cuda_tp032+0x14
st.local.u32 [__cuda___cuda_tp032+24], %r10; // id:6762 __cuda___cuda_tp032+0x18
st.local.u32 [__cuda___cuda_tp032+28], %r11; // id:6764 __cuda___cuda_tp032+0x1c[/codebox]
This gives two 128-bit loads, as expected.
Using an array of four unsigned ints instead of the individual members (_e0 through _e3) gives a different result, though:
[codebox]struct align(16) primitive_testpair {
unsigned int tf;
unsigned int p1[1];
unsigned int _pad1;
unsigned int _pad2;
unsigned int _e[4];
}
[/codebox]
Loading this struct from global memory results in the following PTX:
[codebox]ld.global.v4.u32 {%r4,%r5,%r6,%r7}, [%rd4+0]; //
st.local.u32 [__cuda___cuda_tp032+0], %r4; // id:6746 __cuda___cuda_tp032+0x0
st.local.u32 [__cuda___cuda_tp032+4], %r5; // id:6745 __cuda___cuda_tp032+0x4
st.local.u32 [__cuda___cuda_tp032+8], %r6; // id:6754 __cuda___cuda_tp032+0x8
st.local.u32 [__cuda___cuda_tp032+12], %r7; // id:6756 __cuda___cuda_tp032+0xc
ld.global.u32 %r8, [%rd4+16]; // id:6757
st.local.u32 [__cuda___cuda_tp032+16], %r8; // id:6758 __cuda___cuda_tp032+0x10
ld.global.u32 %r9, [%rd4+20]; // id:6759
st.local.u32 [__cuda___cuda_tp032+20], %r9; // id:6760 __cuda___cuda_tp032+0x14
ld.global.u32 %r10, [%rd4+24]; // id:6761
st.local.u32 [__cuda___cuda_tp032+24], %r10; // id:6762 __cuda___cuda_tp032+0x18
ld.global.u32 %r11, [%rd4+28]; // id:6763
st.local.u32 [__cuda___cuda_tp032+28], %r11; // id:6764 __cuda___cuda_tp032+0x1c
ld.global.u32 %r12, [%rd4+32]; // id:6765
[/codebox]
That is one 128-bit load and four 32-bit loads.
Defining the array as
[codebox] unsigned int align(16) _e[4];
[/codebox]
(which is kind of silly, since I’ve already aligned the struct manually) doesn’t work either.
Any hints as to what I’m missing here that would allow the compiler to recognize that it can load the array with a single 128-bit vector load?
Thanks