@ njuffa
Ok, thanks and sorry, I actually thought my short explanations and small example would be sufficient for this problem. I’ve programmed a small programm around my example:
#include "string"
#include "nvrtc.h"
#include "cuda.h"
#include <iostream>
int main()
{
std::string MyKernel =
"extern \"C\" __global__ void Test_Kernel(float * __restrict__ Y_1,\n"
" float * __restrict__ Y_2,\n"
" const float * __restrict__ X_1,\n"
" const float * __restrict__ X_2)\n"
"{\n"
" int Index = blockIdx.x*blockDim.x + threadIdx.x;\n"
"\n"
" Y_1[Index] = 1.f * X_1[Index] + 2.f*X_2[Index];\n"
" Y_2[Index] = 3.f * X_1[Index] + 4.f*X_2[Index];\n"
"}\n";
//std::cout << TotalProgramString << '\n';
nvrtcProgram prog;
std::string PTXName = "CudaProgram";
nvrtcCreateProgram(&prog, MyKernel.c_str(), PTXName.c_str(), 0, NULL, NULL);
const char *opts[] = { "--gpu-architecture=compute_35"};
nvrtcCompileProgram(prog, 1, opts);
size_t LogSize;
nvrtcGetProgramLogSize(prog, &LogSize);
std::string Log;
Log.resize(LogSize);
nvrtcGetProgramLog(prog, (char*)(Log.c_str()));
std::cout << Log << '\n';
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
std::string PTXString;
PTXString.resize(ptxSize);
nvrtcGetPTX(prog, (char*)(PTXString.c_str()));
CUdevice cuDevice;
CUcontext context;
CUmodule module;
CUfunction kernel;
cuInit(0);
cuDeviceGet(&cuDevice, 0);
cuCtxCreate(&context, 0, cuDevice);
cuModuleLoadDataEx(&module, PTXString.c_str(), 0, 0, 0);
cuModuleGetFunction(&kernel, module, "Test_Kernel");
CUdeviceptr Y1, Y2, X1, X2;
cuMemAlloc(&Y1, 4);
cuMemAlloc(&Y2, 4);
cuMemAlloc(&X1, 4);
cuMemAlloc(&X2, 4);
void *args[] = { &Y1, &Y2, &X1, &X2 };
cuLaunchKernel(kernel,
1, 1, 1, // grid dim
1, 1, 1, // block dim
0, NULL, // shared mem and stream
args, 0);
// arguments
cuCtxSynchronize();
}
This example creates the following SASS:
1 90848 MOV R1, c[0x0][0x44];
2 90850 S2R R0, SR_CTAID.X;
3 90858 MOV32I R5, 0x4;
4 90860 S2R R3, SR_TID.X;
5 90868 IMAD R4, R0, c[0x0][0x28], R3;
6 90870 IMAD R10.CC, R4, R5, c[0x0][0x150];
7 90878 IMAD.HI.X R11, R4, R5, c[0x0][0x154];
8 90888 IMAD R8.CC, R4, R5, c[0x0][0x158];
9 90890 IMAD.HI.X R9, R4, R5, c[0x0][0x15c];
10 90898 LD.E.CG R0, [R10];
11 908A0 LD.E.CG R3, [R8];
12 908A8 IMAD R6.CC, R4, R5, c[0x0][0x140];
13 908B0 IMAD.HI.X R7, R4, R5, c[0x0][0x144];
14 908B8 FFMA R3, R3, 2, R0;
15 908C8 ST.E [R6], R3;
16 908D0 LD.E.CG R0, [R8];
17 908D8 LD.E.CG R2, [R10];
18 908E0 IMAD R6.CC, R4, R5, c[0x0][0x148];
19 908E8 IMAD.HI.X R7, R4, R5, c[0x0][0x14c];
20 908F0 FMUL R3, R0, 4;
21 908F8 FFMA R0, R2, 3, R3;
22 90908 ST.E [R6], R0;
23 90910 EXIT;
24 90918 BRA 0xd8; # Target=0x00090918
25 90920 NOP;
26 90928 NOP;
27 90930 NOP;
28 90938 NOP;
Note that in line 15,16,17, there are two loads following a store instruction. This is a bad sign for the compiler harnessing the restrict keyword. Interestingly, if I use “–restrict” as a compiler option, then the compiler optimizes this simple example correctly. Unfortunately this also causes the compiler to use the ldg instruction, which is only polluting the the read only data cache without yielding any benefit. However, if I use more complex programms such as the following source code, then even this compiler option does not help:
float2 Stencil_3D(const float2 * __restrict__ UV, int3 Pos)
{
float Diffusion_Constant = 123;
float A = 0.1;
float B = 0.2;
int Index = (SIZE_Y * Pos.z + Pos.y) * SIZE_X + Pos.x;
int Index_X_Plus = Index + 1;
int Index_X_Minus = Index - 1;
int Index_Y_Plus = Index + SIZE_X;
int Index_Y_Minus = Index - SIZE_X;
int Index_Z_Plus = Index + SIZE_X*SIZE_Y;
int Index_Z_Minus = Index - SIZE_X*SIZE_Y;
if(Pos.x == 0)
Index_X_Minus += 2;
if(Pos.y == 0)
Index_Y_Minus += 2*SIZE_X;
if(Pos.z == 0)
Index_Z_Minus += 2*SIZE_X*SIZE_Y;
if(Pos.x == SIZE_X-1)
Index_X_Plus -= 2;
if(Pos.y == SIZE_Y-1)
Index_Y_Plus -= 2*SIZE_X;
if(Pos.z == SIZE_Z-1)
Index_Z_Plus -= 2*SIZE_X*SIZE_Y;
float2 UVCenter = __ldg(&UV[Index]);
// float2 Diffusion = Diffusion_Constant * (UV[Index_X_Minus]+UV[Index_X_Plus] +UV[Index_Y_Minus]+UV[Index_Y_Plus]+UV[Index_Z_Minus]+UV[Index_Z_Plus]);
float2 Diffusion = Diffusion_Constant * (__ldg(&UV[Index_X_Minus])+__ldg(&UV[Index_X_Plus]) +__ldg(&UV[Index_Y_Minus]) +__ldg(&UV[Index_Y_Plus]) +__ldg(&UV[Index_Z_Minus]) +__ldg(&UV[Index_Z_Plus]));
float U = UVCenter.x;
float V = UVCenter.y;
float2 DUVDT;
DUVDT.x= (A + U*U*V - B*U-U) -(6.f*Diffusion_Constant)*U + Diffusion.x;
DUVDT.y= (B*U - U*U*V)-(6.f*Diffusion_Constant)*V + Diffusion.y;
return DUVDT;
}
float __constant__ A[8][8];
float __constant__ B[8];
float __constant__ B_Tilde[8];
__global__ void Kernel_7(const float2 * __restrict__ Y_n,
const float2 * __restrict__ k_5,
const float2 * __restrict__ F_1,
const float2 * __restrict__ F_2,
const float2 * __restrict__ F_3,
const float2 * __restrict__ F_4,
float2 * __restrict__ k_6,
float2 * __restrict__ k_7,
float2 * __restrict__ k_8_p,
float2 * __restrict__ Y_n_plus_1_p,
float2 * __restrict__ E_p)
{
int3 Pos = make_int3(GET_GLOBAL_INDEX_X, GET_GLOBAL_INDEX_Y, GET_GLOBAL_INDEX_Z);
int Index = (SIZE_Y * GET_GLOBAL_INDEX_Z + GET_GLOBAL_INDEX_Y) * SIZE_X + GET_GLOBAL_INDEX_X;
float F_5_Reg = Stencil_3D(k_5, Pos);
k_6[Index]= Y_n[Index] + A[5][0]*F_1[Index] + A[5][1]*F_2[Index] + A[5][2] *F_3[Index]
+ A[5][3]*F_4[Index] + A[5][4]*F_5_Reg;
k_7[Index] = Y_n[Index] + A[6][0]*F_1[Index] + A[6][1]*F_2[Index] + A[6][2] *F_3[Index]
+ A[6][3]*F_4[Index] + A[6][4]*F_5_Reg;
k_8_p[Index] = Y_n[Index] + A[7][0]*F_1[Index] + A[7][1]*F_2[Index] + A[7][2] *F_3[Index]
+ A[7][3]*F_4[Index] + A[7][4]*F_5_Reg;
Y_n_plus_1_p[Index] = Y_n[Index] + B[0]*F_1[Index] + B[2] * F_3[Index] + B[3]*F_4[Index] + B[4]*F_5_Reg;
E_p[Index] = (-1.f) * Y_n[Index] - B_Tilde[0]*F_1[Index]- B_Tilde[2] * F_3[Index]
- B_Tilde[3]*F_4[Index]- B_Tilde[4] * F_5_Reg;
}
Thus Compiler creates the following SASS of this Programm and with the “-restrict” option:
1 92648 MOV R1, c[0x0][0x44];
2 92650 S2R R0, SR_CTAID.Z;
3 92658 MOV32I R27, 0x8;
4 92660 S2R R3, SR_TID.Z;
5 92668 S2R R2, SR_CTAID.Y;
6 92670 S2R R4, SR_TID.Y;
7 92678 IMAD R5, R0, c[0x0][0x30], R3;
8 92688 S2R R6, SR_CTAID.X;
9 92690 S2R R0, SR_TID.X;
10 92698 IMAD R3, R2, c[0x0][0x2c], R4;
11 926A0 IMAD R4, R6, c[0x0][0x28], R0;
12 926A8 ISCADD R0, R5, R3, 0x8;
13 926B0 ISETP.EQ.AND P0, PT, R4, 0xff, PT;
14 926B8 ISCADD R0, R0, R4, 0x8;
15 926C8 IADD R2, R0, 0x1;
16 926D0 IADD R7, R0, -0x1;
17 926D8 IMAD R8.CC, R0, R27, c[0x0][0x148];
18 926E0 IADD R14, R0, -0x100;
19 926E8 ICMP.EQ R6, R2, R7, R4;
20 926F0 SEL R4, R7, R2, P0;
21 926F8 IMAD.HI.X R9, R0, R27, c[0x0][0x14c];
22 92708 IADD R7, R0, 0x100;
23 92710 IMAD R12.CC, R6, R27, c[0x0][0x148];
24 92718 IADD R16, R0, 0x10000;
25 92720 ISETP.EQ.AND P0, PT, R3, 0xff, PT;
26 92728 IADD R17, R0, -0x10000;
27 92730 IMAD.HI.X R13, R6, R27, c[0x0][0x14c];
28 92738 ICMP.EQ R6, R7, R14, R3;
29 92748 SEL R14, R14, R7, P0;
30 92750 IMAD R2.CC, R4, R27, c[0x0][0x148];
31 92758 IMAD.HI.X R3, R4, R27, c[0x0][0x14c];
32 92760 ISETP.EQ.AND P0, PT, R5, 0x7f, PT;
33 92768 IMAD R10.CC, R6, R27, c[0x0][0x148];
34 92770 ICMP.EQ R15, R16, R17, R5;
35 92778 LDG.E.64 R28, [R2];
36 92788 IMAD.HI.X R11, R6, R27, c[0x0][0x14c];
37 92790 LDG.E.64 R6, [R8];
38 92798 LDG.E.64 R8, [R12];
39 927A0 IMAD R4.CC, R14, R27, c[0x0][0x148];
40 927A8 SEL R12, R17, R16, P0;
41 927B0 LDG.E.64 R20, [R10];
42 927B8 IMAD.HI.X R5, R14, R27, c[0x0][0x14c];
43 927C8 IMAD R2.CC, R15, R27, c[0x0][0x148];
44 927D0 LDG.E.64 R18, [R4];
45 927D8 IMAD.HI.X R3, R15, R27, c[0x0][0x14c];
46 927E0 IMAD R10.CC, R12, R27, c[0x0][0x148];
47 927E8 LDG.E.64 R4, [R2];
48 927F0 IMAD.HI.X R11, R12, R27, c[0x0][0x14c];
49 927F8 LDG.E.64 R2, [R10];
50 92808 IMAD R16.CC, R0, R27, c[0x0][0x140];
51 92810 IMAD.HI.X R17, R0, R27, c[0x0][0x144];
52 92818 IMAD R14.CC, R0, R27, c[0x0][0x150];
53 92820 LD.E.CG.64 R24, [R16];
54 92828 IMAD.HI.X R15, R0, R27, c[0x0][0x154];
55 92830 IMAD R12.CC, R0, R27, c[0x0][0x158];
56 92838 LD.E.CG.64 R30, [R14];
57 92848 IMAD.HI.X R13, R0, R27, c[0x0][0x15c];
58 92850 IMAD R10.CC, R0, R27, c[0x0][0x160];
59 92858 LD.E.CG.64 R22, [R12];
60 92860 IMAD.HI.X R11, R0, R27, c[0x0][0x164];
61 92868 TEXDEPBAR 0x5;
62 92870 FMUL.FTZ R26, R6, R6;
63 92878 TEXDEPBAR 0x4;
64 92888 FADD.FTZ R32, R8, R28;
65 92890 FADD.FTZ R9, R9, R29;
66 92898 IMAD R8.CC, R0, R27, c[0x0][0x168];
67 928A0 FFMA.FTZ R29, R30, c[0x3][0xa0], R24;
68 928A8 FFMA.FTZ R28, R31, c[0x3][0xa0], R25;
69 928B0 FMUL32I.FTZ R30, R6, 0.20000000298023224;
70 928B8 FFMA.FTZ R31, R7, R26, c[0x2][0x0];
71 928C8 TEXDEPBAR 0x3;
72 928D0 FADD.FTZ R20, R32, R20;
73 928D8 FADD.FTZ R21, R9, R21;
74 928E0 LD.E.CG.64 R24, [R10];
75 928E8 FADD.FTZ R31, R31, -R30;
76 928F0 IMAD.HI.X R9, R0, R27, c[0x0][0x16c];
77 928F8 TEXDEPBAR 0x2;
78 92908 FADD.FTZ R18, R20, R18;
79 92910 FADD.FTZ R19, R21, R19;
80 92918 FFMA.FTZ R29, R22, c[0x3][0xa4], R29;
81 92920 LD.E.CG.64 R20, [R8];
82 92928 FFMA.FTZ R22, R23, c[0x3][0xa4], R28;
83 92930 FFMA.FTZ R26, -R7, R26, R30;
84 92938 FADD.FTZ R23, R31, -R6;
85 92948 TEXDEPBAR 0x1;
86 92950 FADD.FTZ R28, R19, R5;
87 92958 FADD.FTZ R4, R18, R4;
88 92960 FFMA.FTZ R7, -R7, 738, R26;
89 92968 FFMA.FTZ R19, R25, c[0x3][0xa8], R22;
90 92970 FFMA.FTZ R18, R24, c[0x3][0xa8], R29;
91 92978 TEXDEPBAR 0x0;
92 92988 FADD.FTZ R3, R28, R3;
93 92990 FFMA.FTZ R5, -R6, 738, R23;
94 92998 FADD.FTZ R4, R4, R2;
95 929A0 IMAD R2.CC, R0, R27, c[0x0][0x170];
96 929A8 FFMA.FTZ R6, R20, c[0x3][0xac], R18;
97 929B0 FFMA.FTZ R20, R21, c[0x3][0xac], R19;
98 929B8 FFMA.FTZ R19, R3, 123, R7;
99 929C8 FFMA.FTZ R18, R4, 123, R5;
100 929D0 IMAD.HI.X R3, R0, R27, c[0x0][0x174];
101 929D8 FFMA.FTZ R21, R19, c[0x3][0xb0], R20;
102 929E0 FFMA.FTZ R20, R18, c[0x3][0xb0], R6;
103 929E8 ST.E.64 [R2], R20;
104 929F0 LD.E.CG.64 R4, [R16];
105 929F8 LD.E.CG.64 R24, [R14];
106 92A08 LD.E.CG.64 R6, [R12];
107 92A10 LD.E.CG.64 R2, [R10];
108 92A18 FFMA.FTZ R21, R24, c[0x3][0xc0], R4;
109 92A20 FFMA.FTZ R20, R25, c[0x3][0xc0], R5;
110 92A28 LD.E.CG.64 R4, [R8];
111 92A30 FFMA.FTZ R6, R6, c[0x3][0xc4], R21;
112 92A38 FFMA.FTZ R7, R7, c[0x3][0xc4], R20;
113 92A48 FFMA.FTZ R2, R2, c[0x3][0xc8], R6;
114 92A50 FFMA.FTZ R20, R3, c[0x3][0xc8], R7;
115 92A58 IMAD R6.CC, R0, R27, c[0x0][0x178];
116 92A60 FFMA.FTZ R3, R4, c[0x3][0xcc], R2;
117 92A68 FFMA.FTZ R2, R5, c[0x3][0xcc], R20;
118 92A70 IMAD.HI.X R7, R0, R27, c[0x0][0x17c];
119 92A78 FFMA.FTZ R20, R18, c[0x3][0xd0], R3;
120 92A88 FFMA.FTZ R21, R19, c[0x3][0xd0], R2;
121 92A90 ST.E.64 [R6], R20;
122 92A98 LD.E.CG.64 R4, [R16];
123 92AA0 LD.E.CG.64 R24, [R14];
124 92AA8 LD.E.CG.64 R2, [R12];
125 92AB0 LD.E.CG.64 R6, [R10];
126 92AB8 FFMA.FTZ R20, R24, c[0x3][0xe0], R4;
127 92AC8 FFMA.FTZ R21, R25, c[0x3][0xe0], R5;
128 92AD0 LD.E.CG.64 R4, [R8];
129 92AD8 FFMA.FTZ R2, R2, c[0x3][0xe4], R20;
130 92AE0 FFMA.FTZ R3, R3, c[0x3][0xe4], R21;
131 92AE8 FFMA.FTZ R2, R6, c[0x3][0xe8], R2;
132 92AF0 FFMA.FTZ R6, R7, c[0x3][0xe8], R3;
133 92AF8 IMAD R12.CC, R0, R27, c[0x0][0x180];
134 92B08 FFMA.FTZ R3, R4, c[0x3][0xec], R2;
135 92B10 FFMA.FTZ R2, R5, c[0x3][0xec], R6;
136 92B18 IMAD.HI.X R13, R0, R27, c[0x0][0x184];
137 92B20 FFMA.FTZ R20, R18, c[0x3][0xf0], R3;
138 92B28 FFMA.FTZ R21, R19, c[0x3][0xf0], R2;
139 92B30 ST.E.64 [R12], R20;
140 92B38 LD.E.CG.64 R6, [R16];
141 92B48 LD.E.CG.64 R24, [R14];
142 92B50 LD.E.CG.64 R2, [R10];
143 92B58 LD.E.CG.64 R4, [R8];
144 92B60 FFMA.FTZ R6, R24, c[0x3][0x100], R6;
145 92B68 FFMA.FTZ R7, R25, c[0x3][0x100], R7;
146 92B70 FFMA.FTZ R2, R2, c[0x3][0x108], R6;
147 92B78 FFMA.FTZ R12, R3, c[0x3][0x108], R7;
148 92B88 IMAD R6.CC, R0, R27, c[0x0][0x188];
149 92B90 FFMA.FTZ R3, R4, c[0x3][0x10c], R2;
150 92B98 FFMA.FTZ R2, R5, c[0x3][0x10c], R12;
151 92BA0 IMAD.HI.X R7, R0, R27, c[0x0][0x18c];
152 92BA8 FFMA.FTZ R22, R18, c[0x3][0x110], R3;
153 92BB0 FFMA.FTZ R23, R19, c[0x3][0x110], R2;
154 92BB8 ST.E.64 [R6], R22;
155 92BC8 LD.E.CG.64 R2, [R16];
156 92BD0 LD.E.CG.64 R4, [R14];
157 92BD8 IMAD R6.CC, R0, R27, c[0x0][0x190];
158 92BE0 LD.E.CG.64 R12, [R10];
159 92BE8 LD.E.CG.64 R20, [R8];
160 92BF0 FFMA.FTZ R2, -R4, c[0x3][0x120], -R2;
161 92BF8 FFMA.FTZ R3, -R5, c[0x3][0x120], -R3;
162 92C08 FFMA.FTZ R2, -R12, c[0x3][0x128], R2;
163 92C10 FFMA.FTZ R4, -R13, c[0x3][0x128], R3;
164 92C18 FFMA.FTZ R3, -R20, c[0x3][0x12c], R2;
165 92C20 FFMA.FTZ R2, -R21, c[0x3][0x12c], R4;
166 92C28 IMAD.HI.X R7, R0, R27, c[0x0][0x194];
167 92C30 FFMA.FTZ R4, -R18, c[0x3][0x130], R3;
168 92C38 FFMA.FTZ R5, -R19, c[0x3][0x130], R2;
169 92C48 ST.E.64 [R6], R4;
170 92C50 EXIT;
171 92C58 BRA 0x618; # Target=0x00092c58
172 92C60 NOP;
173 92C68 NOP;
174 92C70 NOP;
175 92C78 NOP;
Note that the compiler again places loads immediately after stores.