Hi,
I have an OpenCL kernel that applies a 27pt stencil onto a grid point, calculates a residual and writes it back to global memory. In version a, the stencil coefficients are stored in a global memory array. In version b, they are hardcoded using #define
statement, see code below.
I expected version b to be noticeably faster due to less global memory reads, however version a is faster! I used Nsight Compute to analyze these two versions for quiet a long time now, but I can’t figure out the reason why version a should be faster. Unfortunately, I seem to not be allowed to attach the ncu-rep file. However I exported the raw data as csv and added the .txt file ending, hoping it to be helpful.
raw_residual_profile.txt (145.8 KB)
I’m using pOCL to analyze with Nsight Compute. Executed on a Nvidia Quadro T2000 with CC 7.5.
Especially I’m wondering if the tex stall might be the culprit? I don’t understand why the instruction throughput is that much lower, too.
| metric | global | preprocessor |
| --------------- | --------------- | --------------- | --------------- |
| smsp__average_warps_issue_stalled_tex_throttle_per_issue_active.ratio [inst] | 12,8 | 41,46 (+223,85%) |
| sm__instruction_throughput.avg.pct_of_peak_sustained_active [%] | 95,7 | 84,01 (-12,22%)|
Any help on which metric I should focus at i very much appreciated. Thank you!
OpenCL kernels
// version a
__kernel void residual_27point_fixed_stencil_coeffs_as_global_buffer(
__global double* restrict v_in,
__global double* restrict f,
__global double* restrict r,
__global double* restrict coeffs,
const int mgh, const int ngh, const int ogh,
const int ghosts,
const int moff, const int noff, const int ooff)
{
int idx = get_global_id(0);
int no = ngh * ogh;
int i = idx / no;
int j = (idx - i * no) / ogh;
int k = idx % ogh;
int istart_v = ghosts + moff;
int jstart_v = ghosts + noff;
int kstart_v = ghosts + ooff;
int iend_v = mgh - ghosts - moff;
int jend_v = ngh - ghosts - noff;
int kend_v = ogh - ghosts - ooff;
// calculate residual only for relevant cells (off = 0: only real cells)
if (i >= istart_v && j >= jstart_v && k >= kstart_v && i < iend_v && j < jend_v && k < kend_v)
{
int ioff = ngh * ogh;
int joff = ogh;
int koff = 1;
int index = i * ioff + j * ogh + k;
// clang-format off
double stencilsum = coeffs[1*9 + 1*3 + 1] * v_in[index]
+ coeffs[1*9 + 1*3 + 0] * v_in[index - 1]
+ coeffs[1*9 + 1*3 + 2] * v_in[index + 1]
+ coeffs[1*9 + 0*3 + 1] * v_in[index - joff]
+ coeffs[1*9 + 2*3 + 1] * v_in[index + joff]
+ coeffs[0*9 + 1*3 + 1] * v_in[index - ioff]
+ coeffs[2*9 + 1*3 + 1] * v_in[index + ioff]
+ coeffs[1*9 + 0*3 + 0] * v_in[index - joff - koff]
+ coeffs[1*9 + 0*3 + 2] * v_in[index - joff + koff]
+ coeffs[1*9 + 2*3 + 0] * v_in[index + joff - koff]
+ coeffs[1*9 + 2*3 + 2] * v_in[index + joff + koff]
+ coeffs[0*9 + 1*3 + 0] * v_in[index - ioff - koff]
+ coeffs[0*9 + 1*3 + 2] * v_in[index - ioff + koff]
+ coeffs[2*9 + 1*3 + 0] * v_in[index + ioff - koff]
+ coeffs[2*9 + 1*3 + 2] * v_in[index + ioff + koff]
+ coeffs[0*9 + 0*3 + 1] * v_in[index - ioff - joff]
+ coeffs[0*9 + 2*3 + 1] * v_in[index - ioff + joff]
+ coeffs[2*9 + 0*3 + 1] * v_in[index + ioff - joff]
+ coeffs[2*9 + 2*3 + 1] * v_in[index + ioff + joff]
+ coeffs[0*9 + 0*3 + 0] * v_in[index - ioff - joff - koff]
+ coeffs[0*9 + 0*3 + 2] * v_in[index - ioff - joff + koff]
+ coeffs[0*9 + 2*3 + 0] * v_in[index - ioff + joff - koff]
+ coeffs[0*9 + 2*3 + 2] * v_in[index - ioff + joff + koff]
+ coeffs[2*9 + 0*3 + 0] * v_in[index + ioff - joff - koff]
+ coeffs[2*9 + 0*3 + 2] * v_in[index + ioff - joff + koff]
+ coeffs[2*9 + 2*3 + 0] * v_in[index + ioff + joff - koff]
+ coeffs[2*9 + 2*3 + 2] * v_in[index + ioff + joff + koff];
// clang-format on
r[index] = f[index] - stencilsum;
}
}
// version b
#define FS_COEFF000 0.0
#define FS_COEFF001 1.0
#define FS_COEFF002 2.0
#define FS_COEFF010 3.0
#define FS_COEFF011 4.0
#define FS_COEFF012 5.0
#define FS_COEFF020 6.0
#define FS_COEFF021 7.0
#define FS_COEFF022 8.0
#define FS_COEFF100 9.0
#define FS_COEFF101 10.0
#define FS_COEFF102 11.0
#define FS_COEFF110 12.0
#define FS_COEFF111 13.0
#define FS_COEFF112 14.0
#define FS_COEFF120 15.0
#define FS_COEFF121 16.0
#define FS_COEFF122 17.0
#define FS_COEFF200 18.0
#define FS_COEFF201 19.0
#define FS_COEFF202 20.0
#define FS_COEFF210 21.0
#define FS_COEFF211 22.0
#define FS_COEFF212 23.0
#define FS_COEFF220 24.0
#define FS_COEFF221 25.0
#define FS_COEFF222 26.0
// Gets called with one work-item per grid point
__kernel void residual_27point_fixed_stencil_coeffs_preprocessed(
__global double* restrict v_in,
__global double* restrict f,
__global double* restrict r,
const int mgh, const int ngh, const int ogh,
const int ghosts,
const int moff, const int noff, const int ooff)
{
int idx = get_global_id(0);
int no = ngh * ogh;
int i = idx / no;
int j = (idx - i * no) / ogh;
int k = idx % ogh;
int istart_v = ghosts + moff;
int jstart_v = ghosts + noff;
int kstart_v = ghosts + ooff;
int iend_v = mgh - ghosts - moff;
int jend_v = ngh - ghosts - noff;
int kend_v = ogh - ghosts - ooff;
if (i >= istart_v && j >= jstart_v && k >= kstart_v && i < iend_v && j < jend_v && k < kend_v)
{
int ioff = ngh * ogh;
int joff = ogh;
int koff = 1;
int index = i * ioff + j * ogh + k;
// A*v
// clang-format off
double stencilsum = FS_COEFF111 * v_in[index]
+ FS_COEFF110 * v_in[index - 1]
+ FS_COEFF112 * v_in[index + 1]
+ FS_COEFF101 * v_in[index - joff]
+ FS_COEFF121 * v_in[index + joff]
+ FS_COEFF011 * v_in[index - ioff]
+ FS_COEFF211 * v_in[index + ioff]
+ FS_COEFF100 * v_in[index - joff - koff]
+ FS_COEFF102 * v_in[index - joff + koff]
+ FS_COEFF120 * v_in[index + joff - koff]
+ FS_COEFF122 * v_in[index + joff + koff]
+ FS_COEFF010 * v_in[index - ioff - koff]
+ FS_COEFF012 * v_in[index - ioff + koff]
+ FS_COEFF210 * v_in[index + ioff - koff]
+ FS_COEFF212 * v_in[index + ioff + koff]
+ FS_COEFF001 * v_in[index - ioff - joff]
+ FS_COEFF021 * v_in[index - ioff + joff]
+ FS_COEFF201 * v_in[index + ioff - joff]
+ FS_COEFF221 * v_in[index + ioff + joff]
+ FS_COEFF000 * v_in[index - ioff - joff - koff]
+ FS_COEFF002 * v_in[index - ioff - joff + koff]
+ FS_COEFF020 * v_in[index - ioff + joff - koff]
+ FS_COEFF022 * v_in[index - ioff + joff + koff]
+ FS_COEFF200 * v_in[index + ioff - joff - koff]
+ FS_COEFF202 * v_in[index + ioff - joff + koff]
+ FS_COEFF220 * v_in[index + ioff + joff - koff]
+ FS_COEFF222 * v_in[index + ioff + joff + koff];
// clang-format on
r[index] = f[index] - stencilsum;
}
}
Generated PTX from storing OpenCL binary
// .globl residual_27point_fixed_stencil_coeffs_as_global_buffer
.entry residual_27point_fixed_stencil_coeffs_as_global_buffer(
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_0,
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_1,
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_2,
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_3,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_4,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_5,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_6,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_7,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_8,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_9,
.param .u32 residual_27point_fixed_stencil_coeffs_as_global_buffer_param_10
)
{
.reg .pred %p<12>;
.reg .b32 %r<43>;
.reg .f64 %fd<84>;
.reg .b64 %rd<32>;
ld.param.u64 %rd1, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_0];
ld.param.u64 %rd2, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_1];
ld.param.u64 %rd3, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_2];
ld.param.u64 %rd4, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_3];
ld.param.u32 %r6, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_4];
ld.param.u32 %r7, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_5];
ld.param.u32 %r5, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_6];
ld.param.u32 %r8, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_7];
ld.param.u32 %r9, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_8];
ld.param.u32 %r10, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_9];
ld.param.u32 %r11, [residual_27point_fixed_stencil_coeffs_as_global_buffer_param_10];
mov.b32 %r12, %envreg3;
mov.u32 %r13, %ctaid.x;
mov.u32 %r14, %ntid.x;
mov.u32 %r15, %tid.x;
add.s32 %r16, %r15, %r12;
mad.lo.s32 %r17, %r14, %r13, %r16;
mul.lo.s32 %r1, %r5, %r7;
div.s32 %r18, %r17, %r1;
mul.lo.s32 %r2, %r18, %r1;
sub.s32 %r19, %r17, %r2;
div.s32 %r3, %r19, %r5;
rem.s32 %r4, %r17, %r5;
sub.s32 %r20, %r6, %r8;
sub.s32 %r21, %r20, %r9;
sub.s32 %r22, %r7, %r8;
sub.s32 %r23, %r22, %r10;
sub.s32 %r24, %r5, %r8;
sub.s32 %r25, %r24, %r11;
add.s32 %r26, %r9, %r8;
setp.lt.s32 %p1, %r18, %r26;
add.s32 %r27, %r10, %r8;
setp.lt.s32 %p2, %r3, %r27;
or.pred %p3, %p1, %p2;
add.s32 %r28, %r11, %r8;
setp.lt.s32 %p4, %r4, %r28;
or.pred %p5, %p3, %p4;
setp.ge.s32 %p6, %r18, %r21;
or.pred %p7, %p6, %p5;
setp.ge.s32 %p8, %r3, %r23;
or.pred %p9, %p8, %p7;
setp.ge.s32 %p10, %r4, %r25;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB63_2;
mad.lo.s32 %r29, %r3, %r5, %r2;
add.s32 %r30, %r29, %r4;
mul.wide.s32 %rd5, %r30, 8;
add.s64 %rd6, %rd1, %rd5;
sub.s32 %r31, %r30, %r5;
mul.wide.s32 %rd7, %r31, 8;
add.s64 %rd8, %rd1, %rd7;
mul.wide.s32 %rd9, %r5, 8;
add.s64 %rd10, %rd6, %rd9;
sub.s32 %r32, %r30, %r1;
mul.wide.s32 %rd11, %r32, 8;
add.s64 %rd12, %rd1, %rd11;
add.s32 %r33, %r30, %r1;
mul.wide.s32 %rd13, %r1, 8;
add.s64 %rd14, %rd6, %rd13;
not.b32 %r34, %r5;
add.s32 %r35, %r30, %r34;
mul.wide.s32 %rd15, %r35, 8;
add.s64 %rd16, %rd1, %rd15;
not.b32 %r36, %r1;
add.s32 %r37, %r30, %r36;
mul.wide.s32 %rd17, %r37, 8;
add.s64 %rd18, %rd1, %rd17;
sub.s32 %r38, %r32, %r5;
mul.wide.s32 %rd19, %r38, 8;
add.s64 %rd20, %rd1, %rd19;
add.s32 %r39, %r32, %r5;
mul.wide.s32 %rd21, %r39, 8;
add.s64 %rd22, %rd1, %rd21;
sub.s32 %r40, %r33, %r5;
mul.wide.s32 %rd23, %r40, 8;
add.s64 %rd24, %rd1, %rd23;
add.s64 %rd25, %rd14, %rd9;
add.s32 %r41, %r32, %r34;
mul.wide.s32 %rd26, %r41, 8;
add.s64 %rd27, %rd1, %rd26;
add.s32 %r42, %r33, %r34;
mul.wide.s32 %rd28, %r42, 8;
add.s64 %rd29, %rd1, %rd28;
add.s64 %rd30, %rd2, %rd5;
ld.global.nc.f64 %fd1, [%rd6];
ld.global.nc.f64 %fd2, [%rd4+104];
mul.f64 %fd3, %fd1, %fd2;
ld.global.nc.f64 %fd4, [%rd6+-8];
ld.global.nc.f64 %fd5, [%rd4+96];
ld.global.nc.f64 %fd6, [%rd6+8];
ld.global.nc.f64 %fd7, [%rd4+112];
ld.global.nc.f64 %fd8, [%rd8];
ld.global.nc.f64 %fd9, [%rd4+80];
ld.global.nc.f64 %fd10, [%rd10];
ld.global.nc.f64 %fd11, [%rd4+128];
ld.global.nc.f64 %fd12, [%rd12];
ld.global.nc.f64 %fd13, [%rd4+32];
ld.global.nc.f64 %fd14, [%rd14];
ld.global.nc.f64 %fd15, [%rd4+176];
ld.global.nc.f64 %fd16, [%rd16];
ld.global.nc.f64 %fd17, [%rd4+72];
ld.global.nc.f64 %fd18, [%rd16+16];
ld.global.nc.f64 %fd19, [%rd4+88];
ld.global.nc.f64 %fd20, [%rd10+-8];
ld.global.nc.f64 %fd21, [%rd4+120];
ld.global.nc.f64 %fd22, [%rd10+8];
ld.global.nc.f64 %fd23, [%rd4+136];
ld.global.nc.f64 %fd24, [%rd18];
ld.global.nc.f64 %fd25, [%rd4+24];
ld.global.nc.f64 %fd26, [%rd18+16];
ld.global.nc.f64 %fd27, [%rd4+40];
ld.global.nc.f64 %fd28, [%rd14+-8];
ld.global.nc.f64 %fd29, [%rd4+168];
ld.global.nc.f64 %fd30, [%rd14+8];
ld.global.nc.f64 %fd31, [%rd4+184];
ld.global.nc.f64 %fd32, [%rd20];
ld.global.nc.f64 %fd33, [%rd4+8];
ld.global.nc.f64 %fd34, [%rd22];
ld.global.nc.f64 %fd35, [%rd4+56];
ld.global.nc.f64 %fd36, [%rd24];
ld.global.nc.f64 %fd37, [%rd4+152];
ld.global.nc.f64 %fd38, [%rd25];
ld.global.nc.f64 %fd39, [%rd4+200];
ld.global.nc.f64 %fd40, [%rd27];
ld.global.nc.f64 %fd41, [%rd4];
ld.global.nc.f64 %fd42, [%rd27+16];
ld.global.nc.f64 %fd43, [%rd4+16];
ld.global.nc.f64 %fd44, [%rd22+-8];
ld.global.nc.f64 %fd45, [%rd4+48];
ld.global.nc.f64 %fd46, [%rd22+8];
ld.global.nc.f64 %fd47, [%rd4+64];
ld.global.nc.f64 %fd48, [%rd29];
ld.global.nc.f64 %fd49, [%rd4+144];
ld.global.nc.f64 %fd50, [%rd29+16];
ld.global.nc.f64 %fd51, [%rd4+160];
ld.global.nc.f64 %fd52, [%rd25+-8];
ld.global.nc.f64 %fd53, [%rd4+192];
ld.global.nc.f64 %fd54, [%rd25+8];
ld.global.nc.f64 %fd55, [%rd4+208];
fma.rn.f64 %fd56, %fd4, %fd5, %fd3;
fma.rn.f64 %fd57, %fd6, %fd7, %fd56;
fma.rn.f64 %fd58, %fd8, %fd9, %fd57;
fma.rn.f64 %fd59, %fd10, %fd11, %fd58;
fma.rn.f64 %fd60, %fd12, %fd13, %fd59;
fma.rn.f64 %fd61, %fd14, %fd15, %fd60;
fma.rn.f64 %fd62, %fd16, %fd17, %fd61;
fma.rn.f64 %fd63, %fd18, %fd19, %fd62;
fma.rn.f64 %fd64, %fd20, %fd21, %fd63;
fma.rn.f64 %fd65, %fd22, %fd23, %fd64;
fma.rn.f64 %fd66, %fd24, %fd25, %fd65;
fma.rn.f64 %fd67, %fd26, %fd27, %fd66;
fma.rn.f64 %fd68, %fd28, %fd29, %fd67;
fma.rn.f64 %fd69, %fd30, %fd31, %fd68;
fma.rn.f64 %fd70, %fd32, %fd33, %fd69;
fma.rn.f64 %fd71, %fd34, %fd35, %fd70;
fma.rn.f64 %fd72, %fd36, %fd37, %fd71;
fma.rn.f64 %fd73, %fd38, %fd39, %fd72;
fma.rn.f64 %fd74, %fd40, %fd41, %fd73;
fma.rn.f64 %fd75, %fd42, %fd43, %fd74;
fma.rn.f64 %fd76, %fd44, %fd45, %fd75;
fma.rn.f64 %fd77, %fd46, %fd47, %fd76;
fma.rn.f64 %fd78, %fd48, %fd49, %fd77;
fma.rn.f64 %fd79, %fd50, %fd51, %fd78;
fma.rn.f64 %fd80, %fd52, %fd53, %fd79;
fma.rn.f64 %fd81, %fd54, %fd55, %fd80;
ld.global.nc.f64 %fd82, [%rd30];
sub.f64 %fd83, %fd82, %fd81;
add.s64 %rd31, %rd3, %rd5;
st.global.f64 [%rd31], %fd83;
$L__BB63_2:
ret;
}
// .globl residual_27point_fixed_stencil_coeffs_preprocessed
.entry residual_27point_fixed_stencil_coeffs_preprocessed(
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_preprocessed_param_0,
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_preprocessed_param_1,
.param .u64 .ptr .global .align 8 residual_27point_fixed_stencil_coeffs_preprocessed_param_2,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_3,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_4,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_5,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_6,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_7,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_8,
.param .u32 residual_27point_fixed_stencil_coeffs_preprocessed_param_9
)
{
.reg .pred %p<12>;
.reg .b32 %r<43>;
.reg .f64 %fd<56>;
.reg .b64 %rd<31>;
ld.param.u64 %rd1, [residual_27point_fixed_stencil_coeffs_preprocessed_param_0];
ld.param.u64 %rd2, [residual_27point_fixed_stencil_coeffs_preprocessed_param_1];
ld.param.u64 %rd3, [residual_27point_fixed_stencil_coeffs_preprocessed_param_2];
ld.param.u32 %r6, [residual_27point_fixed_stencil_coeffs_preprocessed_param_3];
ld.param.u32 %r7, [residual_27point_fixed_stencil_coeffs_preprocessed_param_4];
ld.param.u32 %r5, [residual_27point_fixed_stencil_coeffs_preprocessed_param_5];
ld.param.u32 %r8, [residual_27point_fixed_stencil_coeffs_preprocessed_param_6];
ld.param.u32 %r9, [residual_27point_fixed_stencil_coeffs_preprocessed_param_7];
ld.param.u32 %r10, [residual_27point_fixed_stencil_coeffs_preprocessed_param_8];
ld.param.u32 %r11, [residual_27point_fixed_stencil_coeffs_preprocessed_param_9];
mov.b32 %r12, %envreg3;
mov.u32 %r13, %ctaid.x;
mov.u32 %r14, %ntid.x;
mov.u32 %r15, %tid.x;
add.s32 %r16, %r15, %r12;
mad.lo.s32 %r17, %r14, %r13, %r16;
mul.lo.s32 %r1, %r5, %r7;
div.s32 %r18, %r17, %r1;
mul.lo.s32 %r2, %r18, %r1;
sub.s32 %r19, %r17, %r2;
div.s32 %r3, %r19, %r5;
rem.s32 %r4, %r17, %r5;
sub.s32 %r20, %r6, %r8;
sub.s32 %r21, %r20, %r9;
sub.s32 %r22, %r7, %r8;
sub.s32 %r23, %r22, %r10;
sub.s32 %r24, %r5, %r8;
sub.s32 %r25, %r24, %r11;
add.s32 %r26, %r9, %r8;
setp.lt.s32 %p1, %r18, %r26;
add.s32 %r27, %r10, %r8;
setp.lt.s32 %p2, %r3, %r27;
or.pred %p3, %p1, %p2;
add.s32 %r28, %r11, %r8;
setp.lt.s32 %p4, %r4, %r28;
or.pred %p5, %p3, %p4;
setp.ge.s32 %p6, %r18, %r21;
or.pred %p7, %p6, %p5;
setp.ge.s32 %p8, %r3, %r23;
or.pred %p9, %p8, %p7;
setp.ge.s32 %p10, %r4, %r25;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB64_2;
mad.lo.s32 %r29, %r3, %r5, %r2;
add.s32 %r30, %r29, %r4;
mul.wide.s32 %rd4, %r30, 8;
add.s64 %rd5, %rd1, %rd4;
sub.s32 %r31, %r30, %r5;
mul.wide.s32 %rd6, %r31, 8;
add.s64 %rd7, %rd1, %rd6;
mul.wide.s32 %rd8, %r5, 8;
add.s64 %rd9, %rd5, %rd8;
sub.s32 %r32, %r30, %r1;
mul.wide.s32 %rd10, %r32, 8;
add.s64 %rd11, %rd1, %rd10;
add.s32 %r33, %r30, %r1;
mul.wide.s32 %rd12, %r1, 8;
add.s64 %rd13, %rd5, %rd12;
not.b32 %r34, %r5;
add.s32 %r35, %r30, %r34;
mul.wide.s32 %rd14, %r35, 8;
add.s64 %rd15, %rd1, %rd14;
not.b32 %r36, %r1;
add.s32 %r37, %r30, %r36;
mul.wide.s32 %rd16, %r37, 8;
add.s64 %rd17, %rd1, %rd16;
sub.s32 %r38, %r32, %r5;
mul.wide.s32 %rd18, %r38, 8;
add.s64 %rd19, %rd1, %rd18;
add.s32 %r39, %r32, %r5;
mul.wide.s32 %rd20, %r39, 8;
add.s64 %rd21, %rd1, %rd20;
sub.s32 %r40, %r33, %r5;
mul.wide.s32 %rd22, %r40, 8;
add.s64 %rd23, %rd1, %rd22;
add.s64 %rd24, %rd13, %rd8;
add.s32 %r41, %r38, 1;
mul.wide.s32 %rd25, %r41, 8;
add.s64 %rd26, %rd1, %rd25;
add.s32 %r42, %r33, %r34;
mul.wide.s32 %rd27, %r42, 8;
add.s64 %rd28, %rd1, %rd27;
add.s64 %rd29, %rd2, %rd4;
ld.global.nc.f64 %fd1, [%rd24+8];
ld.global.nc.f64 %fd2, [%rd24+-8];
ld.global.nc.f64 %fd3, [%rd28+16];
ld.global.nc.f64 %fd4, [%rd28];
ld.global.nc.f64 %fd5, [%rd21+8];
ld.global.nc.f64 %fd6, [%rd21+-8];
ld.global.nc.f64 %fd7, [%rd26];
ld.global.nc.f64 %fd8, [%rd24];
ld.global.nc.f64 %fd9, [%rd23];
ld.global.nc.f64 %fd10, [%rd21];
ld.global.nc.f64 %fd11, [%rd13+8];
ld.global.nc.f64 %fd12, [%rd13+-8];
ld.global.nc.f64 %fd13, [%rd17+16];
ld.global.nc.f64 %fd14, [%rd17];
ld.global.nc.f64 %fd15, [%rd9+8];
ld.global.nc.f64 %fd16, [%rd9+-8];
ld.global.nc.f64 %fd17, [%rd15+16];
ld.global.nc.f64 %fd18, [%rd15];
ld.global.nc.f64 %fd19, [%rd13];
ld.global.nc.f64 %fd20, [%rd11];
ld.global.nc.f64 %fd21, [%rd9];
ld.global.nc.f64 %fd22, [%rd7];
ld.global.nc.f64 %fd23, [%rd5+8];
ld.global.nc.f64 %fd24, [%rd5];
mul.f64 %fd25, %fd24, 0dC02A000000000000;
ld.global.nc.f64 %fd26, [%rd5+-8];
fma.rn.f64 %fd27, %fd26, 0dC028000000000000, %fd25;
fma.rn.f64 %fd28, %fd23, 0dC02C000000000000, %fd27;
fma.rn.f64 %fd29, %fd22, 0dC024000000000000, %fd28;
fma.rn.f64 %fd30, %fd21, 0dC030000000000000, %fd29;
fma.rn.f64 %fd31, %fd20, 0dC010000000000000, %fd30;
fma.rn.f64 %fd32, %fd19, 0dC036000000000000, %fd31;
fma.rn.f64 %fd33, %fd18, 0dC022000000000000, %fd32;
fma.rn.f64 %fd34, %fd17, 0dC026000000000000, %fd33;
fma.rn.f64 %fd35, %fd16, 0dC02E000000000000, %fd34;
fma.rn.f64 %fd36, %fd15, 0dC031000000000000, %fd35;
fma.rn.f64 %fd37, %fd14, 0dC008000000000000, %fd36;
fma.rn.f64 %fd38, %fd13, 0dC014000000000000, %fd37;
fma.rn.f64 %fd39, %fd12, 0dC035000000000000, %fd38;
fma.rn.f64 %fd40, %fd11, 0dC037000000000000, %fd39;
ld.global.nc.f64 %fd41, [%rd19];
sub.f64 %fd42, %fd40, %fd41;
fma.rn.f64 %fd43, %fd10, 0dC01C000000000000, %fd42;
fma.rn.f64 %fd44, %fd9, 0dC033000000000000, %fd43;
fma.rn.f64 %fd45, %fd8, 0dC039000000000000, %fd44;
add.f64 %fd46, %fd7, %fd7;
sub.f64 %fd47, %fd45, %fd46;
fma.rn.f64 %fd48, %fd6, 0dC018000000000000, %fd47;
fma.rn.f64 %fd49, %fd5, 0dC020000000000000, %fd48;
fma.rn.f64 %fd50, %fd4, 0dC032000000000000, %fd49;
fma.rn.f64 %fd51, %fd3, 0dC034000000000000, %fd50;
fma.rn.f64 %fd52, %fd2, 0dC038000000000000, %fd51;
fma.rn.f64 %fd53, %fd1, 0dC03A000000000000, %fd52;
ld.global.nc.f64 %fd54, [%rd29];
add.f64 %fd55, %fd53, %fd54;
add.s64 %rd30, %rd3, %rd4;
st.global.f64 [%rd30], %fd55;
$L__BB64_2:
ret;
}
SASS showed in Nsight Compute
residual_27point_fixed_stencil_coeffs_as_global_buffer
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28]
UMOV UR4, 0x0
IMAD.MOV.U32 R59, RZ, RZ, c[0x0][0x188]
S2R R0, SR_CTAID.X
S2R R7, SR_TID.X
ULDC.64 UR4, c[0x4][UR4]
IABS R11, c[0x0][0x188]
ULDC.64 UR10, c[0x0][0x188]
LDG.E.SYS R3, [UR4]
IMAD R2, R59, c[0x0][0x184], RZ
ULDC.64 UR4, c[0x0][0x190]
UIADD3 UR6, UR11, UR5, URZ
IABS R9, R2
UIADD3 UR4, UR11, UR4, URZ
ULDC UR5, c[0x0][0x198]
UIADD3 UR5, UR11, UR5, URZ
I2F.RP R6, R9
ULDC.64 UR8, c[0x0][0x180]
IMAD R0, R0, c[0x0][0x0], R7
MUFU.RCP R6, R6
IADD3 R4, R6, 0xffffffe, RZ
IABS R6, R2
F2I.FTZ.U32.TRUNC.NTZ R5, R4
IMAD.MOV.U32 R4, RZ, RZ, RZ
IMAD.MOV R8, RZ, RZ, -R5
IMAD R7, R8, R9, RZ
IMAD.HI.U32 R4, R5, R7, R4
IMAD.MOV R5, RZ, RZ, -R6
I2F.RP R6, R11
MUFU.RCP R6, R6
IADD3 R3, R0, R3, RZ
IABS R0, R3
IMAD.MOV.U32 R7, RZ, RZ, R0
MOV R0, R5
IADD3 R5, R6, 0xffffffe, RZ
IMAD.HI.U32 R4, R4, R7, RZ
IMAD R0, R4, R0, R7
F2I.FTZ.U32.TRUNC.NTZ R5, R5
ISETP.GT.U32.AND P1, PT, R9, R0, PT
@!P1 IMAD.IADD R0, R0, 0x1, -R9
@!P1 IADD3 R4, R4, 0x1, RZ
ISETP.NE.AND P1, PT, R2, RZ, PT
ISETP.GE.U32.AND P0, PT, R0, R9, PT
LOP3.LUT R0, R3, R2, RZ, 0x3c, !PT
ISETP.GE.AND P2, PT, R0, RZ, PT
@P0 IADD3 R4, R4, 0x1, RZ
IMAD.MOV.U32 R13, RZ, RZ, R4
IMAD.MOV R4, RZ, RZ, -R5
@!P2 IADD3 R13, -R13, RZ, RZ
IMAD R9, R4, R11, RZ
MOV R4, RZ
@!P1 LOP3.LUT R13, RZ, R2, RZ, 0x33, !PT
IMAD.HI.U32 R4, R5, R9, R4
IMAD R12, R2, R13, RZ
IMAD.IADD R0, R3, 0x1, -R12
IABS R6, R0
LOP3.LUT R0, R0, c[0x0][0x188], RZ, 0x3c, !PT
IMAD.HI.U32 R5, R4, R6, RZ
IMAD.HI.U32 R4, R4, R7, RZ
IMAD.MOV R8, RZ, RZ, -R5
IMAD.MOV R10, RZ, RZ, -R4
IMAD R4, R11, R8, R6
IMAD R6, R11, R10, R7
ISETP.GT.U32.AND P1, PT, R11, R4, PT
ISETP.GT.U32.AND P0, PT, R11, R6, PT
@!P1 IADD3 R4, R4, -R11, RZ
@!P0 IMAD.IADD R6, R6, 0x1, -R11
ISETP.GE.AND P0, PT, R0, RZ, PT
ISETP.GE.U32.AND P3, PT, R4, R11, PT
@!P1 IADD3 R5, R5, 0x1, RZ
ISETP.GT.U32.AND P2, PT, R11, R6, PT
ISETP.NE.AND P1, PT, RZ, c[0x0][0x188], PT
LOP3.LUT R0, RZ, c[0x0][0x188], RZ, 0x33, !PT
@P3 IADD3 R5, R5, 0x1, RZ
ISETP.GE.AND P3, PT, R3, RZ, PT
@!P0 IADD3 R5, -R5, RZ, RZ
@!P2 IMAD.IADD R6, R6, 0x1, -R11
SEL R3, R0, R5, !P1
ISETP.GE.AND P0, PT, R3, UR6, PT
UIADD3 UR6, -UR6, UR9, URZ
@!P3 IMAD.MOV R6, RZ, RZ, -R6
ISETP.LT.OR P0, PT, R13, UR4, !P0
UIADD3 UR4, -UR4, UR8, URZ
SEL R6, R0, R6, !P1
ISETP.LT.OR P0, PT, R6, UR5, P0
UIADD3 UR5, -UR5, UR10, URZ
ISETP.GE.OR P0, PT, R13, UR4, P0
ISETP.GE.OR P0, PT, R3, UR6, P0
ISETP.GE.OR P0, PT, R6, UR5, P0
@P0 EXIT
IMAD R3, R3, c[0x0][0x188], R12
ULDC.64 UR4, c[0x0][0x178]
IMAD.MOV.U32 R0, RZ, RZ, 0x8
LDG.E.64.CONSTANT.SYS R48, [UR4+0xb0]
IADD3 R3, R6, R3, RZ
LDG.E.64.CONSTANT.SYS R54, [UR4+0x58]
IADD3 R61, R3, -c[0x0][0x188], RZ
IMAD.WIDE R6, R3, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R56, [UR4+0x18]
LDG.E.64.CONSTANT.SYS R12, [UR4+0x68]
IMAD.WIDE R8, R2, 0x8, R6
LDG.E.64.CONSTANT.SYS R14, [R6]
IMAD.WIDE R60, R61, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R18, [UR4+0x8]
LDG.E.64.CONSTANT.SYS R44, [R8]
LDG.E.64.CONSTANT.SYS R50, [R60+0x8]
IMAD.IADD R5, R3, 0x1, -R2
IMAD.WIDE R20, R59, 0x8, R6
LDG.E.64.CONSTANT.SYS R24, [UR4+0x80]
IMAD.WIDE R10, R5, R0, c[0x0][0x160]
IADD3 R17, R5, -c[0x0][0x188], RZ
LDG.E.64.CONSTANT.SYS R28, [UR4+0x28]
LDG.E.64.CONSTANT.SYS R26, [R20]
LDG.E.64.CONSTANT.SYS R52, [R10+-0x8]
IMAD.WIDE R16, R17, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R30, [R10+0x8]
LDG.E.64.CONSTANT.SYS R32, [UR4+0x48]
LDG.E.64.CONSTANT.SYS R22, [R16]
LDG.E.64.CONSTANT.SYS R34, [R60+-0x8]
LDG.E.64.CONSTANT.SYS R36, [UR4+0xa8]
LDG.E.64.CONSTANT.SYS R38, [R8+-0x8]
LDG.E.64.CONSTANT.SYS R40, [R60]
LDG.E.64.CONSTANT.SYS R42, [UR4+0x50]
LDG.E.64.CONSTANT.SYS R46, [R20+0x8]
LDG.E.64.CONSTANT.SYS R10, [R10]
DMUL R48, R48, R44
LDG.E.64.CONSTANT.SYS R44, [UR4+0x88]
DFMA R54, R50, R54, R48
LDG.E.64.CONSTANT.SYS R48, [UR4+0x60]
LDG.E.64.CONSTANT.SYS R50, [R6+-0x8]
DFMA R52, R56, R52, R54
LDG.E.64.CONSTANT.SYS R54, [UR4+0xb8]
LDG.E.64.CONSTANT.SYS R56, [R8+0x8]
DFMA R12, R14, R12, R52
LDG.E.64.CONSTANT.SYS R14, [R20+-0x8]
LDG.E.64.CONSTANT.SYS R52, [UR4+0x78]
DFMA R18, R18, R22, R12
LDG.E.64.CONSTANT.SYS R12, [UR4+0x20]
IADD3 R5, R5, c[0x0][0x188], RZ
LDG.E.64.CONSTANT.SYS R6, [R6+0x8]
DFMA R24, R26, R24, R18
LDG.E.64.CONSTANT.SYS R18, [UR4+0x70]
IMAD.WIDE R4, R5, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R22, [UR4]
DFMA R28, R28, R30, R24
LDG.E.64.CONSTANT.SYS R20, [R16+-0x8]
IADD3 R25, R3, -c[0x0][0x188], R2
LDG.E.64.CONSTANT.SYS R26, [UR4+0x40]
IMAD.WIDE R24, R25, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R30, [R4+0x8]
DFMA R34, R34, R32, R28
LDG.E.64.CONSTANT.SYS R28, [UR4+0x98]
LDG.E.64.CONSTANT.SYS R32, [R24]
DFMA R38, R36, R38, R34
LDG.E.64.CONSTANT.SYS R34, [UR4+0x30]
LDG.E.64.CONSTANT.SYS R36, [R4+-0x8]
DFMA R40, R40, R42, R38
LDG.E.64.CONSTANT.SYS R16, [R16+0x8]
LDG.E.64.CONSTANT.SYS R38, [UR4+0x10]
IMAD.WIDE R8, R59, 0x8, R8
LDG.E.64.CONSTANT.SYS R42, [R24+-0x8]
LDG.E.64.CONSTANT.SYS R4, [R4]
LDG.E.64.CONSTANT.SYS R60, [R8+-0x8]
LDG.E.64.CONSTANT.SYS R58, [UR4+0xc0]
DFMA R44, R44, R46, R40
LDG.E.64.CONSTANT.SYS R40, [UR4+0x90]
LDG.E.64.CONSTANT.SYS R46, [R8]
DFMA R50, R50, R48, R44
LDG.E.64.CONSTANT.SYS R44, [UR4+0xc8]
LDG.E.64.CONSTANT.SYS R48, [UR4+0x38]
DFMA R56, R54, R56, R50
LDG.E.64.CONSTANT.SYS R50, [UR4+0xa0]
LDG.E.64.CONSTANT.SYS R54, [R24+0x8]
DFMA R56, R52, R14, R56
LDG.E.64.CONSTANT.SYS R14, [UR4+0xd0]
LDG.E.64.CONSTANT.SYS R52, [R8+0x8]
IMAD.WIDE R24, R3, R0, c[0x0][0x168]
LDG.E.64.CONSTANT.SYS R24, [R24]
DFMA R10, R12, R10, R56
DFMA R6, R6, R18, R10
DFMA R6, R22, R20, R6
DFMA R6, R26, R30, R6
DFMA R6, R28, R32, R6
DFMA R6, R34, R36, R6
DFMA R6, R38, R16, R6
IMAD.WIDE R2, R3, R0, c[0x0][0x170]
DFMA R6, R40, R42, R6
DFMA R6, R44, R46, R6
DFMA R4, R48, R4, R6
DFMA R4, R50, R54, R4
DFMA R4, R14, R52, R4
DFMA R4, R58, R60, R4
DADD R4, R24, -R4
STG.E.64.SYS [R2], R4
EXIT
BRA 0x7fee37efb130
NOP
NOP
NOP
NOP
residual_27point_fixed_stencil_coeffs_preprocessed
IMAD.MOV.U32 R1, RZ, RZ, c[0x0][0x28]
UMOV UR4, 0x0
IMAD.MOV.U32 R51, RZ, RZ, c[0x0][0x180]
S2R R0, SR_CTAID.X
S2R R7, SR_TID.X
ULDC.64 UR4, c[0x4][UR4]
IABS R11, c[0x0][0x180]
ULDC.64 UR10, c[0x0][0x180]
LDG.E.SYS R5, [UR4]
IMAD R8, R51, c[0x0][0x17c], RZ
ULDC.64 UR4, c[0x0][0x188]
UIADD3 UR6, UR11, UR5, URZ
IABS R9, R8
UIADD3 UR4, UR11, UR4, URZ
ULDC UR5, c[0x0][0x190]
UIADD3 UR5, UR11, UR5, URZ
I2F.RP R4, R9
ULDC.64 UR8, c[0x0][0x178]
IMAD R0, R0, c[0x0][0x0], R7
MUFU.RCP R4, R4
IADD3 R2, R4, 0xffffffe, RZ
IABS R4, R8
F2I.FTZ.U32.TRUNC.NTZ R3, R2
IMAD.MOV.U32 R2, RZ, RZ, RZ
IMAD.MOV R6, RZ, RZ, -R3
IMAD R7, R6, R9, RZ
IMAD.HI.U32 R2, R3, R7, R2
IMAD.MOV R3, RZ, RZ, -R4
I2F.RP R4, R11
MUFU.RCP R4, R4
IADD3 R5, R0, R5, RZ
IABS R0, R5
IMAD.MOV.U32 R7, RZ, RZ, R0
MOV R0, R3
IADD3 R3, R4, 0xffffffe, RZ
IMAD.HI.U32 R2, R2, R7, RZ
IMAD R0, R2, R0, R7
F2I.FTZ.U32.TRUNC.NTZ R3, R3
ISETP.GT.U32.AND P1, PT, R9, R0, PT
@!P1 IMAD.IADD R0, R0, 0x1, -R9
@!P1 IADD3 R2, R2, 0x1, RZ
ISETP.NE.AND P1, PT, R8, RZ, PT
ISETP.GE.U32.AND P0, PT, R0, R9, PT
LOP3.LUT R0, R5, R8, RZ, 0x3c, !PT
ISETP.GE.AND P2, PT, R0, RZ, PT
@P0 IADD3 R2, R2, 0x1, RZ
IMAD.MOV.U32 R13, RZ, RZ, R2
IMAD.MOV R2, RZ, RZ, -R3
@!P2 IADD3 R13, -R13, RZ, RZ
IMAD R9, R2, R11, RZ
MOV R2, RZ
@!P1 LOP3.LUT R13, RZ, R8, RZ, 0x33, !PT
IMAD.HI.U32 R2, R3, R9, R2
IMAD R12, R8, R13, RZ
IMAD.IADD R0, R5, 0x1, -R12
IABS R4, R0
LOP3.LUT R0, R0, c[0x0][0x180], RZ, 0x3c, !PT
IMAD.HI.U32 R3, R2, R4, RZ
IMAD.HI.U32 R2, R2, R7, RZ
IMAD.MOV R6, RZ, RZ, -R3
IMAD.MOV R10, RZ, RZ, -R2
IMAD R2, R11, R6, R4
IMAD R4, R11, R10, R7
ISETP.GT.U32.AND P1, PT, R11, R2, PT
ISETP.GT.U32.AND P0, PT, R11, R4, PT
@!P1 IADD3 R2, R2, -R11, RZ
@!P0 IMAD.IADD R4, R4, 0x1, -R11
ISETP.GE.AND P0, PT, R0, RZ, PT
ISETP.GE.U32.AND P3, PT, R2, R11, PT
@!P1 IADD3 R3, R3, 0x1, RZ
ISETP.GT.U32.AND P2, PT, R11, R4, PT
ISETP.NE.AND P1, PT, RZ, c[0x0][0x180], PT
LOP3.LUT R0, RZ, c[0x0][0x180], RZ, 0x33, !PT
@P3 IADD3 R3, R3, 0x1, RZ
ISETP.GE.AND P3, PT, R5, RZ, PT
@!P0 IADD3 R3, -R3, RZ, RZ
@!P2 IMAD.IADD R4, R4, 0x1, -R11
SEL R3, R0, R3, !P1
ISETP.GE.AND P0, PT, R3, UR6, PT
UIADD3 UR6, -UR6, UR9, URZ
@!P3 IMAD.MOV R4, RZ, RZ, -R4
ISETP.LT.OR P0, PT, R13, UR4, !P0
UIADD3 UR4, -UR4, UR8, URZ
SEL R0, R0, R4, !P1
ISETP.LT.OR P0, PT, R0, UR5, P0
UIADD3 UR5, -UR5, UR10, URZ
ISETP.GE.OR P0, PT, R13, UR4, P0
ISETP.GE.OR P0, PT, R3, UR6, P0
ISETP.GE.OR P0, PT, R0, UR5, P0
@P0 EXIT
IMAD R53, R3, c[0x0][0x180], R12
IMAD.IADD R53, R0, 0x1, R53
MOV R0, 0x8
IADD3 R17, R53, -c[0x0][0x180], RZ
IMAD.WIDE R16, R17, R0, c[0x0][0x160]
IMAD.IADD R9, R53, 0x1, -R8
LDG.E.64.CONSTANT.SYS R4, [R16+-0x8]
IMAD.WIDE R12, R9, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R48, [R16]
IMAD.WIDE R14, R53, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R46, [R12]
IMAD.WIDE R54, R8, 0x8, R14
LDG.E.64.CONSTANT.SYS R44, [R14+-0x8]
IMAD.WIDE R2, R53, R0, c[0x0][0x168]
LDG.E.64.CONSTANT.SYS R40, [R14+0x8]
LDG.E.64.CONSTANT.SYS R42, [R54]
IMAD.WIDE R20, R51, 0x8, R14
IADD3 R7, R9, -c[0x0][0x180], RZ
LDG.E.64.CONSTANT.SYS R2, [R2]
IMAD.WIDE R50, R51, 0x8, R54
IADD3 R11, R53, -c[0x0][0x180], R8
LDG.E.64.CONSTANT.SYS R32, [R14]
IMAD.WIDE R6, R7, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R36, [R20]
LDG.E.64.CONSTANT.SYS R38, [R50+0x8]
IMAD.WIDE R10, R11, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R34, [R6+0x8]
IADD3 R57, R9, c[0x0][0x180], RZ
LDG.E.64.CONSTANT.SYS R28, [R12+-0x8]
LDG.E.64.CONSTANT.SYS R30, [R10+-0x8]
IMAD.WIDE R56, R57, R0, c[0x0][0x160]
LDG.E.64.CONSTANT.SYS R26, [R10+0x8]
LDG.E.64.CONSTANT.SYS R24, [R20+-0x8]
LDG.E.64.CONSTANT.SYS R22, [R56]
LDG.E.64.CONSTANT.SYS R18, [R50]
LDG.E.64.CONSTANT.SYS R20, [R20+0x8]
LDG.E.64.CONSTANT.SYS R16, [R16+0x8]
LDG.E.64.CONSTANT.SYS R14, [R56+0x8]
LDG.E.64.CONSTANT.SYS R12, [R12+0x8]
LDG.E.64.CONSTANT.SYS R10, [R10]
LDG.E.64.CONSTANT.SYS R8, [R54+0x8]
LDG.E.64.CONSTANT.SYS R58, [R56+-0x8]
LDG.E.64.CONSTANT.SYS R60, [R54+-0x8]
LDG.E.64.CONSTANT.SYS R50, [R50+-0x8]
LDG.E.64.CONSTANT.SYS R6, [R6]
DMUL R4, R4, -9
DFMA R4, R48, -10, R4
DFMA R4, R46, -4, R4
DFMA R4, R44, -12, R4
DFMA R4, R42, -22, R4
DFMA R4, R40, -14, R4
DFMA R2, R38, -26, R2
DFMA R4, R36, -16, R4
DFMA R2, R34, -2, R2
DFMA R4, R32, -13, R4
DFMA R2, R30, -18, R2
DFMA R4, R28, -3, R4
DFMA R2, R26, -20, R2
DFMA R4, R24, -15, R4
DFMA R2, R22, -7, R2
DFMA R4, R20, -17, R4
DFMA R2, R18, -25, R2
DFMA R4, R16, -11, R4
DFMA R2, R14, -8, R2
DFMA R4, R12, -5, R4
DFMA R2, R10, -19, R2
DFMA R4, R8, -23, R4
DFMA R2, R58, -6, R2
DFMA R4, R60, -21, R4
DFMA R2, R50, -24, R2
DADD R4, -R6, R4
DADD R2, R4, R2
IMAD.WIDE R4, R53, R0, c[0x0][0x170]
STG.E.64.SYS [R4], R2
EXIT
BRA 0x7fee37efc450
NOP
NOP
Edits: Added comments for clarification of which kernel is version a and which is version b in OpenCL source code.