Hi all,
I have quite a specific question this time.
I am using the following kernel on the device:
//***********************************************************
#pragma acc parallel loop vector_length(_vec_len) pcopyin(data_1[0:d1_size], u[0:v_size], dob[0:dob_size]) pcopy(v[0:v_size])
for (int ii = 0; ii < v_size; ii += _vec_len) {
    const int lml    = (dob[ii / _vec_len + 1] - dob[ii / _vec_len]) / _vec_len;
    const int stride = dob[ii / _vec_len];
    const int kk_max = _vec_len + ii < v_size ? _vec_len + ii : v_size; // clamp for the last block
    #pragma acc loop independent
    for (int kk = ii; kk < kk_max; ++kk) {
        S s = 0.0;
        #pragma acc loop seq
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + kk - ii + jj * _vec_len;
            const int q = (int)data_1[index];        // the cast in question
            s += data_1[index + _vec_len] * u[q];
        }
        v[kk] = s;
    }
}
//***********************************************************
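For context, the interface of this routine looks roughly like this (only a sketch, not my exact declarations; the scalar type S is double in this case):
//***********************************************************
// Rough shape of the declarations (sketch only; exact types/qualifiers may differ).
void matrix_vector(const double * __restrict data_1, // values, with the column indices stored as doubles
                   const double * __restrict u,      // input vector
                   const int    * __restrict dob,    // block offsets, the only integer array
                   double       * __restrict v,      // output vector (S = double here)
                   int d1_size, int dob_size, int v_size);
//***********************************************************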
First of all, the code works! My question concerns this line:
const int q = (int)data_1[index];
At this line I cast a double to an integer value. My application is quite specific, and I expected to get better performance by casting this value locally. Unfortunately, my performance decreases :-(
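To make clear what I mean by "casting locally", here is a stripped-down comparison of just the inner-loop body: the version from the kernel above versus a version without a device-side cast, where the index would come from a separate integer array (the array name idx_1 is hypothetical, purely for illustration):
//***********************************************************
// Variant with the local cast (as in the kernel above): the column index
// is stored as a double inside data_1 and is cast to int in the thread.
const int q = (int)data_1[index];
s += data_1[index + _vec_len] * u[q];

// Variant without a device-side cast (illustration only): the index would
// come from a separate integer array, here the hypothetical idx_1.
const int q2 = idx_1[index];
s += data_1[index + _vec_len] * u[q2];
//***********************************************************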
I looked at the PTX code (generated via -ta=tesla:cc60,fastmath,keepptx) and tried to find the point where the cast happens.
Elsewhere I have seen that a cast in PTX looks like (.s64)(.u64), but I can't find any such cast in the PTX for this kernel!
Here is my question: Is it possible that the compiler realizes that the value I cast is addressed with regular arithmetic (the pointer advances by _vec_len in every loop iteration), and therefore converts these elements to integers automatically, before the value is sent to the thread?
If so, can I force the compiler to send a double and cast it locally?
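For illustration, something like the following is what I have in mind (only a sketch of the inner loop; I don't know whether writing it this way would actually change what the compiler generates):
//***********************************************************
#pragma acc loop seq
for (int jj = 0; jj < lml; jj += 2) {
    const int index = stride + kk - ii + jj * _vec_len;
    const double raw = data_1[index]; // make sure a plain double is loaded ...
    const int q = (int)raw;           // ... and do the cast locally, per thread
    s += data_1[index + _vec_len] * u[q];
}
//***********************************************************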
Best,
Stefan
PS:
- All pointers and elements of my kernel are constant, except v and s, and every pointer is __restrict! “dob” is the only integer array!
- I have attached the PTX output for this kernel below:
// .weak _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu
.weak .entry _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu(
.param .u32 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_0,
.param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_1,
.param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_2,
.param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_3,
.param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_4,
.param .u32 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_5
)
.maxntid 128, 1, 1
{
.reg .pred %p<11>;
.reg .b32 %r<119>;
.reg .f64 %fd<33>;
.reg .b64 %rd<35>;
ld.param.u32 %r36, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_0];
ld.param.u64 %rd2, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_1];
ld.param.u64 %rd3, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_2];
ld.param.u64 %rd4, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_3];
ld.param.u64 %rd5, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_4];
ld.param.u32 %r33, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_5];
neg.s32 %r107, %r36;
mov.u32 %r37, %ctaid.x;
shl.b32 %r110, %r37, 7;
mov.u32 %r38, 1;
sub.s32 %r39, %r38, %r36;
add.s32 %r108, %r39, %r37;
mov.u32 %r109, 0;
mov.u32 %r106, %r109;
cvta.to.global.u64 %rd1, %rd2;
cvta.to.global.u64 %rd33, %rd3;
BB57_1:
setp.gt.s32 %p1, %r108, 0;
@%p1 bra BB57_12;
add.s32 %r40, %r109, %r37;
shl.b32 %r41, %r40, 7;
bfe.s32 %r42, %r40, 24, 1;
shr.u32 %r43, %r42, 25;
add.s32 %r44, %r41, %r43;
shr.s32 %r45, %r44, 7;
shl.b32 %r46, %r45, 2;
add.s32 %r47, %r46, 4;
cvt.s64.s32 %rd6, %r47;
add.s64 %rd7, %rd1, %rd6;
ld.global.nc.u32 %r48, [%rd7+-4];
ld.global.nc.u32 %r49, [%rd7];
sub.s32 %r50, %r49, %r48;
shr.s32 %r51, %r50, 31;
shr.u32 %r52, %r51, 24;
add.s32 %r53, %r50, %r52;
shr.s32 %r54, %r53, 8;
shl.b32 %r10, %r54, 1;
shl.b32 %r56, %r37, 7;
add.s32 %r57, %r56, %r106;
add.s32 %r58, %r57, 128;
min.s32 %r11, %r33, %r58;
sub.s32 %r12, %r41, %r11;
setp.gt.s32 %p2, %r12, -1;
@%p2 bra BB57_12;
shr.s32 %r59, %r110, 31;
shr.u32 %r60, %r59, 25;
add.s32 %r61, %r110, %r60;
shr.s32 %r62, %r61, 7;
shl.b32 %r63, %r62, 2;
cvt.s64.s32 %rd8, %r63;
add.s64 %rd9, %rd1, %rd8;
ld.global.nc.u32 %r13, [%rd9];
add.s32 %r64, %r10, 1;
shr.u32 %r65, %r64, 31;
add.s32 %r66, %r64, %r65;
shr.s32 %r14, %r66, 1;
bar.sync 0;
sub.s32 %r15, %r11, %r41;
mov.u32 %r111, 0;
BB57_4:
mov.u32 %r16, %r111;
mov.u32 %r70, %tid.x;
add.s32 %r17, %r70, %r16;
add.s32 %r71, %r17, %r12;
setp.gt.s32 %p3, %r71, -1;
@%p3 bra BB57_11;
setp.lt.s32 %p4, %r10, 1;
mov.f64 %fd30, 0d0000000000000000;
@%p4 bra BB57_10;
setp.lt.s32 %p5, %r64, 8;
add.s32 %r18, %r17, %r13;
mov.u32 %r118, 0;
mov.f64 %fd32, 0d0000000000000000;
mov.u32 %r117, %r118;
mov.f64 %fd31, %fd32;
mov.u32 %r115, %r14;
mov.u32 %r114, %r14;
@%p5 bra BB57_9;
BB57_7:
mov.u32 %r19, %r115;
add.s32 %r75, %r18, %r118;
shl.b32 %r76, %r75, 3;
cvt.s64.s32 %rd10, %r76;
cvta.to.global.u64 %rd11, %rd4;
add.s64 %rd12, %rd11, %rd10;
ld.global.nc.f64 %fd9, [%rd12];
cvt.rzi.s32.f64 %r77, %fd9;
add.s32 %r78, %r75, 128;
shl.b32 %r79, %r78, 3;
cvt.s64.s32 %rd13, %r79;
add.s64 %rd14, %rd11, %rd13;
ld.global.nc.f64 %fd10, [%rd14+1024];
cvt.rzi.s32.f64 %r80, %fd10;
ld.global.nc.f64 %fd11, [%rd14+3072];
cvt.rzi.s32.f64 %r81, %fd11;
ld.global.nc.f64 %fd12, [%rd14+5120];
cvt.rzi.s32.f64 %r82, %fd12;
shl.b32 %r83, %r77, 3;
cvt.s64.s32 %rd15, %r83;
cvta.to.global.u64 %rd16, %rd5;
add.s64 %rd17, %rd16, %rd15;
ld.global.nc.f64 %fd13, [%rd17];
ld.global.nc.f64 %fd14, [%rd14];
fma.rn.f64 %fd15, %fd13, %fd14, %fd32;
shl.b32 %r84, %r80, 3;
cvt.s64.s32 %rd18, %r84;
add.s64 %rd19, %rd16, %rd18;
ld.global.nc.f64 %fd16, [%rd19];
ld.global.nc.f64 %fd17, [%rd14+2048];
fma.rn.f64 %fd18, %fd16, %fd17, %fd15;
shl.b32 %r85, %r81, 3;
cvt.s64.s32 %rd20, %r85;
add.s64 %rd21, %rd16, %rd20;
ld.global.nc.f64 %fd19, [%rd21];
ld.global.nc.f64 %fd20, [%rd14+4096];
fma.rn.f64 %fd21, %fd19, %fd20, %fd18;
shl.b32 %r86, %r82, 3;
cvt.s64.s32 %rd22, %r86;
add.s64 %rd23, %rd16, %rd22;
ld.global.nc.f64 %fd22, [%rd23];
ld.global.nc.f64 %fd23, [%rd14+6144];
fma.rn.f64 %fd32, %fd22, %fd23, %fd21;
add.s32 %r118, %r118, 1024;
add.s32 %r115, %r19, -4;
add.s32 %r87, %r19, -7;
setp.gt.s32 %p6, %r87, 0;
@%p6 bra BB57_7;
setp.lt.s32 %p7, %r115, 1;
mov.u32 %r113, %r115;
mov.u32 %r114, %r113;
mov.u32 %r117, %r118;
mov.f64 %fd30, %fd32;
mov.f64 %fd31, %fd32;
@%p7 bra BB57_10;
BB57_9:
mov.u32 %r23, %r114;
add.s32 %r88, %r18, %r117;
shl.b32 %r89, %r88, 3;
cvt.s64.s32 %rd24, %r89;
cvta.to.global.u64 %rd25, %rd4;
add.s64 %rd26, %rd25, %rd24;
ld.global.nc.f64 %fd24, [%rd26];
cvt.rzi.s32.f64 %r90, %fd24;
add.s32 %r91, %r88, 128;
shl.b32 %r92, %r91, 3;
cvt.s64.s32 %rd27, %r92;
add.s64 %rd28, %rd25, %rd27;
shl.b32 %r93, %r90, 3;
cvt.s64.s32 %rd29, %r93;
cvta.to.global.u64 %rd30, %rd5;
add.s64 %rd31, %rd30, %rd29;
ld.global.nc.f64 %fd25, [%rd31];
ld.global.nc.f64 %fd26, [%rd28];
fma.rn.f64 %fd31, %fd25, %fd26, %fd31;
add.s32 %r117, %r117, 256;
add.s32 %r26, %r23, -1;
setp.gt.s32 %p8, %r26, 0;
mov.u32 %r114, %r26;
mov.f64 %fd30, %fd31;
@%p8 bra BB57_9;
BB57_10:
add.s32 %r97, %r16, %r41;
add.s32 %r99, %r97, %r70;
shl.b32 %r100, %r99, 3;
cvt.s64.s32 %rd32, %r100;
add.s64 %rd34, %rd33, %rd32;
st.global.f64 [%rd34], %fd30;
BB57_11:
add.s32 %r111, %r16, 128;
mov.u32 %r101, -128;
sub.s32 %r102, %r101, %r16;
add.s32 %r103, %r15, %r102;
setp.gt.s32 %p9, %r103, 0;
@%p9 bra BB57_4;
BB57_12:
mov.u32 %r104, %nctaid.x;
add.s32 %r109, %r104, %r109;
shl.b32 %r105, %r104, 7;
add.s32 %r110, %r105, %r110;
add.s32 %r108, %r104, %r108;
add.s32 %r107, %r104, %r107;
add.s32 %r106, %r105, %r106;
setp.lt.s32 %p10, %r107, 0;
@%p10 bra BB57_1;
ret;
}