OpenACC should cast double to integer on the device

Hi all,

I have a quite specific question this time.

I use the following kernel on the device:

//***********************************************************
#pragma acc parallel loop vector_length(_vec_len) pcopyin(data_1[0:d1_size], u[0:v_size], dob[0:dob_size]) pcopy(v[0:v_size])
for (int ii = 0; ii < v_size; ii += _vec_len) {
    const int lml    = (dob[ii / _vec_len + 1] - dob[ii / _vec_len]) / _vec_len;
    const int stride = dob[ii / _vec_len];

    const int kk_max = _vec_len + ii < v_size ? _vec_len + ii : v_size;
#pragma acc loop independent
    for (int kk = ii; kk < kk_max; ++kk) {
        S s = 0.0;
#pragma acc loop seq
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + kk - ii + jj * _vec_len;
            const int q = (int)data_1[index];     // column index, stored as a double
            s += data_1[index + _vec_len] * u[q];
        }
        v[kk] = s;
    }
}
//***********************************************************

First of all, the code works! My question concerns this line:

const int q = (int)data_1[index];

At this line I cast a double to an integer value. My application is quite specific, and I expected to get better performance by casting this value locally on the device. Unfortunately, my performance decreases :-(
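
To make clear what I mean by "casting locally", here is a minimal sketch of the two options I have in mind (dot_packed, dot_split and idx are only placeholder names for illustration, not my real code):

    // Variant A: what the kernel above does -- the column index is packed
    // into data_1 as a double and cast to int on the device.
    double dot_packed(const double *__restrict data_1,
                      const double *__restrict u,
                      int lml, int stride, int vec_len)
    {
        double s = 0.0;
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + jj * vec_len;
            const int q = (int)data_1[index];   // double -> int cast on the device
            s += data_1[index + vec_len] * u[q];
        }
        return s;
    }

    // Variant B (hypothetical): the indices are precomputed on the host into
    // a separate int array idx, with idx[i] == (int)data_1[i] for the index
    // slots, so the device only performs an integer load instead of a cast.
    double dot_split(const double *__restrict data_1,
                     const int    *__restrict idx,
                     const double *__restrict u,
                     int lml, int stride, int vec_len)
    {
        double s = 0.0;
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + jj * vec_len;
            s += data_1[index + vec_len] * u[idx[index]];
        }
        return s;
    }

Variant A is what I actually run; Variant B is only meant to illustrate the alternative layout.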

I looked at the PTX code (created via -ta=tesla:cc60,fastmath,keepptx) and tried to find the point where the cast happens.

In
http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#syntax
I found that a cast in PTX looks like (.s64)(.u64), but I can't find such a cast in the PTX kernel!

Here is my question: Is it possible that the compiler realizes that I cast values which are addressed with a regular "arithmetic" pattern (the pointer jumps by _vec_len in every loop iteration), and therefore casts these elements to integers automatically before the values are sent to the thread?
If so, can I force the compiler to send a double and cast it locally?

Best,
Stefan

PS:

  1. All pointers and elements of my kernel are constant, except for v and s, and every pointer is __restrict! "dob" is the only integer array!

  2. I have added the PTX output for this kernel:


    // .weak _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu
    .weak .entry _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu(
    .param .u32 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_0,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_1,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_2,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_3,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_4,
    .param .u32 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_5
    )
    .maxntid 128, 1, 1
    {
    .reg .pred %p<11>;
    .reg .b32 %r<119>;
    .reg .f64 %fd<33>;
    .reg .b64 %rd<35>;


    ld.param.u32 %r36, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_0];
    ld.param.u64 %rd2, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_1];
    ld.param.u64 %rd3, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_2];
    ld.param.u64 %rd4, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_3];
    ld.param.u64 %rd5, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_4];
    ld.param.u32 %r33, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_5];
    neg.s32 %r107, %r36;
    mov.u32 %r37, %ctaid.x;
    shl.b32 %r110, %r37, 7;
    mov.u32 %r38, 1;
    sub.s32 %r39, %r38, %r36;
    add.s32 %r108, %r39, %r37;
    mov.u32 %r109, 0;
    mov.u32 %r106, %r109;
    cvta.to.global.u64 %rd1, %rd2;
    cvta.to.global.u64 %rd33, %rd3;

BB57_1:
setp.gt.s32 %p1, %r108, 0;
@%p1 bra BB57_12;

add.s32 %r40, %r109, %r37;
shl.b32 %r41, %r40, 7;
bfe.s32 %r42, %r40, 24, 1;
shr.u32 %r43, %r42, 25;
add.s32 %r44, %r41, %r43;
shr.s32 %r45, %r44, 7;
shl.b32 %r46, %r45, 2;
add.s32 %r47, %r46, 4;
cvt.s64.s32 %rd6, %r47;
add.s64 %rd7, %rd1, %rd6;
ld.global.nc.u32 %r48, [%rd7+-4];
ld.global.nc.u32 %r49, [%rd7];
sub.s32 %r50, %r49, %r48;
shr.s32 %r51, %r50, 31;
shr.u32 %r52, %r51, 24;
add.s32 %r53, %r50, %r52;
shr.s32 %r54, %r53, 8;
shl.b32 %r10, %r54, 1;
shl.b32 %r56, %r37, 7;
add.s32 %r57, %r56, %r106;
add.s32 %r58, %r57, 128;
min.s32 %r11, %r33, %r58;
sub.s32 %r12, %r41, %r11;
setp.gt.s32 %p2, %r12, -1;
@%p2 bra BB57_12;

shr.s32 %r59, %r110, 31;
shr.u32 %r60, %r59, 25;
add.s32 %r61, %r110, %r60;
shr.s32 %r62, %r61, 7;
shl.b32 %r63, %r62, 2;
cvt.s64.s32 %rd8, %r63;
add.s64 %rd9, %rd1, %rd8;
ld.global.nc.u32 %r13, [%rd9];
add.s32 %r64, %r10, 1;
shr.u32 %r65, %r64, 31;
add.s32 %r66, %r64, %r65;
shr.s32 %r14, %r66, 1;
bar.sync 0;
sub.s32 %r15, %r11, %r41;
mov.u32 %r111, 0;

BB57_4:
mov.u32 %r16, %r111;
mov.u32 %r70, %tid.x;
add.s32 %r17, %r70, %r16;
add.s32 %r71, %r17, %r12;
setp.gt.s32 %p3, %r71, -1;
@%p3 bra BB57_11;

setp.lt.s32 %p4, %r10, 1;
mov.f64 %fd30, 0d0000000000000000;
@%p4 bra BB57_10;

setp.lt.s32 %p5, %r64, 8;
add.s32 %r18, %r17, %r13;
mov.u32 %r118, 0;
mov.f64 %fd32, 0d0000000000000000;
mov.u32 %r117, %r118;
mov.f64 %fd31, %fd32;
mov.u32 %r115, %r14;
mov.u32 %r114, %r14;
@%p5 bra BB57_9;

BB57_7:
mov.u32 %r19, %r115;
add.s32 %r75, %r18, %r118;
shl.b32 %r76, %r75, 3;
cvt.s64.s32 %rd10, %r76;
cvta.to.global.u64 %rd11, %rd4;
add.s64 %rd12, %rd11, %rd10;
ld.global.nc.f64 %fd9, [%rd12];
cvt.rzi.s32.f64 %r77, %fd9;
add.s32 %r78, %r75, 128;
shl.b32 %r79, %r78, 3;
cvt.s64.s32 %rd13, %r79;
add.s64 %rd14, %rd11, %rd13;
ld.global.nc.f64 %fd10, [%rd14+1024];
cvt.rzi.s32.f64 %r80, %fd10;
ld.global.nc.f64 %fd11, [%rd14+3072];
cvt.rzi.s32.f64 %r81, %fd11;
ld.global.nc.f64 %fd12, [%rd14+5120];
cvt.rzi.s32.f64 %r82, %fd12;
shl.b32 %r83, %r77, 3;
cvt.s64.s32 %rd15, %r83;
cvta.to.global.u64 %rd16, %rd5;
add.s64 %rd17, %rd16, %rd15;
ld.global.nc.f64 %fd13, [%rd17];
ld.global.nc.f64 %fd14, [%rd14];
fma.rn.f64 %fd15, %fd13, %fd14, %fd32;
shl.b32 %r84, %r80, 3;
cvt.s64.s32 %rd18, %r84;
add.s64 %rd19, %rd16, %rd18;
ld.global.nc.f64 %fd16, [%rd19];
ld.global.nc.f64 %fd17, [%rd14+2048];
fma.rn.f64 %fd18, %fd16, %fd17, %fd15;
shl.b32 %r85, %r81, 3;
cvt.s64.s32 %rd20, %r85;
add.s64 %rd21, %rd16, %rd20;
ld.global.nc.f64 %fd19, [%rd21];
ld.global.nc.f64 %fd20, [%rd14+4096];
fma.rn.f64 %fd21, %fd19, %fd20, %fd18;
shl.b32 %r86, %r82, 3;
cvt.s64.s32 %rd22, %r86;
add.s64 %rd23, %rd16, %rd22;
ld.global.nc.f64 %fd22, [%rd23];
ld.global.nc.f64 %fd23, [%rd14+6144];
fma.rn.f64 %fd32, %fd22, %fd23, %fd21;
add.s32 %r118, %r118, 1024;
add.s32 %r115, %r19, -4;
add.s32 %r87, %r19, -7;
setp.gt.s32 %p6, %r87, 0;
@%p6 bra BB57_7;

setp.lt.s32 %p7, %r115, 1;
mov.u32 %r113, %r115;
mov.u32 %r114, %r113;
mov.u32 %r117, %r118;
mov.f64 %fd30, %fd32;
mov.f64 %fd31, %fd32;
@%p7 bra BB57_10;

BB57_9:
mov.u32 %r23, %r114;
add.s32 %r88, %r18, %r117;
shl.b32 %r89, %r88, 3;
cvt.s64.s32 %rd24, %r89;
cvta.to.global.u64 %rd25, %rd4;
add.s64 %rd26, %rd25, %rd24;
ld.global.nc.f64 %fd24, [%rd26];
cvt.rzi.s32.f64 %r90, %fd24;
add.s32 %r91, %r88, 128;
shl.b32 %r92, %r91, 3;
cvt.s64.s32 %rd27, %r92;
add.s64 %rd28, %rd25, %rd27;
shl.b32 %r93, %r90, 3;
cvt.s64.s32 %rd29, %r93;
cvta.to.global.u64 %rd30, %rd5;
add.s64 %rd31, %rd30, %rd29;
ld.global.nc.f64 %fd25, [%rd31];
ld.global.nc.f64 %fd26, [%rd28];
fma.rn.f64 %fd31, %fd25, %fd26, %fd31;
add.s32 %r117, %r117, 256;
add.s32 %r26, %r23, -1;
setp.gt.s32 %p8, %r26, 0;
mov.u32 %r114, %r26;
mov.f64 %fd30, %fd31;
@%p8 bra BB57_9;

BB57_10:
add.s32 %r97, %r16, %r41;
add.s32 %r99, %r97, %r70;
shl.b32 %r100, %r99, 3;
cvt.s64.s32 %rd32, %r100;
add.s64 %rd34, %rd33, %rd32;
st.global.f64 [%rd34], %fd30;

BB57_11:
add.s32 %r111, %r16, 128;
mov.u32 %r101, -128;
sub.s32 %r102, %r101, %r16;
add.s32 %r103, %r15, %r102;
setp.gt.s32 %p9, %r103, 0;
@%p9 bra BB57_4;

BB57_12:
mov.u32 %r104, %nctaid.x;
add.s32 %r109, %r104, %r109;
shl.b32 %r105, %r104, 7;
add.s32 %r110, %r105, %r110;
add.s32 %r108, %r104, %r108;
add.s32 %r107, %r104, %r107;
add.s32 %r106, %r105, %r106;
setp.lt.s32 %p10, %r107, 0;
@%p10 bra BB57_1;

ret;
}

Hi Stefan,

Is it possible that the compiler realizes that I cast values which are addressed with a regular "arithmetic" pattern (the pointer jumps by _vec_len in every loop iteration), and therefore casts these elements to integers automatically before the values are sent to the thread?

While I’m not 100% positive, I believe the relevant section is:

BB57_9: 
...
	ld.global.nc.f64 %fd24, [%rd26]; 
	cvt.rzi.s32.f64 %r90, %fd24; 
...

It looks to me like the "data_1" value is being loaded into a floating-point register and then converted into an integer register (i.e. "q") using the "cvt" instruction. The "rzi" in "cvt.rzi.s32.f64" means round toward zero, which matches the semantics of a C cast from double to int, so the cast is there, just not in the form you were searching for.

Note that the PGI compiler doesn't create the PTX code; that's done by the back-end CUDA compiler (libnvvm). What may be helpful is to look at the code PGI generates by using the "keep" option instead of "keepptx". This will show the LLVM code we generate in the ".gpu" file. Or, if you use "-ta=tesla:nollvm,keep", the .gpu file will contain low-level CUDA C code, which I find easier to read.
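
For example, the build lines would look roughly like this (the pgc++ driver and the source file name solver.cpp are just placeholders for your actual command):

    # Keep the PTX emitted by the back-end CUDA compiler (what you inspected):
    pgc++ -acc -ta=tesla:cc60,fastmath,keepptx -c solver.cpp

    # Keep the LLVM code that PGI generates (written to a .gpu file):
    pgc++ -acc -ta=tesla:cc60,fastmath,keep -c solver.cpp

    # Generate low-level CUDA C instead of going through LLVM, and keep it:
    pgc++ -acc -ta=tesla:nollvm,keep -c solver.cpp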

Unfortunately, my performance decreases :-(

Compared to what code? Could the performance loss be due to some other factor such as data layout or register pressure?

-Mat

Hi Mat,

thanks for the (as usual) useful answer. The output produced by "-ta=tesla:nollvm,keep" is indeed easier to read.
I am now trying to find (and understand) the relevant part of the code!

Compared to what code?

This kernel is a slight modification of the data layout, so I compared it against the same code without this modification.

Thanks,
Stefan