OpenACC should cast double to integer on the device

Hi all,

I have a quite specific question this time.

I use the following kernel on the device:

//***********************************************************
#pragma acc parallel loop vector_length(_vec_len) pcopyin(data_1[0:d1_size], u[0:v_size], dob[0:dob_size]) pcopy(v[0:v_size])
for (int ii = 0; ii < v_size; ii += _vec_len) {
    const int lml    = (dob[ii / _vec_len + 1] - dob[ii / _vec_len]) / _vec_len;
    const int stride = dob[ii / _vec_len];

    const int kk_max = _vec_len + ii < v_size ? _vec_len + ii : v_size;
#pragma acc loop independent
    for (int kk = ii; kk < kk_max; ++kk) {
        S s = 0.0;
#pragma acc loop seq
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + kk - ii + jj * _vec_len;
            const int q = (int)data_1[index];     // column index, stored as a double
            s += data_1[index + _vec_len] * u[q];
        }
        v[kk] = s;
    }
}
//***********************************************************

First of all, the code works! My question concerns this line:

const int q = (int)data_1[index];

At this line I cast a double to an integer value. My application is quite specific, and I expected to get better performance by casting this value locally on the device. Unfortunately, my performance decreases :-(
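
To make clear what I mean by "casting locally", here is a minimal sketch of the two options I have in mind (dot_packed, dot_split and idx are only placeholder names for illustration, not my real code):

    // Variant A: what the kernel above does -- the column index is packed
    // into data_1 as a double and cast to int on the device.
    double dot_packed(const double *__restrict data_1,
                      const double *__restrict u,
                      int lml, int stride, int vec_len)
    {
        double s = 0.0;
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + jj * vec_len;
            const int q = (int)data_1[index];   // double -> int cast on the device
            s += data_1[index + vec_len] * u[q];
        }
        return s;
    }

    // Variant B (hypothetical): the indices are precomputed on the host into
    // a separate int array idx, with idx[i] == (int)data_1[i] for the index
    // slots, so the device only performs an integer load instead of a cast.
    double dot_split(const double *__restrict data_1,
                     const int    *__restrict idx,
                     const double *__restrict u,
                     int lml, int stride, int vec_len)
    {
        double s = 0.0;
        for (int jj = 0; jj < lml; jj += 2) {
            const int index = stride + jj * vec_len;
            s += data_1[index + vec_len] * u[idx[index]];
        }
        return s;
    }

Variant A is what I actually run; Variant B is only meant to illustrate the alternative layout.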

I looked at the PTX code (created via -ta=tesla:cc60,fastmath,keepptx) and tried to find the point where the cast happens.

In
http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#syntax
I found that a cast in PTX looks like (.s64)(.u64), but I can't find such a cast in the PTX kernel!

Here is my question: Is it possible that the compiler realizes that I cast values which are addressed with a regular "arithmetic" pattern (the pointer jumps by _vec_len in every loop iteration), and therefore casts these elements to integers automatically before the values are sent to the thread?
If so, can I force the compiler to send a double and cast it locally?

Best,
Stefan

PS:

  1. All pointers and elements of my kernel are constant, except for v and s, and every pointer is __restrict! "dob" is the only integer array!

  2. I have added the PTX output for this kernel:


    // .weak _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu
    .weak .entry _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu(
    .param .u32 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_0,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_1,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_2,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_3,
    .param .u64 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_4,
    .param .u32 _ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_5
    )
    .maxntid 128, 1, 1
    {
    .reg .pred %p<11>;
    .reg .b32 %r<119>;
    .reg .f64 %fd<33>;
    .reg .b64 %rd<35>;


    ld.param.u32 %r36, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_0];
    ld.param.u64 %rd2, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_1];
    ld.param.u64 %rd3, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_2];
    ld.param.u64 %rd4, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_3];
    ld.param.u64 %rd5, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_4];
    ld.param.u32 %r33, [_ZNK9sr_solver18RELLPACKPlusmatrixIidE13matrix_vectorERKNS_14toolbox_vectorIdEERS3__57_gpu_param_5];
    neg.s32 %r107, %r36;
    mov.u32 %r37, %ctaid.x;
    shl.b32 %r110, %r37, 7;
    mov.u32 %r38, 1;
    sub.s32 %r39, %r38, %r36;
    add.s32 %r108, %r39, %r37;
    mov.u32 %r109, 0;
    mov.u32 %r106, %r109;
    cvta.to.global.u64 %rd1, %rd2;
    cvta.to.global.u64 %rd33, %rd3;

BB57_1:
setp.gt.s32 %p1, %r108, 0;
@%p1 bra BB57_12;

add.s32 %r40, %r109, %r37;
shl.b32 %r41, %r40, 7;
bfe.s32 %r42, %r40, 24, 1;
shr.u32 %r43, %r42, 25;
add.s32 %r44, %r41, %r43;
shr.s32 %r45, %r44, 7;
shl.b32 %r46, %r45, 2;
add.s32 %r47, %r46, 4;
cvt.s64.s32 %rd6, %r47;
add.s64 %rd7, %rd1, %rd6;
ld.global.nc.u32 %r48, [%rd7+-4];
ld.global.nc.u32 %r49, [%rd7];
sub.s32 %r50, %r49, %r48;
shr.s32 %r51, %r50, 31;
shr.u32 %r52, %r51, 24;
add.s32 %r53, %r50, %r52;
shr.s32 %r54, %r53, 8;
shl.b32 %r10, %r54, 1;
shl.b32 %r56, %r37, 7;
add.s32 %r57, %r56, %r106;
add.s32 %r58, %r57, 128;
min.s32 %r11, %r33, %r58;
sub.s32 %r12, %r41, %r11;
setp.gt.s32 %p2, %r12, -1;
@%p2 bra BB57_12;

shr.s32 %r59, %r110, 31;
shr.u32 %r60, %r59, 25;
add.s32 %r61, %r110, %r60;
shr.s32 %r62, %r61, 7;
shl.b32 %r63, %r62, 2;
cvt.s64.s32 %rd8, %r63;
add.s64 %rd9, %rd1, %rd8;
ld.global.nc.u32 %r13, [%rd9];
add.s32 %r64, %r10, 1;
shr.u32 %r65, %r64, 31;
add.s32 %r66, %r64, %r65;
shr.s32 %r14, %r66, 1;
bar.sync 0;
sub.s32 %r15, %r11, %r41;
mov.u32 %r111, 0;

BB57_4:
mov.u32 %r16, %r111;
mov.u32 %r70, %tid.x;
add.s32 %r17, %r70, %r16;
add.s32 %r71, %r17, %r12;
setp.gt.s32 %p3, %r71, -1;
@%p3 bra BB57_11;

setp.lt.s32 %p4, %r10, 1;
mov.f64 %fd30, 0d0000000000000000;
@%p4 bra BB57_10;

setp.lt.s32 %p5, %r64, 8;
add.s32 %r18, %r17, %r13;
mov.u32 %r118, 0;
mov.f64 %fd32, 0d0000000000000000;
mov.u32 %r117, %r118;
mov.f64 %fd31, %fd32;
mov.u32 %r115, %r14;
mov.u32 %r114, %r14;
@%p5 bra BB57_9;

BB57_7:
mov.u32 %r19, %r115;
add.s32 %r75, %r18, %r118;
shl.b32 %r76, %r75, 3;
cvt.s64.s32 %rd10, %r76;
cvta.to.global.u64 %rd11, %rd4;
add.s64 %rd12, %rd11, %rd10;
ld.global.nc.f64 %fd9, [%rd12];
cvt.rzi.s32.f64 %r77, %fd9;
add.s32 %r78, %r75, 128;
shl.b32 %r79, %r78, 3;
cvt.s64.s32 %rd13, %r79;
add.s64 %rd14, %rd11, %rd13;
ld.global.nc.f64 %fd10, [%rd14+1024];
cvt.rzi.s32.f64 %r80, %fd10;
ld.global.nc.f64 %fd11, [%rd14+3072];
cvt.rzi.s32.f64 %r81, %fd11;
ld.global.nc.f64 %fd12, [%rd14+5120];
cvt.rzi.s32.f64 %r82, %fd12;
shl.b32 %r83, %r77, 3;
cvt.s64.s32 %rd15, %r83;
cvta.to.global.u64 %rd16, %rd5;
add.s64 %rd17, %rd16, %rd15;
ld.global.nc.f64 %fd13, [%rd17];
ld.global.nc.f64 %fd14, [%rd14];
fma.rn.f64 %fd15, %fd13, %fd14, %fd32;
shl.b32 %r84, %r80, 3;
cvt.s64.s32 %rd18, %r84;
add.s64 %rd19, %rd16, %rd18;
ld.global.nc.f64 %fd16, [%rd19];
ld.global.nc.f64 %fd17, [%rd14+2048];
fma.rn.f64 %fd18, %fd16, %fd17, %fd15;
shl.b32 %r85, %r81, 3;
cvt.s64.s32 %rd20, %r85;
add.s64 %rd21, %rd16, %rd20;
ld.global.nc.f64 %fd19, [%rd21];
ld.global.nc.f64 %fd20, [%rd14+4096];
fma.rn.f64 %fd21, %fd19, %fd20, %fd18;
shl.b32 %r86, %r82, 3;
cvt.s64.s32 %rd22, %r86;
add.s64 %rd23, %rd16, %rd22;
ld.global.nc.f64 %fd22, [%rd23];
ld.global.nc.f64 %fd23, [%rd14+6144];
fma.rn.f64 %fd32, %fd22, %fd23, %fd21;
add.s32 %r118, %r118, 1024;
add.s32 %r115, %r19, -4;
add.s32 %r87, %r19, -7;
setp.gt.s32 %p6, %r87, 0;
@%p6 bra BB57_7;

setp.lt.s32 %p7, %r115, 1;
mov.u32 %r113, %r115;
mov.u32 %r114, %r113;
mov.u32 %r117, %r118;
mov.f64 %fd30, %fd32;
mov.f64 %fd31, %fd32;
@%p7 bra BB57_10;

BB57_9:
mov.u32 %r23, %r114;
add.s32 %r88, %r18, %r117;
shl.b32 %r89, %r88, 3;
cvt.s64.s32 %rd24, %r89;
cvta.to.global.u64 %rd25, %rd4;
add.s64 %rd26, %rd25, %rd24;
ld.global.nc.f64 %fd24, [%rd26];
cvt.rzi.s32.f64 %r90, %fd24;
add.s32 %r91, %r88, 128;
shl.b32 %r92, %r91, 3;
cvt.s64.s32 %rd27, %r92;
add.s64 %rd28, %rd25, %rd27;
shl.b32 %r93, %r90, 3;
cvt.s64.s32 %rd29, %r93;
cvta.to.global.u64 %rd30, %rd5;
add.s64 %rd31, %rd30, %rd29;
ld.global.nc.f64 %fd25, [%rd31];
ld.global.nc.f64 %fd26, [%rd28];
fma.rn.f64 %fd31, %fd25, %fd26, %fd31;
add.s32 %r117, %r117, 256;
add.s32 %r26, %r23, -1;
setp.gt.s32 %p8, %r26, 0;
mov.u32 %r114, %r26;
mov.f64 %fd30, %fd31;
@%p8 bra BB57_9;

BB57_10:
add.s32 %r97, %r16, %r41;
add.s32 %r99, %r97, %r70;
shl.b32 %r100, %r99, 3;
cvt.s64.s32 %rd32, %r100;
add.s64 %rd34, %rd33, %rd32;
st.global.f64 [%rd34], %fd30;

BB57_11:
add.s32 %r111, %r16, 128;
mov.u32 %r101, -128;
sub.s32 %r102, %r101, %r16;
add.s32 %r103, %r15, %r102;
setp.gt.s32 %p9, %r103, 0;
@%p9 bra BB57_4;

BB57_12:
mov.u32 %r104, %nctaid.x;
add.s32 %r109, %r104, %r109;
shl.b32 %r105, %r104, 7;
add.s32 %r110, %r105, %r110;
add.s32 %r108, %r104, %r108;
add.s32 %r107, %r104, %r107;
add.s32 %r106, %r105, %r106;
setp.lt.s32 %p10, %r107, 0;
@%p10 bra BB57_1;

ret;
}

Hi Stefan,

Is it possible that the compiler realizes that I cast values which are addressed with a regular "arithmetic" pattern (the pointer jumps by _vec_len in every loop iteration), and therefore casts these elements to integers automatically before the values are sent to the thread?

While I’m not 100% positive, I believe the relevant section is:

BB57_9: 
...
	ld.global.nc.f64 %fd24, [%rd26]; 
	cvt.rzi.s32.f64 %r90, %fd24; 
...

It looks to me like the "data_1" value is being loaded into a floating-point register and then converted into an integer register (i.e. "q") using the "cvt" instruction. The "rzi" in "cvt.rzi.s32.f64" means round toward zero, which matches the semantics of a C cast from double to int, so the cast is there, just not in the form you were searching for.

Note that the PGI compiler doesn't create the PTX code; that's done by the back-end CUDA compiler (libnvvm). What may be helpful is to look at the code PGI generates by using the "keep" option instead of "keepptx". This will show the LLVM code we generate in the ".gpu" file. Or, if you use "-ta=tesla:nollvm,keep", the .gpu file will contain low-level CUDA C code, which I find easier to read.
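
For example, the build lines would look roughly like this (the pgc++ driver and the source file name solver.cpp are just placeholders for your actual command):

    # Keep the PTX emitted by the back-end CUDA compiler (what you inspected):
    pgc++ -acc -ta=tesla:cc60,fastmath,keepptx -c solver.cpp

    # Keep the LLVM code that PGI generates (written to a .gpu file):
    pgc++ -acc -ta=tesla:cc60,fastmath,keep -c solver.cpp

    # Generate low-level CUDA C instead of going through LLVM, and keep it:
    pgc++ -acc -ta=tesla:nollvm,keep -c solver.cpp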

Unfortunately, my performance decreases :-(

Compared to what code? Could the performance loss be due to some other factor such as data layout or register pressure?

-Mat

Hi Mat,

thanks for the (as usual) useful answer. The output produced by "-ta=tesla:nollvm,keep" is indeed easier to read.
I am now trying to find (and understand) the relevant part of the code!

Compared to what code?

This kernel is a slight modification of the data layout, so I compared it against the same code without this modification.

Thanks,
Stefan