I simplified the code:
$ cat t22.cu
#include <stdio.h>
// Microbenchmark kernel: every thread adds `a` to its private copy of `c`
// 256 times in a row. Each addition consumes the result of the previous one,
// so the additions form a single dependent chain per thread.
//   c - starting value of the accumulator (passed by value, never written back)
//   a - increment added on every iteration
// No memory is read or written; the kernel does not use thread or block indices.
__global__ void mytest(float c, float a) {
// Request full unrolling of the 256-iteration loop (unroll factor == trip count).
#pragma unroll 256
for (int i = 0; i < 256; i++)
c += a;
// Consumes the accumulated value. NOTE(review): presumably present so the
// compiler cannot discard the additions as dead code - confirm intent.
// With the arguments used in main (0.1f, 0.2f) the sum is never zero, so
// nothing is printed.
if (c == 0) printf("?");
}
// Launches the mytest microbenchmark on 8192 blocks of 256 threads, waits for
// it to finish, and reports any CUDA error instead of silently ignoring it.
// Returns 0 on success and 1 if the launch or the execution failed.
int main(){
    mytest<<<8192,256>>>(0.1f, 0.2f);
    // A kernel launch returns no status of its own; configuration or
    // "no kernel image for this device" failures are only visible through
    // cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Errors raised while the kernel runs are reported by the next
    // synchronizing call, so its return value must be checked as well.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
$ nvcc -arch=sm_50 -o t22 t22.cu
$ nvprof --metrics issued_ipc,ipc,eligible_warps_per_cycle ./t22
==10405== NVPROF is profiling process 10405, command: ./t22
==10405== Profiling application: ./t22
==10405== Profiling result:
==10405== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "GeForce GTX 960 (0)"
Kernel: mytest(float, float)
1 issued_ipc Issued IPC 3.855329 3.855329 3.855329
1 ipc Executed IPC 3.855097 3.855097 3.855097
1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 4.242007 4.242007 4.242007
$ cuobjdump -sass ./t22
Fatbin elf code:
================
arch = sm_50
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_50
Fatbin elf code:
================
arch = sm_50
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_50
Function : _Z6mytestff
.headerflags @"EF_CUDA_SM50 EF_CUDA_PTX_SM(EF_CUDA_SM50)"
/* 0x001fd800fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV R0, c[0x0][0x144] ; /* 0x4c98078005170000 */
/*0018*/ FADD R0, R0, c[0x0][0x140] ; /* 0x4c58000005070000 */
/* 0x001fd800fec007f6 */
/*0028*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0030*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0038*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007e6 */
/*0048*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0050*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0058*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007f6 */
/*0068*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0070*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0078*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800ffa007e6 */
/*0088*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0090*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0098*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007f6 */
/*00a8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*00b0*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*00b8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fcc007f6 */
/*00c8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*00d0*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*00d8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007f6 */
/*00e8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*00f0*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*00f8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001f9800ffa007f6 */
/*0108*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0110*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0118*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007f6 */
/*0128*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
The same pattern of FADD instructions repeats (the middle of the dump is omitted here) until the end of the kernel:
/*0a48*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0a50*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0a58*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007f6 */
/*0a68*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0a70*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0a78*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800ffa007f6 */
/*0a88*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0a90*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0a98*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fd800fec007e6 */
/*0aa8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0ab0*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/*0ab8*/ FADD R0, R0, c[0x0][0x144] ; /* 0x4c58000005170000 */
/* 0x001fc400ffa007ed */
/*0ac8*/ FSETP.NEU.AND P0, PT, R0, RZ, PT ; /* 0x5bbd03800ff70007 */
/*0ad0*/ @P0 EXIT ; /* 0xe30000000000000f */
/*0ad8*/ MOV32I R4, 0x0 ; /* 0x010000000007f004 */
/* 0x001fd800fe2007f1 */
/*0ae8*/ MOV32I R5, 0x0 ; /* 0x010000000007f005 */
/*0af0*/ MOV R6, RZ ; /* 0x5c9807800ff70006 */
/*0af8*/ MOV R7, RZ ; /* 0x5c9807800ff70007 */
/* 0x001fbc00fde007fd */
/*0b08*/ JCAL 0x0 ; /* 0xe220000000000040 */
/*0b10*/ NOP ; /* 0x50b0000000070f00 */
/*0b18*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001ffc00ffe007e4 */
/*0b28*/ NOP ; /* 0x50b0000000070f00 */
/*0b30*/ EXIT ; /* 0xe30000000007000f */
/*0b38*/ BRA 0xb38 ; /* 0xe2400fffff87000f */
......................
Fatbin ptx code:
================
arch = sm_50
code version = [6,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$