Compile flags:
-gencode arch=compute_32,code=sm_32 --ptxas-options=-v -m32 -Xcompiler -mfloat-abi=hard
This is the main body of the kernel:
// Integer add-and-mask kernel: each in-range thread processes two elements.
//
// Thread `tid` reads in[tid] and in[num_threads + tid], applies a sequence of
// "add CONST, then mask to the low 20 bits" steps to each value, and writes the
// results to out[tid] and out[num_threads + tid].
//
// Parameters:
//   num_threads  number of threads that do work; threads with tid >= num_threads
//                do nothing. Also the offset between a thread's two elements.
//   num_streams  not used in the visible portion of the kernel.
//   in, out      input / output buffers, indexed up to num_threads + tid, so
//                each must hold at least 2 * num_threads elements.
//
// TYPE and CONST are defined outside this snippet.
// NOTE(review): the disassembly that accompanies this snippet shows an XOR with
// 0xf0f0f0f0 rather than an AND mask — confirm which variant was compiled.
__global__ void int_kernel_2_256_(int num_threads, int num_streams, TYPE* in, TYPE* out)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    TYPE x, y;
    if (tid < num_threads) {
        x = in[tid];
        y = in[num_threads + tid];
        x = x + CONST;
        // Bitwise AND keeps the low 20 bits; a logical && would collapse x to 0/1.
        x = x & 0x000FFFFF;
        y = y + CONST;
        y = y & 0x000FFFFF;
        // … (further steps elided in the original listing)
        out[tid] = x;
        out[num_threads + tid] = y;
    }
}
And this is the corresponding disassembly from the objdump:
Function : _Z16int_kernel_2_256iiPiS_
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08a000b0a0a08c00 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ S2R R0, SR_CTAID.X; /* 0x86400000129c0002 */
/*0018*/ S2R R3, SR_TID.X; /* 0x86400000109c000e */
/*0020*/ IMAD R2, R0, c[0x0][0x28], R3; /* 0x51080c00051c000a */
/*0028*/ ISETP.GE.U32.AND P0, PT, R2, c[0x0][0x140], PT; /* 0x5b601c00281c081e */
/*0030*/ @P0 BRA.U 0x12d8; /* 0x120000095000023c */
/*0038*/ @!P0 ISCADD R4, R2, c[0x0][0x148], 0x2; /* 0x60c0080029200812 */
/* 0x08a0b8a010b8a010 */
/*0048*/ @!P0 IADD R0, R2, c[0x0][0x140]; /* 0x6080000028200802 */
/*0050*/ @!P0 LD R4, [R4]; /* 0xc400000000201010 */
/*0058*/ @!P0 ISCADD R3, R0, c[0x0][0x148], 0x2; /* 0x60c008002920000e */
/*0060*/ @!P0 LD R3, [R3]; /* 0xc400000000200c0c */
/*0068*/ @!P0 IADD R5, R4, 0x5; /* 0xc080000002a01015 */
/*0070*/ @!P0 LOP32I.XOR R4, R5, 0xf0f0f0f0; /* 0x2278787878201410 */
/*0078*/ @!P0 IADD R5, R3, 0x5; /* 0xc080000002a00c15 */
/* 0x08a09c80a010a010 */
/*0088*/ @!P0 IADD R3, R4, 0x5; /* 0xc080000002a0100d */
/*0090*/ @!P0 LOP32I.XOR R4, R5, 0xf0f0f0f0; /* 0x2278787878201410 */
/*0098*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*00a0*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*00a8*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/*00b0*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/*00b8*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/* 0x08a09c80a010a010 */
/*00c8*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*00d0*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/*00d8*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/*00e0*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*00e8*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*00f0*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/*00f8*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/* 0x08a09c80a010a010 */
/*0108*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*0110*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*0118*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/*0120*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/*0128*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*0130*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*0138*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/* 0x08a09c80a010a010 */
/*0148*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/*0150*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*0158*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*0160*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/*0168*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/*0170*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*0178*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
…
/*1290*/ @!P0 IADD R3, R3, 0x5; /* 0xc080000002a00c0d */
/*1298*/ @!P0 IADD R4, R4, 0x5; /* 0xc080000002a01011 */
/*12a0*/ @!P0 ISCADD R2, R2, c[0x0][0x14c], 0x2; /* 0x60c0080029a0080a */
/*12a8*/ @!P0 LOP32I.XOR R3, R3, 0xf0f0f0f0; /* 0x2278787878200c0c */
/*12b0*/ @!P0 ISCADD R0, R0, c[0x0][0x14c], 0x2; /* 0x60c0080029a00002 */
/*12b8*/ @!P0 LOP32I.XOR R4, R4, 0xf0f0f0f0; /* 0x2278787878201010 */
/* 0x08000000b81000b8 */
/*12c8*/ @!P0 ST [R2], R3; /* 0xe40000000020080c */
/*12d0*/ @!P0 ST [R0], R4; /* 0xe400000000200010 */
/*12d8*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*12e0*/ EXIT; /* 0x18000000001c003c */
/*12e8*/ BRA 0x12e8; /* 0x12007ffffc1c003c */
/*12f0*/ NOP; /* 0x85800000001c3c02 */
/*12f8*/ NOP; /* 0x85800000001c3c02 */