Stopping optimization of inline PTX code

I’m trying to create microbenchmarks to measure the latency of certain arithmetic instructions, but the PTX optimization always jumbles up my code. I’m using the volatile keyword to stop this optimization.

__global__ void test_latency(clock_t *time) {
  int start = 0, end = 0;
  volatile int test1 = 0x00C0FFEE;
  volatile int test2 = 0xDEADBEEF;

  asm volatile (
        "mov.u32 %0, %%clock;\n\t"
        "add.s32 %2, %2, %3;\n\t"
        "mov.u32 %1, %%clock;\n\t"
        : "=r"(start) 
        , "=r"(end)
        , "+r"(test1)
        , "=r"(test2)
        :: "memory"
      );
  
  *time = (clock_t)(end - start);
}

However, nvcc still optimizes my code. When I compile it with default optimization levels, the IADD instruction gets optimized away completely, leaving me with:

S2R R2, SR_CLOCKLO;
S2R R3, SR_CLOCKLO;

which only measures the latency of the clock function.

When I disable all optimizations with the -Xptxas -O0 flag, I get:

S2R R2, SR_CLOCKLO;
MOV R2, R2;
MOV R2, R2;
IADD R0, R0, R0;
MOV R0, R0;
S2R R0, SR_CLOCKLO;
MOV R0, R0;
MOV R0, R0;

which not only measures the latency of the IADD instruction, but also of three moves that were added to the assembler, despite the volatile keyword.

What I need is something like this:

S2R R2, SR_CLOCKLO;
IADD R0, R0, R1;
S2R R3, SR_CLOCKLO;

where the variables temp1 and temp2 would sit in R0 and R1, respectively.

So, why does my code still get optimized, despite using the volatile keyword, and how to I stop nvcc from doing that?

You need to use function arguments as inputs, not constant values.

riffing off of tera’s answer:

$ cat t392.cu
__device__ int test_latency(clock_t *time, int test1, int test2) {
  unsigned start = 0, end = 0;

  asm volatile (
        "mov.u32 %0, %%clock;\n\t"
        "add.s32 %2, %2, %3;\n\t"
        "mov.u32 %1, %%clock;\n\t"
        : "=r"(start)
        , "=r"(end)
        , "+r"(test1)
        : "r"(test2)
        : "memory"
      );

  *time = (clock_t)(end - start);
  return test1;
}
$ nvcc -dc t392.cu
$ cuobjdump -sass t392.o

Fatbin elf code:
================
arch = sm_30
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
compressed

        code for sm_30
                Function : _Z12test_latencyPlii
        .headerflags    @"EF_CUDA_SM30 EF_CUDA_PTX_SM(EF_CUDA_SM30)"
                                                        /* 0x228042f2f2f2f047 */
        /*0008*/                   MOV R8, R4;          /* 0x2800000010021de4 */
        /*0010*/                   MOV R9, R5;          /* 0x2800000014025de4 */
        /*0018*/                   S2R R0, SR_CLOCKLO;  /* 0x2c00000140001c04 */
        /*0020*/                   IADD R4, R6, R7;     /* 0x480000001c611c03 */
        /*0028*/                   S2R R6, SR_CLOCKLO;  /* 0x2c00000140019c04 */
        /*0030*/                   IADD R6, -R0, R6;    /* 0x4800000018019e03 */
        /*0038*/                   MOV R7, RZ;          /* 0x28000000fc01dde4 */
                                                        /* 0x200000000002f047 */
        /*0048*/                   ST.E.64 [R8], R6;    /* 0x9400000000819ca5 */
        /*0050*/                   RET;                 /* 0x9000000000001de7 */
        /*0058*/                   BRA 0x58;            /* 0x4003ffffe0001de7 */
        /*0060*/                   NOP;                 /* 0x4000000000001de4 */
        /*0068*/                   NOP;                 /* 0x4000000000001de4 */
        /*0070*/                   NOP;                 /* 0x4000000000001de4 */
        /*0078*/                   NOP;                 /* 0x4000000000001de4 */
                ...............................



Fatbin ptx code:
================
arch = sm_30
code version = [6,3]
producer = cuda
host = linux
compile_size = 64bit
compressed
ptxasOptions = --compile-only
$