I have a simple CUDA demo. When I tried to measure the kernel execution time with the `nsys`
command, I found that the kernel's execution time does not depend on the number of loop iterations in the kernel body.
My code is as follows:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define BDIMX 32
#define BDIMY 32
// Demo kernel: every thread repeatedly adds m*j to its own element of `out`.
//
// The element index is built from threadIdx and blockDim only (blockIdx is
// not used) and there is no bounds check, so `out` must hold at least
// blockDim.x * blockDim.y ints.
//
// out[idx] is read-modify-written, so its initial contents matter; nothing in
// this kernel initializes it.
//
// NOTE(review): the update is 32-bit arithmetic (an unsigned m*j added to an
// int), which presumably wraps modulo 2^32. The trip count 65536*30000 equals
// 2^20 * 1875, so the product of the trip counts of just the two outermost
// loops is already a multiple of 2^32, and the total added to out[idx] is
// congruent to 0. That is likely why an optimizing compiler can drop the whole
// loop nest and emit an empty kernel -- confirm by making the added value
// depend on a runtime input.
__global__ void setRowReadRow(int * out)
{
// Linear index of the thread within its block: y * blockDim.x + x.
unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x;
// Eight nested loops written without braces: each `for` is the single
// statement of the loop above it; only the innermost loop (m) has a braced
// body. Note that some bounds use 65535 and others 65536.
for(unsigned int e=0; e<65536*30000; e++)
for(unsigned int w=0; w<65536*30000; w++)
for(unsigned int q=0; q<65536*30000; q++)
for(unsigned int i=0; i<65535*30000; i++)
for(unsigned int j=0; j<65535*30000; j++)
for(unsigned int k=0; k<65536*30000; k++)
for(unsigned int l=0; l<65535*30000; l++)
for(unsigned int m=0; m<65536; m++){
// Of all the loop counters, only j and m enter the value that is added.
out[idx] += m*j ;
}
}
// Evaluates a CUDA runtime call and aborts with file/line and the CUDA error
// string if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Launches setRowReadRow on a single BDIMX x BDIMY block and copies the
// result buffer back to the host.
//
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails. Command-line arguments are ignored.
int main(int argc,char **argv)
{
    (void)argc;
    (void)argv;

    const int nElem = BDIMX * BDIMY;
    const int nBlock = 1;
    // One size shared by the host allocation, the device allocation and the copy.
    const size_t nByte = sizeof(int) * nElem * nBlock;

    int *host_data = (int *)malloc(nByte);
    if (host_data == NULL) {
        fprintf(stderr, "malloc of %zu bytes failed\n", nByte);
        return EXIT_FAILURE;
    }

    int *out = NULL;
    CUDA_CHECK(cudaMalloc(&out, nByte));
    // The kernel accumulates into out[] with +=, so give it a defined start value.
    CUDA_CHECK(cudaMemset(out, 0, nByte));

    // dim3 takes (x, y, z); the kernel indexes with threadIdx.y * blockDim.x + threadIdx.x.
    dim3 block(BDIMX, BDIMY);
    dim3 grid(nBlock);
    setRowReadRow<<<grid, block>>>(out);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(host_data, out, nByte, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(out));
    free(host_data);
    return 0;
}
So I tried to inspect the kernel's PTX and SASS code. I used the following commands to get them:
# compile
nvcc demo.cu -arch=sm_80 -o demo
# get ptx code
cuobjdump --dump-ptx ./demo
# get sass code
nvcc demo.cu -arch=sm_80 -c demo.cu
cuobjdump -sass demo.o
The PTX code is as follows; the kernel contains only a single `ret`
instruction:
Fatbin elf code:
================
arch = sm_80
code version = [1,7]
host = linux
compile_size = 64bit
Fatbin elf code:
================
arch = sm_80
code version = [1,7]
host = linux
compile_size = 64bit
Fatbin ptx code:
================
arch = sm_80
code version = [8,1]
host = linux
compile_size = 64bit
compressed
.version 8.1
.target sm_80
.address_size 64
.visible .entry _Z13setRowReadRowPi(
.param .u64 _Z13setRowReadRowPi_param_0
)
{
ret;
}
SASS code is as follows:
Fatbin elf code:
================
arch = sm_80
code version = [1,7]
host = linux
compile_size = 64bit
code for sm_80
Function : _Z13setRowReadRowPi
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM80 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM80)"
/*0000*/ MOV R1, c[0x0][0x28] ; /* 0x00000a0000017a02 */
/* 0x000fc40000000f00 */
/*0010*/ EXIT ; /* 0x000000000000794d */
/* 0x000fea0003800000 */
/*0020*/ BRA 0x20; /* 0xfffffff000007947 */
/* 0x000fc0000383ffff */
/*0030*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0040*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0050*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0060*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0070*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0080*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*0090*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*00a0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
/*00b0*/ NOP; /* 0x0000000000007918 */
/* 0x000fc00000000000 */
..........
So my question is: why are the PTX and SASS code for this kernel empty? My GPU is an A800 and my OS is Ubuntu 20.04.