Neat! The method provided by njuffa seems to be faster in some cases.
# cat t130.cu
/* Convert an int to float without an I2F-class instruction (njuffa's trick).
   Exploits IEEE-754 layout: the "magic" value 1.5 * 2^23 (= 12582912.0f) has
   an exponent large enough that integers of magnitude below 2^22 live entirely
   in the mantissa bits, so an INTEGER add on the bit pattern followed by a
   float subtract of the magic value recovers the exact converted value.
   Valid only for -2**22 <= a < 2**22; outside that range the mantissa add
   overflows into the exponent and the result is wrong. */
__device__ float fast_int_to_float (int a)
{
const float fmagic = (1 << 23) + (1 << 22);  // 8388608 + 4194304 = 12582912.0f = 1.5 * 2^23
const int imagic = __float_as_int (fmagic);  // raw bit pattern of the magic constant
return __int_as_float (imagic + a) - fmagic; // add into mantissa, then cancel the magic offset
}
// Baseline benchmark kernel: each thread sums the floats obtained from the
// integers in [s + threadIdx.x, e) using the compiler's native int->float
// conversion (per the surrounding post, this emits I2F-class instructions).
// Note: blockIdx is deliberately unused — every block does identical work to
// load the GPU, and the cross-block race on r[threadIdx.x] is benign here
// because all blocks store the same per-thread value.
__global__ void k1(int s, int e, float *r){
    float acc = 0;
    for (int idx = s + threadIdx.x; idx < e; idx++){
        float converted = idx;  // implicit int -> float conversion
        acc += converted;
    }
    r[threadIdx.x] = acc;
}
// Benchmark kernel identical in structure to k1, except the int->float
// conversion goes through fast_int_to_float (the bit-twiddling path), so a
// SASS diff against k1 isolates the conversion instruction. The loop bounds
// must keep every i within fast_int_to_float's valid domain (|i| < 2^22).
// As in k1, blockIdx is unused and the cross-block race on r[threadIdx.x]
// is benign: all blocks store the same per-thread value.
__global__ void k2(int s, int e, float *r){
    float acc = 0;
    for (int idx = s + threadIdx.x; idx < e; idx++){
        float converted = fast_int_to_float(idx);
        acc += converted;
    }
    r[threadIdx.x] = acc;
}
// Benchmark driver: warm up both kernels once, then run each timed launch
// separated by device syncs so an external profiler (nsys) attributes the
// durations cleanly. Returns nonzero if any CUDA call fails (the original
// ignored all return codes, so a failed launch would have gone unnoticed).
int main(){
    const int nBLK = 58*3;     // grid size; the nsys trace in this post ran on an
                               // NVIDIA L4 — 58 presumably matches its SM count
    const int nTPB = 512;      // threads per block
    const int s = 101;         // start of the summed integer range
    const int e = 1048576;     // 2^20 — stays inside fast_int_to_float's |i| < 2^22 domain
    float *r = nullptr;
    if (cudaMalloc(&r, nTPB*sizeof(*r)) != cudaSuccess) return 1;
    // Warm-up launches: exclude one-time startup cost from the timed runs below.
    k1<<<nBLK, nTPB>>>(s,e,r);
    k2<<<nBLK, nTPB>>>(s,e,r);
    if (cudaGetLastError() != cudaSuccess) return 1;     // launch-configuration errors
    if (cudaDeviceSynchronize() != cudaSuccess) return 1; // async execution errors
    // Timed runs.
    k1<<<nBLK, nTPB>>>(s,e,r);
    if (cudaDeviceSynchronize() != cudaSuccess) return 1;
    k2<<<nBLK, nTPB>>>(s,e,r);
    if (cudaDeviceSynchronize() != cudaSuccess) return 1;
    cudaFree(r);  // the original leaked this allocation
    return 0;
}
# nvcc -o t130 t130.cu -arch=sm_89 -Xptxas=-v
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z2k2iiPf' for 'sm_89'
ptxas info : Function properties for _Z2k2iiPf
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 12 registers, 368 bytes cmem[0]
ptxas info : Compiling entry function '_Z2k1iiPf' for 'sm_89'
ptxas info : Function properties for _Z2k1iiPf
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 12 registers, 368 bytes cmem[0]
root@hpe-dl385-gen10-005:~/bobc# compute-sanitizer ./t130
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
# nsys nvprof --print-gpu-trace ./t130
WARNING: t130 and any of its children processes will be profiled.
Generating '/tmp/nsys-report-9c7e.qdstrm'
[1/3] [========================100%] report26.nsys-rep
[2/3] [========================100%] report26.sqlite
[3/3] Executing 'cuda_gpu_trace' stats report
Start (ns) Duration (ns) CorrId GrdX GrdY GrdZ BlkX BlkY BlkZ Reg/Trd StcSMem (MB) DymSMem (MB) Bytes (MB) Throughput (MBps) SrcMemKd DstMemKd Device Ctx Strm Name
----------- ------------- ------ ---- ---- ---- ---- ---- ---- ------- ------------ ------------ ---------- ----------------- -------- -------- ------------- --- ---- ---------------------
728,359,084 25,519,897 119 174 1 1 512 1 1 16 0.000 0.000 NVIDIA L4 (0) 1 7 k1(int, int, float *)
753,882,117 21,606,997 120 174 1 1 512 1 1 16 0.000 0.000 NVIDIA L4 (0) 1 7 k2(int, int, float *)
775,503,834 30,668,735 122 174 1 1 512 1 1 16 0.000 0.000 NVIDIA L4 (0) 1 7 k1(int, int, float *)
806,183,065 24,234,968 124 174 1 1 512 1 1 16 0.000 0.000 NVIDIA L4 (0) 1 7 k2(int, int, float *)
#
I did not do a careful verification of the SASS, but I did confirm that the k1 kernel contains I2FP instructions and the k2 kernel does not.