Math intrinsics are not speeding up performance

Hi!!!

I have a GTX 1080 Ti GPU, using CUDA 8.0.

CMake:

find_package(CUDA 8.0 REQUIRED)

set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G -g -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60")

set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} "-gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60")

set(CUDA_PROPAGATE_HOST_FLAGS ON)
enable_language(CUDA)

I have this kernel:

__global__ void test_Kernel()
{
    float x;
    for (int i = 0; i < 100000000; i++)
    {
        x = __expf((float)i);
        x = exp((float)i);

        x = __cosf((float)i);
        x = cos((float)i);
    }
}

// DivUp is assumed here to be the usual ceiling division (not shown in the original post)
static int DivUp(int a, int b) { return (a + b - 1) / b; }

void my_function()
{
    int width  = 500;
    int height = 500;

    int thread_x = 16;
    int thread_y = 8;

    dim3 Num_Threads_per_Block(thread_x, thread_y);
    dim3 Num_Blocks(DivUp(width, thread_x), DivUp(height, thread_y));

    for (int i = 0; i < 1000; i++)
    {
        test_Kernel<<<Num_Blocks, Num_Threads_per_Block>>>();
    }
}

Inside the kernel I ran two tests:

Test1:

x = __expf((float)i);
//x = exp((float)i);

x = __cosf((float)i);
//x = cos((float)i);

Test2:

//x = __expf((float)i);
x = exp((float)i);

//x = __cosf((float)i);
x = cos((float)i);

In both tests I am getting the same performance; there is no speedup…

I also checked the generated assembly with both versions of the calls enabled:

x = __expf((float)i);
x = exp((float)i);

x = __cosf((float)i);
x = cos((float)i);

and I am getting this SASS:

float x;
        for (int i = 0;i < 100000000;i++)
0x000da230       ISETP.LT.AND P0, PT, R16, c[0x2][0x0], PT   
0x000da238       PSETP.AND.AND P0, PT, !P0, PT, PT   
0x000da240       NOP  
0x000da248  @P0  BRA 0x5b0   
0x000da250       BRA 0x3d8   
        {
            x = __expf((float)i);
0x000da258       I2F.F32.S32 R0, R16   
0x000da260       NOP  
0x000da268       MOV R0, R0   
        {
            x = __expf((float)i);
0x000da270       BRA 0x3f8   
--- c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp 
  return __nv_fast_expf(a);
0x000da278       FMUL32I R2, R0, 1.4426950216293334961   
0x000da280       NOP  
0x000da288       MOV R2, R2   
0x000da290       MOV R2, R2   
0x000da298       MOV R2, R2   
0x000da2a0       NOP  
0x000da2a8       MOV R2, R2   
0x000da2b0       MOV32I R3, 0xc2fc0000   
0x000da2b8       FSETP.LT.AND P0, PT, R2, R3, PT   
0x000da2c0       NOP  
0x000da2c8       MOV32I R3, 0x3f000000   
0x000da2d0       FMUL R3, R2, R3   
0x000da2d8       SEL R2, R3, R2, P0   
0x000da2e0       NOP  
0x000da2e8       RRO.EX2 R2, R2   
0x000da2f0       MUFU.EX2 R2, R2   
0x000da2f8       MOV R2, R2   
0x000da300       NOP  
0x000da308       FMUL R3, R2, R2   
0x000da310       SEL R2, R3, R2, P0   
0x000da318       MOV R2, R2   
0x000da320       NOP  
0x000da328       MOV R2, R2   
0x000da330       MOV R2, R2   
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
0x000da338       MOV R0, R2   
0x000da340       NOP  
0x000da348       MOV R2, R0   
#endif /* __CUDACC_RTC__ */
0x000da350       I2F.F32.S32 R0, R16   
0x000da358       MOV R4, R0   
0x000da360       NOP  
0x000da368       JCAL 0xd6340   
0x000da370       MOV R0, R4   
0x000da378       MOV R0, R0   
0x000da380       NOP  

#include "builtin_types.h"
0x000da388       I2F.F32.S32 R2, R16   
0x000da390       MOV R2, R2   

#include "builtin_types.h"
0x000da398       BRA 0x528   
0x000da3a0       NOP  
--- c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp 
  return __nv_fast_cosf(a);
0x000da3a8       RRO.SINCOS R3, R2   
0x000da3b0       MUFU.COS R3, R3   
0x000da3b8       MOV R3, R3   
0x000da3c0       NOP  

			x = __cosf((float)i);
0x000da3c8       MOV R2, R3   
0x000da3d0       MOV R2, R2   
			x = cos((float)i);
0x000da3d8       I2F.F32.S32 R0, R16   
0x000da3e0       NOP  
0x000da3e8       MOV R4, R0   
0x000da3f0       JCAL 0xd8480   
0x000da3f8       MOV R0, R4   
0x000da400       NOP  
0x000da408       MOV R0, R0

It seems it is calling the proper functions for the intrinsics…

return __nv_fast_expf(a);
return __nv_fast_cosf(a);

Am I missing something?

Thanks in advance!!!

PS: This is the time measurement code:

int LOOP_SIZE = 100;
    cudaEvent_t start, stop;
    float time_measurement = 0.0f;
    float sum_time_measurement = 0.0f;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    for (int i = 0; i < LOOP_SIZE; i++)
    {
        cudaEventRecord(start, 0);
        my_function();   // matches the signature shown above
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time_measurement, start, stop);
        sum_time_measurement += time_measurement;
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf("Time ms: %f \n", sum_time_measurement / LOOP_SIZE);

For a release build, as part of normal optimizations, the CUDA compiler will eliminate all dead code. Dead code refers to code that does not modify externally visible state. Your kernel has no externally visible state, because ‘x’ is never stored to global memory. That means the entire loop can be optimized away.
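In other words, a release build is effectively entitled to reduce the posted kernel to this, since x is written but never stored anywhere visible:

__global__ void test_Kernel()
{
    // the entire loop is dead code: no store to global memory,
    // no externally visible effect, so the optimizer removes it
}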

One approach to such timing of math functions is to accumulate (add up) the function results into a variable that is stored out to global memory at the end of the kernel. You should never benchmark a binary (executable) generated by a debug build.
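A minimal sketch of that approach (the out buffer and the index computation are illustrative additions, not part of the original code):

__global__ void test_Kernel(float *out)
{
    float x = 0.0f;
    for (int i = 0; i < 100000000; i++)
    {
        x += __expf((float)i);   // swap in exp()/cos() for the comparison run
        x += __cosf((float)i);
    }
    // one store per thread makes the result externally visible,
    // so the compiler can no longer eliminate the loop
    int tid = (blockIdx.y * gridDim.x + blockIdx.x) * (blockDim.x * blockDim.y)
            + threadIdx.y * blockDim.x + threadIdx.x;
    out[tid] = x;
}

The numerical value of the accumulator does not matter; what matters is that the math now feeds a store to global memory. The launch then becomes test_Kernel<<<Num_Blocks, Num_Threads_per_Block>>>(d_out), with d_out sized to the total number of threads.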

In general, when seeking assistance with code:

(1) Post a minimal, complete, self-contained example that others can build and run
(2) State the exact command line used to invoke nvcc (not CMake configuration data)
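(For a CMake-based build, the exact nvcc invocation can usually be recovered by building with "make VERBOSE=1", or by inspecting the generated build files.)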

You are right!!! Thanks for helping!!!

Have a nice weekend.