Hi!!!
I have a GTX 1080 Ti GPU, using CUDA 8.0.
CMake:
find_package(CUDA REQUIRED 8.0)
set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G -g -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS_RELEASE ${CUDA_NVCC_FLAGS_RELEASE} " -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60")
SET(CUDA_PROPAGATE_HOST_FLAGS ON)
enable_language(CUDA)
I have this Kernel:
__global__ void test_Kernel()
{
float x;
for (int i = 0;i < 100000000;i++)
{
x = __expf((float)i);
x = exp((float)i);
x = __cosf((float)i);
x = cos((float)i);
}
}
void my_function()
{
int width = 500;
int height= 500;
int thread_x = 16;
int thread_y = 8;
dim3 Num_Threads_per_Block(thread_x, thread_y);
dim3 Num_Blocks(DivUp(width, thread_x), DivUp(height, thread_y));
for (int i = 0;i < 1000;i++)
{
test_Kernel<< < Num_Blocks, Num_Threads_per_Block >> > ();
}
}
In the Kernel function I did two tests:
Test1:
x = __expf((float)i);
//x = exp((float)i);
x = __cosf((float)i);
//x = cos((float)i);
Test2:
//x = __expf((float)i);
x = exp((float)i);
//x = __cosf((float)i);
x = cos((float)i);
In both tests I am getting the same performance, no speed up…
I was checking the assembler code with the code like this:
x = __expf((float)i);
x = exp((float)i);
x = __cosf((float)i);
x = cos((float)i);
and I am getting this SASS:
float x;
for (int i = 0;i < 100000000;i++)
0x000da230 ISETP.LT.AND P0, PT, R16, c[0x2][0x0], PT
0x000da238 PSETP.AND.AND P0, PT, !P0, PT, PT
0x000da240 NOP
0x000da248 @P0 BRA 0x5b0
0x000da250 BRA 0x3d8
{
x = __expf((float)i);
0x000da258 I2F.F32.S32 R0, R16
0x000da260 NOP
0x000da268 MOV R0, R0
{
x = __expf((float)i);
0x000da270 BRA 0x3f8
--- c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp
return __nv_fast_expf(a);
0x000da278 FMUL32I R2, R0, 1.4426950216293334961
0x000da280 NOP
0x000da288 MOV R2, R2
0x000da290 MOV R2, R2
0x000da298 MOV R2, R2
0x000da2a0 NOP
0x000da2a8 MOV R2, R2
0x000da2b0 MOV32I R3, 0xc2fc0000
0x000da2b8 FSETP.LT.AND P0, PT, R2, R3, PT
0x000da2c0 NOP
0x000da2c8 MOV32I R3, 0x3f000000
0x000da2d0 FMUL R3, R2, R3
0x000da2d8 SEL R2, R3, R2, P0
0x000da2e0 NOP
0x000da2e8 RRO.EX2 R2, R2
0x000da2f0 MUFU.EX2 R2, R2
0x000da2f8 MOV R2, R2
0x000da300 NOP
0x000da308 FMUL R3, R2, R2
0x000da310 SEL R2, R3, R2, P0
0x000da318 MOV R2, R2
0x000da320 NOP
0x000da328 MOV R2, R2
0x000da330 MOV R2, R2
#define __DEVICE_FUNCTIONS_DECL__ __device__
#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__
0x000da338 MOV R0, R2
0x000da340 NOP
0x000da348 MOV R2, R0
#endif /* __CUDACC_RTC__ */
0x000da350 I2F.F32.S32 R0, R16
0x000da358 MOV R4, R0
0x000da360 NOP
0x000da368 JCAL 0xd6340
0x000da370 MOV R0, R4
0x000da378 MOV R0, R0
0x000da380 NOP
#include "builtin_types.h"
0x000da388 I2F.F32.S32 R2, R16
0x000da390 MOV R2, R2
#include "builtin_types.h"
0x000da398 BRA 0x528
0x000da3a0 NOP
--- c:\program files\nvidia gpu computing toolkit\cuda\v8.0\include\device_functions.hpp
return __nv_fast_cosf(a);
0x000da3a8 RRO.SINCOS R3, R2
0x000da3b0 MUFU.COS R3, R3
0x000da3b8 MOV R3, R3
0x000da3c0 NOP
x = __cosf((float)i);
0x000da3c8 MOV R2, R3
0x000da3d0 MOV R2, R2
x = cos((float)i);
0x000da3d8 I2F.F32.S32 R0, R16
0x000da3e0 NOP
0x000da3e8 MOV R4, R0
0x000da3f0 JCAL 0xd8480
0x000da3f8 MOV R0, R4
0x000da400 NOP
0x000da408 MOV R0, R0
It seems that it is calling to the proper functions for intrinsics…
return __nv_fast_expf(a);
return __nv_fast_cosf(a);
Am I missing something?
Thx in advance!!!
PS: The time measurement
int LOOP_SIZE = 100;
cudaEvent_t start, stop;
float time_measurement = 0.0f;
float sum_time_measurement = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for (uint16_t i = 0;i < LOOP_SIZE;i++)
{
cudaEventRecord(start, 0);
my_function(im_rgb_host.data, image_gray.data, im_rgb_host.cols, im_rgb_host.rows);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_measurement, start, stop);
sum_time_measurement += time_measurement;
}
printf("Time ms: %f \n", sum_time_measurement/LOOP_SIZE);