I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.
I created the following program to illustrate the problem:
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
global void less_threads(float * d_out) {
int num_inliers;
for (int j=0;j<800;++j) {
//Do 12 computations
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
num_inliers += j*(j+5);
num_inliers += j*(j+6);
num_inliers += j*(j+7);
num_inliers += j*(j+8);
num_inliers += j*(j+9);
num_inliers += j*(j+10);
num_inliers += j*(j+11);
num_inliers += j*(j+12);
}
if (threadIdx.x == -1)
d_out[threadIdx.x] = num_inliers;
}
global void more_threads(float *d_out) {
int num_inliers;
for (int j=0;j<800;++j) {
// Do 4 computations
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
}
if (threadIdx.x == -1)
d_out[threadIdx.x] = num_inliers;
}
int main(int argc, char* argv)
{
float *d_out = NULL;
cudaMalloc((void**)&d_out,sizeof(float)*25000);
more_threads<<<780,128>>>(d_out);
less_threads<<<780,32>>>(d_out);
return 0;
}
[/codebox]
and the ptx output
[codebox] .version 1.4
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with C:\CUDA\bin64/../open64/lib//be.exe
// nvopencc 2.3 built on 2009-08-03
//-----------------------------------------------------------
// Compiling sample.cpp3.i (C:/Users/zenna/AppData/Local/Temp/ccBI#.a05244)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "sample.cudafe2.gpu"
.file 2 "c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"
.file 3 "C:\CUDA\include\crt/device_runtime.h"
.file 4 "C:\CUDA\include\host_defines.h"
.file 5 "C:\CUDA\include\builtin_types.h"
.file 6 "c:\cuda\include\device_types.h"
.file 7 "c:\cuda\include\driver_types.h"
.file 8 "c:\cuda\include\texture_types.h"
.file 9 "c:\cuda\include\vector_types.h"
.file 10 "c:\cuda\include\host_defines.h"
.file 11 "C:\CUDA\include\device_launch_parameters.h"
.file 12 "c:\cuda\include\crt\storage_class.h"
.file 13 "c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"
.file 14 "c:/Users/zenna/Documents/Visual Studio 2008/Projects/nspace2/nspace2/sample.cu"
.file 15 "C:\CUDA\include\common_functions.h"
.file 16 "c:\cuda\include\crt/func_macro.h"
.file 17 "c:\cuda\include\math_functions.h"
.file 18 "c:\cuda\include\device_functions.h"
.file 19 "c:\cuda\include\math_constants.h"
.file 20 "c:\cuda\include\sm_11_atomic_functions.h"
.file 21 "c:\cuda\include\sm_12_atomic_functions.h"
.file 22 "c:\cuda\include\sm_13_double_functions.h"
.file 23 "c:\cuda\include\common_types.h"
.file 24 "c:\cuda\include\texture_fetch_functions.h"
.file 25 "c:\cuda\include\math_functions_dbl_ptx1.h"
.entry _Z12less_threadsPf (
.param .u64 __cudaparm__Z12less_threadsPf_d_out)
{
.reg .u32 %r<32>;
.reg .u64 %rd<6>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 14 6 0
$LBB1__Z12less_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, 0;
mov.s32 %r5, 0;
mov.s32 %r6, 0;
mov.s32 %r7, 0;
mov.s32 %r8, 0;
mov.s32 %r9, 0;
mov.s32 %r10, 0;
mov.s32 %r11, 0;
mov.s32 %r12, %r13;
mov.s32 %r14, 0;
$Lt_0_2562:
// Loop body line 6, nesting depth: 1, iterations: 800
.loc 14 10 0
mul.lo.s32 %r15, %r14, %r14;
add.s32 %r16, %r12, %r14;
add.s32 %r12, %r15, %r16;
.loc 14 11 0
add.s32 %r17, %r15, %r12;
add.s32 %r12, %r1, %r17;
.loc 14 12 0
add.s32 %r18, %r15, %r12;
add.s32 %r12, %r2, %r18;
.loc 14 13 0
add.s32 %r19, %r15, %r12;
add.s32 %r12, %r3, %r19;
.loc 14 14 0
add.s32 %r20, %r15, %r12;
add.s32 %r12, %r4, %r20;
.loc 14 15 0
add.s32 %r21, %r15, %r12;
add.s32 %r12, %r5, %r21;
.loc 14 16 0
add.s32 %r22, %r15, %r12;
add.s32 %r12, %r6, %r22;
.loc 14 17 0
add.s32 %r23, %r15, %r12;
add.s32 %r12, %r7, %r23;
.loc 14 18 0
add.s32 %r24, %r15, %r12;
add.s32 %r12, %r8, %r24;
.loc 14 19 0
add.s32 %r25, %r15, %r12;
add.s32 %r12, %r9, %r25;
.loc 14 20 0
add.s32 %r26, %r15, %r12;
add.s32 %r12, %r10, %r26;
.loc 14 21 0
add.s32 %r27, %r15, %r12;
add.s32 %r12, %r11, %r27;
add.s32 %r14, %r14, 1;
add.s32 %r11, %r11, 12;
add.s32 %r10, %r10, 11;
add.s32 %r9, %r9, 10;
add.s32 %r8, %r8, 9;
add.s32 %r7, %r7, 8;
add.s32 %r6, %r6, 7;
add.s32 %r5, %r5, 6;
add.s32 %r4, %r4, 5;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r28, 1600;
setp.ne.s32 %p1, %r1, %r28;
@%p1 bra $Lt_0_2562;
cvt.u32.u16 %r29, %tid.x;
mov.u32 %r30, -1;
setp.ne.u32 %p2, %r29, %r30;
@%p2 bra $Lt_0_3074;
.loc 14 25 0
cvt.rn.f32.s32 %f1, %r12;
ld.param.u64 %rd1, [__cudaparm__Z12less_threadsPf_d_out];
cvt.u64.u32 %rd2, %r29;
mul.lo.u64 %rd3, %rd2, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.f32 [%rd4+0], %f1;
$Lt_0_3074:
.loc 14 26 0
exit;
$LDWend__Z12less_threadsPf:
} // _Z12less_threadsPf
.entry _Z12more_threadsPf (
.param .u64 __cudaparm__Z12more_threadsPf_d_out)
{
.reg .u32 %r<16>;
.reg .u64 %rd<6>;
.reg .f32 %f<3>;
.reg .pred %p<4>;
.loc 14 28 0
$LBB1__Z12more_threadsPf:
mov.s32 %r1, 0;
mov.s32 %r2, 0;
mov.s32 %r3, 0;
mov.s32 %r4, %r5;
mov.s32 %r6, 0;
$Lt_1_2562:
// Loop body line 28, nesting depth: 1, iterations: 800
.loc 14 32 0
mul.lo.s32 %r7, %r6, %r6;
add.s32 %r8, %r4, %r6;
add.s32 %r4, %r7, %r8;
.loc 14 33 0
add.s32 %r9, %r7, %r4;
add.s32 %r4, %r1, %r9;
.loc 14 34 0
add.s32 %r10, %r7, %r4;
add.s32 %r4, %r2, %r10;
.loc 14 35 0
add.s32 %r11, %r7, %r4;
add.s32 %r4, %r3, %r11;
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, 4;
add.s32 %r2, %r2, 3;
add.s32 %r1, %r1, 2;
mov.u32 %r12, 1600;
setp.ne.s32 %p1, %r1, %r12;
@%p1 bra $Lt_1_2562;
cvt.u32.u16 %r13, %tid.x;
mov.u32 %r14, -1;
setp.ne.u32 %p2, %r13, %r14;
@%p2 bra $Lt_1_3074;
.loc 14 38 0
cvt.rn.f32.s32 %f1, %r4;
ld.param.u64 %rd1, [__cudaparm__Z12more_threadsPf_d_out];
cvt.u64.u32 %rd2, %r13;
mul.lo.u64 %rd3, %rd2, 4;
add.u64 %rd4, %rd1, %rd3;
st.global.f32 [%rd4+0], %f1;
$Lt_1_3074:
.loc 14 39 0
exit;
$LDWend__Z12more_threadsPf:
} // _Z12more_threadsPf
[/codebox]
Note both kernels should do the same amount of work in total, the (if threadIdx.x == -1 is a trick to stop the compiler optimising everything out and leaving an empty kernel). The work should be the same as more_threads is using 4 times as many threads but with each thread doing 4 times less work.
Significant results form the profiler results are as followsL:
more_threads: GPU runtime = 1474 us,reg per thread = 6,occupancy=1,branch=83746,divergent_branch = 26,instructions = 584065,gst request=1084552
less_threads: GPU runtime = 921 us,reg per thread = 14,occupancy=0.25,branch=20956,divergent_branch = 26,instructions = 312663,gst request=677381
As I said previously, the run time of the kernel using more threads is longer, this could be due to the increased number of instructions.
[b]Why are there more instructions?
Why is there any branching, let alone divergent branching, considering there is no conditional code?
Why are there any gst requests when there is no global memory access?[/b]
What is going on here!
Thanks
[b]Update
[/b]
Apologies, I pasted the wrong code, the correct code is now shown and should compile