CUDA: more threads for the same work = longer run time despite better occupancy. Why?

I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.

I created the following program to illustrate the problem:

[codebox]

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

__global__ void less_threads(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 800; ++j) {
		// Do 12 computations
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
		num_inliers += j*(j+5);
		num_inliers += j*(j+6);
		num_inliers += j*(j+7);
		num_inliers += j*(j+8);
		num_inliers += j*(j+9);
		num_inliers += j*(j+10);
		num_inliers += j*(j+11);
		num_inliers += j*(j+12);
	}
	if (threadIdx.x == -1)
		d_out[threadIdx.x] = num_inliers;
}

__global__ void more_threads(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 800; ++j) {
		// Do 4 computations
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
	}
	if (threadIdx.x == -1)
		d_out[threadIdx.x] = num_inliers;
}

int main(int argc, char **argv)
{
	float *d_out = NULL;
	cudaMalloc((void**)&d_out, sizeof(float) * 25000);
	more_threads<<<780, 128>>>(d_out);
	less_threads<<<780, 32>>>(d_out);
	cudaThreadSynchronize(); // wait for both kernels to finish before exiting
	return 0;
}

[/codebox]

and the PTX output:

[codebox]
.version 1.4

.target sm_10, map_f64_to_f32

// compiled with C:\CUDA\bin64/../open64/lib//be.exe

// nvopencc 2.3 built on 2009-08-03

//-----------------------------------------------------------

// Compiling sample.cpp3.i (C:/Users/zenna/AppData/Local/Temp/ccBI#.a05244)

//-----------------------------------------------------------

//-----------------------------------------------------------

// Options:

//-----------------------------------------------------------

//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64

//  -O3	(Optimization level)

//  -g0	(Debug level)

//  -m2	(Report advisories)

//-----------------------------------------------------------

.file	1	"sample.cudafe2.gpu"

.file	2	"c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"

.file	3	"C:\CUDA\include\crt/device_runtime.h"

.file	4	"C:\CUDA\include\host_defines.h"

.file	5	"C:\CUDA\include\builtin_types.h"

.file	6	"c:\cuda\include\device_types.h"

.file	7	"c:\cuda\include\driver_types.h"

.file	8	"c:\cuda\include\texture_types.h"

.file	9	"c:\cuda\include\vector_types.h"

.file	10	"c:\cuda\include\host_defines.h"

.file	11	"C:\CUDA\include\device_launch_parameters.h"

.file	12	"c:\cuda\include\crt\storage_class.h"

.file	13	"c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"

.file	14	"c:/Users/zenna/Documents/Visual Studio 2008/Projects/nspace2/nspace2/sample.cu"

.file	15	"C:\CUDA\include\common_functions.h"

.file	16	"c:\cuda\include\crt/func_macro.h"

.file	17	"c:\cuda\include\math_functions.h"

.file	18	"c:\cuda\include\device_functions.h"

.file	19	"c:\cuda\include\math_constants.h"

.file	20	"c:\cuda\include\sm_11_atomic_functions.h"

.file	21	"c:\cuda\include\sm_12_atomic_functions.h"

.file	22	"c:\cuda\include\sm_13_double_functions.h"

.file	23	"c:\cuda\include\common_types.h"

.file	24	"c:\cuda\include\texture_fetch_functions.h"

.file	25	"c:\cuda\include\math_functions_dbl_ptx1.h"

.entry _Z12less_threadsPf (

	.param .u64 __cudaparm__Z12less_threadsPf_d_out)

{

.reg .u32 %r<32>;

.reg .u64 %rd<6>;

.reg .f32 %f<3>;

.reg .pred %p<4>;

.loc	14	6	0

$LBB1__Z12less_threadsPf:

mov.s32 	%r1, 0;

mov.s32 	%r2, 0;

mov.s32 	%r3, 0;

mov.s32 	%r4, 0;

mov.s32 	%r5, 0;

mov.s32 	%r6, 0;

mov.s32 	%r7, 0;

mov.s32 	%r8, 0;

mov.s32 	%r9, 0;

mov.s32 	%r10, 0;

mov.s32 	%r11, 0;

mov.s32 	%r12, %r13;

mov.s32 	%r14, 0;

$Lt_0_2562:

// Loop body line 6, nesting depth: 1, iterations: 800

.loc	14	10	0

mul.lo.s32 	%r15, %r14, %r14;

add.s32 	%r16, %r12, %r14;

add.s32 	%r12, %r15, %r16;

.loc	14	11	0

add.s32 	%r17, %r15, %r12;

add.s32 	%r12, %r1, %r17;

.loc	14	12	0

add.s32 	%r18, %r15, %r12;

add.s32 	%r12, %r2, %r18;

.loc	14	13	0

add.s32 	%r19, %r15, %r12;

add.s32 	%r12, %r3, %r19;

.loc	14	14	0

add.s32 	%r20, %r15, %r12;

add.s32 	%r12, %r4, %r20;

.loc	14	15	0

add.s32 	%r21, %r15, %r12;

add.s32 	%r12, %r5, %r21;

.loc	14	16	0

add.s32 	%r22, %r15, %r12;

add.s32 	%r12, %r6, %r22;

.loc	14	17	0

add.s32 	%r23, %r15, %r12;

add.s32 	%r12, %r7, %r23;

.loc	14	18	0

add.s32 	%r24, %r15, %r12;

add.s32 	%r12, %r8, %r24;

.loc	14	19	0

add.s32 	%r25, %r15, %r12;

add.s32 	%r12, %r9, %r25;

.loc	14	20	0

add.s32 	%r26, %r15, %r12;

add.s32 	%r12, %r10, %r26;

.loc	14	21	0

add.s32 	%r27, %r15, %r12;

add.s32 	%r12, %r11, %r27;

add.s32 	%r14, %r14, 1;

add.s32 	%r11, %r11, 12;

add.s32 	%r10, %r10, 11;

add.s32 	%r9, %r9, 10;

add.s32 	%r8, %r8, 9;

add.s32 	%r7, %r7, 8;

add.s32 	%r6, %r6, 7;

add.s32 	%r5, %r5, 6;

add.s32 	%r4, %r4, 5;

add.s32 	%r3, %r3, 4;

add.s32 	%r2, %r2, 3;

add.s32 	%r1, %r1, 2;

mov.u32 	%r28, 1600;

setp.ne.s32 	%p1, %r1, %r28;

@%p1 bra 	$Lt_0_2562;

cvt.u32.u16 	%r29, %tid.x;

mov.u32 	%r30, -1;

setp.ne.u32 	%p2, %r29, %r30;

@%p2 bra 	$Lt_0_3074;

.loc	14	25	0

cvt.rn.f32.s32 	%f1, %r12;

ld.param.u64 	%rd1, [__cudaparm__Z12less_threadsPf_d_out];

cvt.u64.u32 	%rd2, %r29;

mul.lo.u64 	%rd3, %rd2, 4;

add.u64 	%rd4, %rd1, %rd3;

st.global.f32 	[%rd4+0], %f1;

$Lt_0_3074:

.loc	14	26	0

exit;

$LDWend__Z12less_threadsPf:

} // _Z12less_threadsPf

.entry _Z12more_threadsPf (

	.param .u64 __cudaparm__Z12more_threadsPf_d_out)

{

.reg .u32 %r<16>;

.reg .u64 %rd<6>;

.reg .f32 %f<3>;

.reg .pred %p<4>;

.loc	14	28	0

$LBB1__Z12more_threadsPf:

mov.s32 	%r1, 0;

mov.s32 	%r2, 0;

mov.s32 	%r3, 0;

mov.s32 	%r4, %r5;

mov.s32 	%r6, 0;

$Lt_1_2562:

// Loop body line 28, nesting depth: 1, iterations: 800

.loc	14	32	0

mul.lo.s32 	%r7, %r6, %r6;

add.s32 	%r8, %r4, %r6;

add.s32 	%r4, %r7, %r8;

.loc	14	33	0

add.s32 	%r9, %r7, %r4;

add.s32 	%r4, %r1, %r9;

.loc	14	34	0

add.s32 	%r10, %r7, %r4;

add.s32 	%r4, %r2, %r10;

.loc	14	35	0

add.s32 	%r11, %r7, %r4;

add.s32 	%r4, %r3, %r11;

add.s32 	%r6, %r6, 1;

add.s32 	%r3, %r3, 4;

add.s32 	%r2, %r2, 3;

add.s32 	%r1, %r1, 2;

mov.u32 	%r12, 1600;

setp.ne.s32 	%p1, %r1, %r12;

@%p1 bra 	$Lt_1_2562;

cvt.u32.u16 	%r13, %tid.x;

mov.u32 	%r14, -1;

setp.ne.u32 	%p2, %r13, %r14;

@%p2 bra 	$Lt_1_3074;

.loc	14	38	0

cvt.rn.f32.s32 	%f1, %r4;

ld.param.u64 	%rd1, [__cudaparm__Z12more_threadsPf_d_out];

cvt.u64.u32 	%rd2, %r13;

mul.lo.u64 	%rd3, %rd2, 4;

add.u64 	%rd4, %rd1, %rd3;

st.global.f32 	[%rd4+0], %f1;

$Lt_1_3074:

.loc	14	39	0

exit;

$LDWend__Z12more_threadsPf:

} // _Z12more_threadsPf

[/codebox]

Note that the two kernels are intended to do the same total amount of work (the if (threadIdx.x == -1) test is a trick to stop the compiler from optimising everything away and leaving an empty kernel): more_threads uses four times as many threads, with each thread doing less work per loop iteration (4 computations versus 12).

The significant numbers from the profiler are as follows:

more_threads: GPU runtime = 1474 us, registers per thread = 6, occupancy = 1, branch = 83746, divergent_branch = 26, instructions = 584065, gst_request = 1084552

less_threads: GPU runtime = 921 us, registers per thread = 14, occupancy = 0.25, branch = 20956, divergent_branch = 26, instructions = 312663, gst_request = 677381

As I said previously, the run time of the kernel using more threads is longer; this could be due to the increased instruction count.

[b]Why are there more instructions?

Why is there any branching, let alone divergent branching, considering there is no conditional code?

Why are there any gst requests when there is no global memory access?[/b]

What is going on here?

Thanks

[b]Update[/b]

Apologies, I pasted the wrong code; the correct code is now shown and should compile.

Hard to say, given that the C code and the PTX you have posted don’t match (the C won’t even compile).

The for loop compiles to conditional branching.

The PTX code suggests that your threadIdx.x == -1 “trick” doesn’t work (or at least it doesn’t do what you think it does), because there doesn’t seem to be any conditional code around the global memory store. So every thread is writing to global memory.

I just read that threadIdx is uint3, which would explain why the trick doesn’t work, so various bits of code may be being optimised out.

(blockIdx and threadIdx are uint3; gridDim and blockDim are dim3, which it says is ‘based on uint3’; warpSize is an int.)
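For what it’s worth, one way to keep the compiler honest without the signed/unsigned pitfall is to make the result observably used. A minimal sketch (more_threads_kept is a hypothetical variant, and d_out would need to be allocated large enough for one float per thread):

[codebox]
__global__ void more_threads_kept(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 800; ++j) {
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
	}
	// Unconditional store: the result is observably used, so the
	// compiler cannot remove the loop as dead code (at the cost of
	// one real global store per thread).
	d_out[blockIdx.x * blockDim.x + threadIdx.x] = num_inliers;
}
[/codebox]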

I fixed the code

Any insight as to why using more threads makes things slower?

Quick guess: more threads => bigger blocks => fewer active blocks per SM => less room for context switching between blocks to hide global memory latencies.

Hi

Thanks for your answer, but I am not sure that is correct. You are suggesting worse occupancy due to the bigger blocks, whereas the occupancy is actually better when using larger blocks.
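For concreteness, assuming a GT200-class device with at most 1024 resident threads and 8 resident blocks per SM (an assumption, since the device is not stated), the profiler’s occupancy figures work out as:

$$\text{less\_threads (32-thread blocks)}: \frac{8 \times 32}{1024} = 0.25, \qquad \text{more\_threads (128-thread blocks)}: \frac{8 \times 128}{1024} = 1.0$$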

If the compiler were really smart, it could refactor

[codebox]
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
num_inliers += j*(j+5);
num_inliers += j*(j+6);
num_inliers += j*(j+7);
num_inliers += j*(j+8);
num_inliers += j*(j+9);
num_inliers += j*(j+10);
num_inliers += j*(j+11);
num_inliers += j*(j+12);
[/codebox]

into just

[codebox]
num_inliers += j*(12*j+78);
[/codebox]
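For reference, the folded constant follows from summing the twelve terms:

$$\sum_{k=1}^{12} j\,(j+k) = 12j^2 + j\sum_{k=1}^{12} k = 12j^2 + 78j = j\,(12j + 78).$$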

Such an algebraic simplification becomes more effective the more terms it can combine, so the fewer-threads case, in which each thread does a lot of work per iteration, would simplify better than many threads each doing a small piece of work (if, for instance, the loop body limits the number of terms the compiler can combine).

I don’t actually know if the compiler is that smart, of course, but you never know…

I was suggesting similar occupancy but with more blocks.

But after having looked at your code, it seems you would have 8 or 4 blocks per SM anyway (depending on the device), so I don’t think that’s the issue either.

Actually, I am not sure the profiler time is a good guide for such things. Maybe you should do your own timing, like they do in the SDK:

[codebox]

unsigned int timer = 0;
cutCreateTimer(&timer);

int numIterations = 100;
for (int i = 0; i < numIterations; i++)
{
	cutStartTimer(timer);
	kernel<<<>>>(...);
	cudaThreadSynchronize(); // kernel launches are asynchronous, so wait before stopping the timer
	cutStopTimer(timer);
}

float time = cutGetAverageTimerValue(timer); // average time per iteration, in ms

[/codebox]
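Alternatively, the cudaEvent API times the kernel on the GPU itself and avoids the cutil helpers. A sketch, where grid, block, and the kernel arguments are placeholders:

[codebox]
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
for (int i = 0; i < numIterations; i++)
	kernel<<<grid, block>>>(/* args */); // placeholder launch
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // wait until the stop event has been reached

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
printf("average kernel time: %f ms\n", elapsed_ms / numIterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
[/codebox]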

If we compare more_threads with

[codebox]

__global__ void less_threads_new(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 3200; ++j) {
		// Do 4 computations
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
	}
	if (threadIdx.x == -1)
		d_out[threadIdx.x] = num_inliers;
}

[/codebox]

the result is that more_threads runs faster.