CUDA: more threads for the same work = longer run time despite better occupancy. Why?

I encountered a strange problem where increasing my occupancy by increasing the number of threads reduced performance.

I created the following program to illustrate the problem:

[codebox]

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

__global__ void less_threads(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 800; ++j) {
		// Do 12 computations
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
		num_inliers += j*(j+5);
		num_inliers += j*(j+6);
		num_inliers += j*(j+7);
		num_inliers += j*(j+8);
		num_inliers += j*(j+9);
		num_inliers += j*(j+10);
		num_inliers += j*(j+11);
		num_inliers += j*(j+12);
	}
	if (threadIdx.x == -1)
		d_out[threadIdx.x] = num_inliers;
}

__global__ void more_threads(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 800; ++j) {
		// Do 4 computations
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
	}
	if (threadIdx.x == -1)
		d_out[threadIdx.x] = num_inliers;
}

int main(int argc, char **argv)
{
	float *d_out = NULL;
	cudaMalloc((void**)&d_out, sizeof(float) * 25000);
	more_threads<<<780, 128>>>(d_out);
	less_threads<<<780, 32>>>(d_out);
	cudaThreadSynchronize(); // wait for both kernels to finish before exiting
	return 0;
}

[/codebox]

and the PTX output:

[codebox]
.version 1.4

.target sm_10, map_f64_to_f32

// compiled with C:\CUDA\bin64/../open64/lib//be.exe

// nvopencc 2.3 built on 2009-08-03

//-----------------------------------------------------------

// Compiling sample.cpp3.i (C:/Users/zenna/AppData/Local/Temp/ccBI#.a05244)

//-----------------------------------------------------------

//-----------------------------------------------------------

// Options:

//-----------------------------------------------------------

//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64

//  -O3	(Optimization level)

//  -g0	(Debug level)

//  -m2	(Report advisories)

//-----------------------------------------------------------

.file	1	"sample.cudafe2.gpu"

.file	2	"c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\crtdefs.h"

.file	3	"C:\CUDA\include\crt/device_runtime.h"

.file	4	"C:\CUDA\include\host_defines.h"

.file	5	"C:\CUDA\include\builtin_types.h"

.file	6	"c:\cuda\include\device_types.h"

.file	7	"c:\cuda\include\driver_types.h"

.file	8	"c:\cuda\include\texture_types.h"

.file	9	"c:\cuda\include\vector_types.h"

.file	10	"c:\cuda\include\host_defines.h"

.file	11	"C:\CUDA\include\device_launch_parameters.h"

.file	12	"c:\cuda\include\crt\storage_class.h"

.file	13	"c:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\include\time.h"

.file	14	"c:/Users/zenna/Documents/Visual Studio 2008/Projects/nspace2/nspace2/sample.cu"

.file	15	"C:\CUDA\include\common_functions.h"

.file	16	"c:\cuda\include\crt/func_macro.h"

.file	17	"c:\cuda\include\math_functions.h"

.file	18	"c:\cuda\include\device_functions.h"

.file	19	"c:\cuda\include\math_constants.h"

.file	20	"c:\cuda\include\sm_11_atomic_functions.h"

.file	21	"c:\cuda\include\sm_12_atomic_functions.h"

.file	22	"c:\cuda\include\sm_13_double_functions.h"

.file	23	"c:\cuda\include\common_types.h"

.file	24	"c:\cuda\include\texture_fetch_functions.h"

.file	25	"c:\cuda\include\math_functions_dbl_ptx1.h"

.entry _Z12less_threadsPf (

	.param .u64 __cudaparm__Z12less_threadsPf_d_out)

{

.reg .u32 %r<32>;

.reg .u64 %rd<6>;

.reg .f32 %f<3>;

.reg .pred %p<4>;

.loc	14	6	0

$LBB1__Z12less_threadsPf:

mov.s32 	%r1, 0;

mov.s32 	%r2, 0;

mov.s32 	%r3, 0;

mov.s32 	%r4, 0;

mov.s32 	%r5, 0;

mov.s32 	%r6, 0;

mov.s32 	%r7, 0;

mov.s32 	%r8, 0;

mov.s32 	%r9, 0;

mov.s32 	%r10, 0;

mov.s32 	%r11, 0;

mov.s32 	%r12, %r13;

mov.s32 	%r14, 0;

$Lt_0_2562:

// Loop body line 6, nesting depth: 1, iterations: 800

.loc	14	10	0

mul.lo.s32 	%r15, %r14, %r14;

add.s32 	%r16, %r12, %r14;

add.s32 	%r12, %r15, %r16;

.loc	14	11	0

add.s32 	%r17, %r15, %r12;

add.s32 	%r12, %r1, %r17;

.loc	14	12	0

add.s32 	%r18, %r15, %r12;

add.s32 	%r12, %r2, %r18;

.loc	14	13	0

add.s32 	%r19, %r15, %r12;

add.s32 	%r12, %r3, %r19;

.loc	14	14	0

add.s32 	%r20, %r15, %r12;

add.s32 	%r12, %r4, %r20;

.loc	14	15	0

add.s32 	%r21, %r15, %r12;

add.s32 	%r12, %r5, %r21;

.loc	14	16	0

add.s32 	%r22, %r15, %r12;

add.s32 	%r12, %r6, %r22;

.loc	14	17	0

add.s32 	%r23, %r15, %r12;

add.s32 	%r12, %r7, %r23;

.loc	14	18	0

add.s32 	%r24, %r15, %r12;

add.s32 	%r12, %r8, %r24;

.loc	14	19	0

add.s32 	%r25, %r15, %r12;

add.s32 	%r12, %r9, %r25;

.loc	14	20	0

add.s32 	%r26, %r15, %r12;

add.s32 	%r12, %r10, %r26;

.loc	14	21	0

add.s32 	%r27, %r15, %r12;

add.s32 	%r12, %r11, %r27;

add.s32 	%r14, %r14, 1;

add.s32 	%r11, %r11, 12;

add.s32 	%r10, %r10, 11;

add.s32 	%r9, %r9, 10;

add.s32 	%r8, %r8, 9;

add.s32 	%r7, %r7, 8;

add.s32 	%r6, %r6, 7;

add.s32 	%r5, %r5, 6;

add.s32 	%r4, %r4, 5;

add.s32 	%r3, %r3, 4;

add.s32 	%r2, %r2, 3;

add.s32 	%r1, %r1, 2;

mov.u32 	%r28, 1600;

setp.ne.s32 	%p1, %r1, %r28;

@%p1 bra 	$Lt_0_2562;

cvt.u32.u16 	%r29, %tid.x;

mov.u32 	%r30, -1;

setp.ne.u32 	%p2, %r29, %r30;

@%p2 bra 	$Lt_0_3074;

.loc	14	25	0

cvt.rn.f32.s32 	%f1, %r12;

ld.param.u64 	%rd1, [__cudaparm__Z12less_threadsPf_d_out];

cvt.u64.u32 	%rd2, %r29;

mul.lo.u64 	%rd3, %rd2, 4;

add.u64 	%rd4, %rd1, %rd3;

st.global.f32 	[%rd4+0], %f1;

$Lt_0_3074:

.loc	14	26	0

exit;

$LDWend__Z12less_threadsPf:

} // _Z12less_threadsPf

.entry _Z12more_threadsPf (

	.param .u64 __cudaparm__Z12more_threadsPf_d_out)

{

.reg .u32 %r<16>;

.reg .u64 %rd<6>;

.reg .f32 %f<3>;

.reg .pred %p<4>;

.loc	14	28	0

$LBB1__Z12more_threadsPf:

mov.s32 	%r1, 0;

mov.s32 	%r2, 0;

mov.s32 	%r3, 0;

mov.s32 	%r4, %r5;

mov.s32 	%r6, 0;

$Lt_1_2562:

// Loop body line 28, nesting depth: 1, iterations: 800

.loc	14	32	0

mul.lo.s32 	%r7, %r6, %r6;

add.s32 	%r8, %r4, %r6;

add.s32 	%r4, %r7, %r8;

.loc	14	33	0

add.s32 	%r9, %r7, %r4;

add.s32 	%r4, %r1, %r9;

.loc	14	34	0

add.s32 	%r10, %r7, %r4;

add.s32 	%r4, %r2, %r10;

.loc	14	35	0

add.s32 	%r11, %r7, %r4;

add.s32 	%r4, %r3, %r11;

add.s32 	%r6, %r6, 1;

add.s32 	%r3, %r3, 4;

add.s32 	%r2, %r2, 3;

add.s32 	%r1, %r1, 2;

mov.u32 	%r12, 1600;

setp.ne.s32 	%p1, %r1, %r12;

@%p1 bra 	$Lt_1_2562;

cvt.u32.u16 	%r13, %tid.x;

mov.u32 	%r14, -1;

setp.ne.u32 	%p2, %r13, %r14;

@%p2 bra 	$Lt_1_3074;

.loc	14	38	0

cvt.rn.f32.s32 	%f1, %r4;

ld.param.u64 	%rd1, [__cudaparm__Z12more_threadsPf_d_out];

cvt.u64.u32 	%rd2, %r13;

mul.lo.u64 	%rd3, %rd2, 4;

add.u64 	%rd4, %rd1, %rd3;

st.global.f32 	[%rd4+0], %f1;

$Lt_1_3074:

.loc	14	39	0

exit;

$LDWend__Z12more_threadsPf:

} // _Z12more_threadsPf

[/codebox]

Note that the two kernels are intended to do the same total amount of work (the if (threadIdx.x == -1) test is a trick to stop the compiler from optimising everything away and leaving an empty kernel): more_threads uses four times as many threads, with each thread doing less work per loop iteration (4 computations versus 12).

The significant numbers from the profiler are as follows:

more_threads: GPU runtime = 1474 us, registers per thread = 6, occupancy = 1, branch = 83746, divergent_branch = 26, instructions = 584065, gst_request = 1084552

less_threads: GPU runtime = 921 us, registers per thread = 14, occupancy = 0.25, branch = 20956, divergent_branch = 26, instructions = 312663, gst_request = 677381

As I said previously, the run time of the kernel using more threads is longer; this could be due to the increased instruction count.

[b]Why are there more instructions?

Why is there any branching, let alone divergent branching, considering there is no conditional code?

Why are there any gst requests when there is no global memory access?[/b]

What is going on here?

Thanks

[b]Update[/b]

Apologies, I pasted the wrong code; the correct code is now shown and should compile.

Hard to say, given that the C code and the PTX you have posted don’t match (the C won’t even compile).

The for loop compiles to conditional branching.

The PTX code suggests that your threadIdx.x == -1 “trick” doesn’t work (or at least it doesn’t do what you think it does), because there doesn’t seem to be any conditional code around the global memory store. So every thread is writing to global memory.

I just read that threadIdx is uint3, which would explain why the trick doesn’t work, so various bits of code may be being optimised out.

(blockIdx and threadIdx are uint3; gridDim and blockDim are dim3, which it says is ‘based on uint3’; warpSize is an int.)
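For what it’s worth, one way to keep the compiler honest without the signed/unsigned pitfall is to make the result observably used. A minimal sketch (more_threads_kept is a hypothetical variant, and d_out would need to be allocated large enough for one float per thread):

[codebox]
__global__ void more_threads_kept(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 800; ++j) {
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
	}
	// Unconditional store: the result is observably used, so the
	// compiler cannot remove the loop as dead code (at the cost of
	// one real global store per thread).
	d_out[blockIdx.x * blockDim.x + threadIdx.x] = num_inliers;
}
[/codebox]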

I fixed the code

Any insight as to why using more threads makes things slower?

Quick guess: more threads => bigger blocks => fewer active blocks per SM => less room for context switching between blocks to hide global memory latencies.

Hi

Thanks for your answer, but I am not sure that is correct. You are suggesting worse occupancy due to the bigger blocks, whereas the occupancy is actually better when using larger blocks.
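For concreteness, assuming a GT200-class device with at most 1024 resident threads and 8 resident blocks per SM (an assumption, since the device is not stated), the profiler’s occupancy figures work out as:

$$\text{less\_threads (32-thread blocks)}: \frac{8 \times 32}{1024} = 0.25, \qquad \text{more\_threads (128-thread blocks)}: \frac{8 \times 128}{1024} = 1.0$$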

If the compiler were really smart, it could refactor

[codebox]
num_inliers += j*(j+1);
num_inliers += j*(j+2);
num_inliers += j*(j+3);
num_inliers += j*(j+4);
num_inliers += j*(j+5);
num_inliers += j*(j+6);
num_inliers += j*(j+7);
num_inliers += j*(j+8);
num_inliers += j*(j+9);
num_inliers += j*(j+10);
num_inliers += j*(j+11);
num_inliers += j*(j+12);
[/codebox]

into just

[codebox]
num_inliers += j*(12*j+78);
[/codebox]
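For reference, the folded constant follows from summing the twelve terms:

$$\sum_{k=1}^{12} j\,(j+k) = 12j^2 + j\sum_{k=1}^{12} k = 12j^2 + 78j = j\,(12j + 78).$$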

Such an algebraic simplification becomes more effective the more terms it can combine, so the fewer-threads case, in which each thread does a lot of work per iteration, would simplify better than many threads each doing a small piece of work (if, for instance, the loop body limits the number of terms the compiler can combine).

I don’t actually know if the compiler is that smart, of course, but you never know…

I was suggesting similar occupancy but with more blocks.

But after having looked at your code, it seems you would have 8 or 4 blocks per SM anyway (depending on the device), so I don’t think that’s the issue either.

Actually, I am not sure the profiler time is a good guide for such things. Maybe you should do your own timing, like they do in the SDK:

[codebox]

unsigned int timer = 0;
cutCreateTimer(&timer);

int numIterations = 100;
for (int i = 0; i < numIterations; i++)
{
	cutStartTimer(timer);
	kernel<<<>>>(...);
	cudaThreadSynchronize(); // kernel launches are asynchronous, so wait before stopping the timer
	cutStopTimer(timer);
}

float time = cutGetAverageTimerValue(timer); // average time per iteration, in ms

[/codebox]
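Alternatively, the cudaEvent API times the kernel on the GPU itself and avoids the cutil helpers. A sketch, where grid, block, and the kernel arguments are placeholders:

[codebox]
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
for (int i = 0; i < numIterations; i++)
	kernel<<<grid, block>>>(/* args */); // placeholder launch
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // wait until the stop event has been reached

float elapsed_ms = 0.0f;
cudaEventElapsedTime(&elapsed_ms, start, stop);
printf("average kernel time: %f ms\n", elapsed_ms / numIterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
[/codebox]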

If we compare more_threads with

[codebox]

__global__ void less_threads_new(float *d_out) {
	int num_inliers = 0;
	for (int j = 0; j < 3200; ++j) {
		// Do 4 computations
		num_inliers += j*(j+1);
		num_inliers += j*(j+2);
		num_inliers += j*(j+3);
		num_inliers += j*(j+4);
	}
	if (threadIdx.x == -1)
		d_out[threadIdx.x] = num_inliers;
}

[/codebox]

the result is that more_threads runs faster.