NVidia 280.xx drivers: reason of slowdown

Well, I believe I found the main reason why 280.xx drivers (280.19 case) show significant slowdown in OpenCL in comparison with 270.81.

First I noticed that the more ratio of global memory access operations in the kernel the more slowdown is.

Then I checked the PTX code the compiler generates from my OpenCL source code. It now uses ldu.global.f32 instead of ld.global.f32 to get data from global memory.

I checked PTX documentation, it says (for ldu instruction):

Load read-only data into register variable d from the location specified by the source

address operand a in the global state space, [b]where the address is guaranteed to be the

same across all threads in the warp[/b]

Well, it makes sense to use this instruction in case the source code reads data from the buffer declared with __constant qualifier. It is not my case. My parameters are declared this way:

const __global float * restrict input_buffer

I am rollbacking to 270.81 for the time being as slowdown is more than 2x.

P.S. I successfully used OpenCL 1.1 feature CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE to get warp size; it works.

Might be something else in your kernel that’s causing this. Best is to visit your code again. I didn’t see any performance drop in my routines in moving to 280.xx drivers.

Did you inspect PTX code generated for your kernels? My educated guess is No. Your kernels don’t show any performance degradation with new driver release? That doesn’t mean this beta release of the driver is flawless.

I checked WHQL 280.26 - the problem remains.

Here is source code for OpenCL kernel:

float AddOnActivationFunction3(const float x)  

{ 

	return x; 

}  

#define output_width 23

#define output_height 23

#define feature_map_count 12

#define source_width 4

#define source_height 4

#define input_width 92

#define input_height 92

#define output_entry_size 6348

#define input_entry_size 101568

__kernel 

void Average( 

	const __global float * restrict input, 

	__global float * restrict output, 

	const float weight, 

	const uint entry_count 

) 

{ 

	const uint dimension_id = get_global_id(0); 

	uint entry_id = dimension_id / (output_width * output_height * feature_map_count); 

	entry_id = entry_id < entry_count ? entry_id : 0; 

	const uint output_neuron_id_inside_entry = dimension_id % (output_width * output_height * feature_map_count); 

	const uint feature_map_id = output_neuron_id_inside_entry / (output_width * output_height); 

	const uint output_neuron_id = output_neuron_id_inside_entry % (output_width * output_height); 

	const uint dest_y = output_neuron_id / output_width; 

	const uint dest_x = output_neuron_id % output_width; 

	float sum = 0.0F; 

	uint input_base_offset = (entry_id * (uint)input_entry_size) + (feature_map_id * (uint)(input_width * input_height)) + mad24(dest_x, (uint)source_width, mul24(dest_y, (uint)(source_height * input_width))); 

	for(uint source_y = 0; source_y < source_height; source_y++) 

	{ 

		for(uint source_x = 0; source_x < source_width; source_x++) 

		{ 

			sum += input[input_base_offset + source_x]; 

		} 

		input_base_offset += input_width; 

	} 

	output[entry_id * output_entry_size + output_neuron_id_inside_entry] = AddOnActivationFunction3((sum * weight)); 

}

Here is PTX code generated by 270.81:

//

// Generated by NVIDIA LLVM Compiler 4.0

//

.version 2.2

.target sm_21, texmode_independent

.entry Average(

	.param .u32 .ptr .global .align 4 Average_param_0,

	.param .u32 .ptr .global .align 4 Average_param_1,

	.param .f32 Average_param_2,

	.param .u32 Average_param_3

)

{

	.reg .f32 	%f<35>;

	.reg .pred 	%p<2>;

	.reg .s32 	%r<37>;

_Average:

	mov.u32 	%r1, %tid.x;

	mov.u32 	%r2, %envreg3;

	add.s32 	%r3, %r1, %r2;

	mov.u32 	%r4, %ctaid.x;

	mov.u32 	%r5, %ntid.x;

	mad.lo.s32 	%r6, %r4, %r5, %r3;

	mul.hi.u32 	%r7, %r6, 1247624401;

	sub.s32 	%r8, %r6, %r7;

	shr.u32 	%r9, %r8, 1;

	add.s32 	%r10, %r9, %r7;

	shr.u32 	%r11, %r10, 12;

	mul.lo.s32 	%r12, %r11, 6348;

	sub.s32 	%r13, %r6, %r12;

	mul.hi.u32 	%r14, %r13, 4156943773;

	shr.u32 	%r15, %r14, 9;

	mul.lo.s32 	%r16, %r15, 529;

	sub.s32 	%r17, %r13, %r16;

	mul.hi.u32 	%r18, %r17, 2987803337;

	shr.u32 	%r19, %r18, 4;

	mul.lo.s32 	%r20, %r19, 23;

	ld.param.u32 	%r21, [Average_param_3];

	mov.u32 	%r22, 368;

	setp.lt.u32 	%p1, %r11, %r21;

	sub.s32 	%r23, %r17, %r20;

	mul24.lo.u32 	%r24, %r19, %r22;

	mov.u32 	%r25, 4;

	selp.b32 	%r26, %r11, 0, %p1;

	mad24.lo.u32 	%r27, %r23, %r25, %r24;

	mad.lo.s32 	%r28, %r26, 101568, %r27;

	mad.lo.s32 	%r29, %r15, 8464, %r28;

	shl.b32 	%r30, %r29, 2;

	ld.param.u32 	%r31, [Average_param_0];

	add.s32 	%r32, %r31, %r30;

	ld.global.f32 	%f1, [%r32];

	add.rn.ftz.f32 	%f2, %f1, 0f00000000;

	ld.global.f32 	%f3, [%r32+4];

	add.rn.ftz.f32 	%f4, %f2, %f3;

	ld.global.f32 	%f5, [%r32+8];

	add.rn.ftz.f32 	%f6, %f4, %f5;

	ld.global.f32 	%f7, [%r32+12];

	add.rn.ftz.f32 	%f8, %f6, %f7;

	ld.global.f32 	%f9, [%r32+368];

	add.rn.ftz.f32 	%f10, %f8, %f9;

	ld.global.f32 	%f11, [%r32+372];

	add.rn.ftz.f32 	%f12, %f10, %f11;

	ld.global.f32 	%f13, [%r32+376];

	add.rn.ftz.f32 	%f14, %f12, %f13;

	ld.global.f32 	%f15, [%r32+380];

	add.rn.ftz.f32 	%f16, %f14, %f15;

	ld.global.f32 	%f17, [%r32+736];

	add.rn.ftz.f32 	%f18, %f16, %f17;

	ld.global.f32 	%f19, [%r32+740];

	add.rn.ftz.f32 	%f20, %f18, %f19;

	ld.global.f32 	%f21, [%r32+744];

	add.rn.ftz.f32 	%f22, %f20, %f21;

	ld.global.f32 	%f23, [%r32+748];

	add.rn.ftz.f32 	%f24, %f22, %f23;

	ld.global.f32 	%f25, [%r32+1104];

	add.rn.ftz.f32 	%f26, %f24, %f25;

	ld.global.f32 	%f27, [%r32+1108];

	add.rn.ftz.f32 	%f28, %f26, %f27;

	ld.global.f32 	%f29, [%r32+1112];

	ld.global.f32 	%f30, [%r32+1116];

	add.rn.ftz.f32 	%f31, %f28, %f29;

	mad.lo.s32 	%r33, %r26, 6348, %r13;

	add.rn.ftz.f32 	%f32, %f31, %f30;

	ld.param.f32 	%f33, [Average_param_2];

	shl.b32 	%r34, %r33, 2;

	ld.param.u32 	%r35, [Average_param_1];

	mul.rn.ftz.f32 	%f34, %f32, %f33;

	add.s32 	%r36, %r35, %r34;

	st.global.f32 	[%r36], %f34;

	ret;

}

And here is the code generated by 280.26:

//

// Generated by NVIDIA NVVM Compiler

// Compiler built on Wed Aug 03 12:45:53 2011 (1312361153)

// Driver 

//

.version 2.4

.target sm_21, texmode_independent, large_surfaces

.address_size 32

.entry Average(

	.param .u32 .ptr .global .align 4 Average_param_0,

	.param .u32 .ptr .global .align 4 Average_param_1,

	.param .f32 Average_param_2,

	.param .u32 Average_param_3

)

{

	.reg .f32 	%f<35>;

	.reg .pred 	%p<2>;

	.reg .s32 	%r<38>;

	// inline asm

	mov.u32 	%r1, %envreg3;

	// inline asm

	// inline asm

	mov.u32 	%r2, %ntid.x;

	// inline asm

	// inline asm

	mov.u32 	%r3, %ctaid.x;

	// inline asm

	// inline asm

	mov.u32 	%r4, %tid.x;

	// inline asm

	mad.lo.s32 	%r12, %r3, %r2, %r1;

	add.s32 	%r13, %r12, %r4;

	mul.hi.u32 	%r14, %r13, 1247624401;

	sub.s32 	%r15, %r13, %r14;

	shr.u32 	%r16, %r15, 1;

	add.s32 	%r17, %r16, %r14;

	shr.u32 	%r18, %r17, 12;

	ld.param.u32 	%r19, [Average_param_3];

	setp.lt.u32 	%p1, %r18, %r19;

	selp.b32 	%r20, %r18, 0, %p1;

	mul.lo.s32 	%r21, %r18, 6348;

	sub.s32 	%r22, %r13, %r21;

	mul.hi.u32 	%r23, %r22, -138023523;

	shr.u32 	%r24, %r23, 9;

	mul.lo.s32 	%r25, %r24, 529;

	sub.s32 	%r26, %r22, %r25;

	mul.hi.u32 	%r27, %r26, -1307163959;

	shr.u32 	%r6, %r27, 4;

	mul.lo.s32 	%r28, %r6, 23;

	sub.s32 	%r9, %r26, %r28;

	mov.u32 	%r7, 368;

	// inline asm

	mul24.lo.u32 	%r5, %r6, %r7;

	// inline asm

	mov.u32 	%r10, 4;

	// inline asm

	mad.lo.u32 	%r8, %r9, %r10, %r5;

	// inline asm

	mad.lo.s32 	%r29, %r20, 101568, %r8;

	mad.lo.s32 	%r30, %r24, 8464, %r29;

	shl.b32 	%r31, %r30, 2;

	ld.param.u32 	%r32, [Average_param_0];

	add.s32 	%r33, %r32, %r31;

	ldu.global.f32 	%f1, [%r33];

	ldu.global.f32 	%f2, [%r33+4];

	add.ftz.f32 	%f3, %f1, 0f00000000;

	add.ftz.f32 	%f4, %f3, %f2;

	ldu.global.f32 	%f5, [%r33+8];

	ldu.global.f32 	%f6, [%r33+12];

	add.ftz.f32 	%f7, %f4, %f5;

	add.ftz.f32 	%f8, %f7, %f6;

	ldu.global.f32 	%f9, [%r33+368];

	ldu.global.f32 	%f10, [%r33+372];

	add.ftz.f32 	%f11, %f8, %f9;

	add.ftz.f32 	%f12, %f11, %f10;

	ldu.global.f32 	%f13, [%r33+376];

	ldu.global.f32 	%f14, [%r33+380];

	add.ftz.f32 	%f15, %f12, %f13;

	add.ftz.f32 	%f16, %f15, %f14;

	ldu.global.f32 	%f17, [%r33+736];

	ldu.global.f32 	%f18, [%r33+740];

	add.ftz.f32 	%f19, %f16, %f17;

	add.ftz.f32 	%f20, %f19, %f18;

	ldu.global.f32 	%f21, [%r33+744];

	ldu.global.f32 	%f22, [%r33+748];

	add.ftz.f32 	%f23, %f20, %f21;

	add.ftz.f32 	%f24, %f23, %f22;

	ldu.global.f32 	%f25, [%r33+1104];

	ldu.global.f32 	%f26, [%r33+1108];

	add.ftz.f32 	%f27, %f24, %f25;

	add.ftz.f32 	%f28, %f27, %f26;

	ldu.global.f32 	%f29, [%r33+1112];

	ldu.global.f32 	%f30, [%r33+1116];

	add.ftz.f32 	%f31, %f28, %f29;

	add.ftz.f32 	%f32, %f31, %f30;

	ld.param.f32 	%f33, [Average_param_2];

	mul.ftz.f32 	%f34, %f32, %f33;

	mad.lo.s32 	%r34, %r20, 6348, %r22;

	shl.b32 	%r35, %r34, 2;

	ld.param.u32 	%r36, [Average_param_1];

	add.s32 	%r37, %r36, %r35;

	st.global.f32 	[%r37], %f34;

	ret;

}

%USERPROFILE%\AppData\Local\Temp

One can observe ldu.global.f32 are used in new driver, and it seems it is a wrong code.

I am running Win7 64bit SP1, GTX 560Ti.

I submitted a bug. Bug ID: 863441

Have you tried to manually replace “ldu.global.f32” in the PTX code generated by the 280.26 driver with “ld.global.f32”, and loading that PTX code again with clCreateProgramWithBinary()? Just to rule out that something that happens when compiling the PTX code itself is causing the / another slowdown …

NVidia employee already commented the bug: They managed to reproduce it.

The bug is closed. I was told that the fix will likely to appear in the next driver release (285.xx?)

Could you please test your sample with new 285.27 drivers released yesterday?

I have just tested with 285.27 at Win7 x64, 560Ti. The bug is fixed in this driver’s version. The overall program performance is reduced by 3%-4% but at this time there are no distinct places (kernels) with huge performance degradation. So I am keeping 285.27, -3%-4% is not a problem for me.