I checked WHQL 280.26 - the problem remains.
Here is source code for OpenCL kernel:
float AddOnActivationFunction3(const float x)
{
return x;
}
#define output_width 23
#define output_height 23
#define feature_map_count 12
#define source_width 4
#define source_height 4
#define input_width 92
#define input_height 92
#define output_entry_size 6348
#define input_entry_size 101568
__kernel
void Average(
const __global float * restrict input,
__global float * restrict output,
const float weight,
const uint entry_count
)
{
const uint dimension_id = get_global_id(0);
uint entry_id = dimension_id / (output_width * output_height * feature_map_count);
entry_id = entry_id < entry_count ? entry_id : 0;
const uint output_neuron_id_inside_entry = dimension_id % (output_width * output_height * feature_map_count);
const uint feature_map_id = output_neuron_id_inside_entry / (output_width * output_height);
const uint output_neuron_id = output_neuron_id_inside_entry % (output_width * output_height);
const uint dest_y = output_neuron_id / output_width;
const uint dest_x = output_neuron_id % output_width;
float sum = 0.0F;
uint input_base_offset = (entry_id * (uint)input_entry_size) + (feature_map_id * (uint)(input_width * input_height)) + mad24(dest_x, (uint)source_width, mul24(dest_y, (uint)(source_height * input_width)));
for(uint source_y = 0; source_y < source_height; source_y++)
{
for(uint source_x = 0; source_x < source_width; source_x++)
{
sum += input[input_base_offset + source_x];
}
input_base_offset += input_width;
}
output[entry_id * output_entry_size + output_neuron_id_inside_entry] = AddOnActivationFunction3((sum * weight));
}
Here is PTX code generated by 270.81:
//
// Generated by NVIDIA LLVM Compiler 4.0
//
.version 2.2
.target sm_21, texmode_independent
.entry Average(
.param .u32 .ptr .global .align 4 Average_param_0,
.param .u32 .ptr .global .align 4 Average_param_1,
.param .f32 Average_param_2,
.param .u32 Average_param_3
)
{
.reg .f32 %f<35>;
.reg .pred %p<2>;
.reg .s32 %r<37>;
_Average:
mov.u32 %r1, %tid.x;
mov.u32 %r2, %envreg3;
add.s32 %r3, %r1, %r2;
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mad.lo.s32 %r6, %r4, %r5, %r3;
mul.hi.u32 %r7, %r6, 1247624401;
sub.s32 %r8, %r6, %r7;
shr.u32 %r9, %r8, 1;
add.s32 %r10, %r9, %r7;
shr.u32 %r11, %r10, 12;
mul.lo.s32 %r12, %r11, 6348;
sub.s32 %r13, %r6, %r12;
mul.hi.u32 %r14, %r13, 4156943773;
shr.u32 %r15, %r14, 9;
mul.lo.s32 %r16, %r15, 529;
sub.s32 %r17, %r13, %r16;
mul.hi.u32 %r18, %r17, 2987803337;
shr.u32 %r19, %r18, 4;
mul.lo.s32 %r20, %r19, 23;
ld.param.u32 %r21, [Average_param_3];
mov.u32 %r22, 368;
setp.lt.u32 %p1, %r11, %r21;
sub.s32 %r23, %r17, %r20;
mul24.lo.u32 %r24, %r19, %r22;
mov.u32 %r25, 4;
selp.b32 %r26, %r11, 0, %p1;
mad24.lo.u32 %r27, %r23, %r25, %r24;
mad.lo.s32 %r28, %r26, 101568, %r27;
mad.lo.s32 %r29, %r15, 8464, %r28;
shl.b32 %r30, %r29, 2;
ld.param.u32 %r31, [Average_param_0];
add.s32 %r32, %r31, %r30;
ld.global.f32 %f1, [%r32];
add.rn.ftz.f32 %f2, %f1, 0f00000000;
ld.global.f32 %f3, [%r32+4];
add.rn.ftz.f32 %f4, %f2, %f3;
ld.global.f32 %f5, [%r32+8];
add.rn.ftz.f32 %f6, %f4, %f5;
ld.global.f32 %f7, [%r32+12];
add.rn.ftz.f32 %f8, %f6, %f7;
ld.global.f32 %f9, [%r32+368];
add.rn.ftz.f32 %f10, %f8, %f9;
ld.global.f32 %f11, [%r32+372];
add.rn.ftz.f32 %f12, %f10, %f11;
ld.global.f32 %f13, [%r32+376];
add.rn.ftz.f32 %f14, %f12, %f13;
ld.global.f32 %f15, [%r32+380];
add.rn.ftz.f32 %f16, %f14, %f15;
ld.global.f32 %f17, [%r32+736];
add.rn.ftz.f32 %f18, %f16, %f17;
ld.global.f32 %f19, [%r32+740];
add.rn.ftz.f32 %f20, %f18, %f19;
ld.global.f32 %f21, [%r32+744];
add.rn.ftz.f32 %f22, %f20, %f21;
ld.global.f32 %f23, [%r32+748];
add.rn.ftz.f32 %f24, %f22, %f23;
ld.global.f32 %f25, [%r32+1104];
add.rn.ftz.f32 %f26, %f24, %f25;
ld.global.f32 %f27, [%r32+1108];
add.rn.ftz.f32 %f28, %f26, %f27;
ld.global.f32 %f29, [%r32+1112];
ld.global.f32 %f30, [%r32+1116];
add.rn.ftz.f32 %f31, %f28, %f29;
mad.lo.s32 %r33, %r26, 6348, %r13;
add.rn.ftz.f32 %f32, %f31, %f30;
ld.param.f32 %f33, [Average_param_2];
shl.b32 %r34, %r33, 2;
ld.param.u32 %r35, [Average_param_1];
mul.rn.ftz.f32 %f34, %f32, %f33;
add.s32 %r36, %r35, %r34;
st.global.f32 [%r36], %f34;
ret;
}
And here is the code generated by 280.26:
//
// Generated by NVIDIA NVVM Compiler
// Compiler built on Wed Aug 03 12:45:53 2011 (1312361153)
// Driver
//
.version 2.4
.target sm_21, texmode_independent, large_surfaces
.address_size 32
.entry Average(
.param .u32 .ptr .global .align 4 Average_param_0,
.param .u32 .ptr .global .align 4 Average_param_1,
.param .f32 Average_param_2,
.param .u32 Average_param_3
)
{
.reg .f32 %f<35>;
.reg .pred %p<2>;
.reg .s32 %r<38>;
// inline asm
mov.u32 %r1, %envreg3;
// inline asm
// inline asm
mov.u32 %r2, %ntid.x;
// inline asm
// inline asm
mov.u32 %r3, %ctaid.x;
// inline asm
// inline asm
mov.u32 %r4, %tid.x;
// inline asm
mad.lo.s32 %r12, %r3, %r2, %r1;
add.s32 %r13, %r12, %r4;
mul.hi.u32 %r14, %r13, 1247624401;
sub.s32 %r15, %r13, %r14;
shr.u32 %r16, %r15, 1;
add.s32 %r17, %r16, %r14;
shr.u32 %r18, %r17, 12;
ld.param.u32 %r19, [Average_param_3];
setp.lt.u32 %p1, %r18, %r19;
selp.b32 %r20, %r18, 0, %p1;
mul.lo.s32 %r21, %r18, 6348;
sub.s32 %r22, %r13, %r21;
mul.hi.u32 %r23, %r22, -138023523;
shr.u32 %r24, %r23, 9;
mul.lo.s32 %r25, %r24, 529;
sub.s32 %r26, %r22, %r25;
mul.hi.u32 %r27, %r26, -1307163959;
shr.u32 %r6, %r27, 4;
mul.lo.s32 %r28, %r6, 23;
sub.s32 %r9, %r26, %r28;
mov.u32 %r7, 368;
// inline asm
mul24.lo.u32 %r5, %r6, %r7;
// inline asm
mov.u32 %r10, 4;
// inline asm
mad.lo.u32 %r8, %r9, %r10, %r5;
// inline asm
mad.lo.s32 %r29, %r20, 101568, %r8;
mad.lo.s32 %r30, %r24, 8464, %r29;
shl.b32 %r31, %r30, 2;
ld.param.u32 %r32, [Average_param_0];
add.s32 %r33, %r32, %r31;
ldu.global.f32 %f1, [%r33];
ldu.global.f32 %f2, [%r33+4];
add.ftz.f32 %f3, %f1, 0f00000000;
add.ftz.f32 %f4, %f3, %f2;
ldu.global.f32 %f5, [%r33+8];
ldu.global.f32 %f6, [%r33+12];
add.ftz.f32 %f7, %f4, %f5;
add.ftz.f32 %f8, %f7, %f6;
ldu.global.f32 %f9, [%r33+368];
ldu.global.f32 %f10, [%r33+372];
add.ftz.f32 %f11, %f8, %f9;
add.ftz.f32 %f12, %f11, %f10;
ldu.global.f32 %f13, [%r33+376];
ldu.global.f32 %f14, [%r33+380];
add.ftz.f32 %f15, %f12, %f13;
add.ftz.f32 %f16, %f15, %f14;
ldu.global.f32 %f17, [%r33+736];
ldu.global.f32 %f18, [%r33+740];
add.ftz.f32 %f19, %f16, %f17;
add.ftz.f32 %f20, %f19, %f18;
ldu.global.f32 %f21, [%r33+744];
ldu.global.f32 %f22, [%r33+748];
add.ftz.f32 %f23, %f20, %f21;
add.ftz.f32 %f24, %f23, %f22;
ldu.global.f32 %f25, [%r33+1104];
ldu.global.f32 %f26, [%r33+1108];
add.ftz.f32 %f27, %f24, %f25;
add.ftz.f32 %f28, %f27, %f26;
ldu.global.f32 %f29, [%r33+1112];
ldu.global.f32 %f30, [%r33+1116];
add.ftz.f32 %f31, %f28, %f29;
add.ftz.f32 %f32, %f31, %f30;
ld.param.f32 %f33, [Average_param_2];
mul.ftz.f32 %f34, %f32, %f33;
mad.lo.s32 %r34, %r20, 6348, %r22;
shl.b32 %r35, %r34, 2;
ld.param.u32 %r36, [Average_param_1];
add.s32 %r37, %r36, %r35;
st.global.f32 [%r37], %f34;
ret;
}
%USERPROFILE%\AppData\Local\Temp
One can observe ldu.global.f32 are used in new driver, and it seems it is a wrong code.
I am running Win7 64bit SP1, GTX 560Ti.