Hello,
In this forum posting I posted details about my RAM test (which is a performance test):
In this posting I will post the kernel which is used by it (which is/was also included in the rar file and so forth):
I welcome any idea’s how to make the kernel any faster without changing what it needs to do External Image which is: “traverse the whole index chain for it’s block”.
The kernel 0.02 is:
Copy & paste friendly:
// test cuda random memory access performance
//
// cuda kernel version 0.02 created on 12 july 2011 by Skybuck Flying
//
extern “C”
{ // extern c begin
global void Kernel( int ElementCount, int BlockCount, int LoopCount, int *Memory, int *BlockResult )
{
int BlockIndex;
int ElementIndex;
int LoopIndex;
int LinearIndex;
// uses 9 registers for sm_10
// alternative ways to calculate BlockIndex
/*
BlockIndex =
(threadIdx.x) +
(threadIdx.y * blockDim.x) +
(threadIdx.z * blockDim.x * blockDim.y) +
(blockIdx.x * blockDim.x * blockDim.y * blockDim.z) +
(blockIdx.y * blockDim.x * blockDim.y * blockDim.z * gridDim.x) +
(blockIdx.z * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y);
*/
// uses 8 registers for sm_10
int LinearDimension;
BlockIndex = threadIdx.x;
LinearDimension = blockDim.x;
BlockIndex = BlockIndex + threadIdx.y * LinearDimension;
LinearDimension = LinearDimension * blockDim.y;
BlockIndex = BlockIndex + threadIdx.z * LinearDimension;
LinearDimension = LinearDimension * blockDim.z;
BlockIndex = BlockIndex + blockIdx.x * LinearDimension;
LinearDimension = LinearDimension * gridDim.x;
BlockIndex = BlockIndex + blockIdx.y * LinearDimension;
LinearDimension = LinearDimension * gridDim.y;
BlockIndex = BlockIndex + blockIdx.z * LinearDimension;
LinearDimension = LinearDimension * gridDim.z;
if (BlockIndex < BlockCount)
{
ElementIndex = 0;
for (LoopIndex = 0; LoopIndex < LoopCount; LoopIndex++)
{
LinearIndex = ElementIndex + (BlockIndex * ElementCount);
// get next element index
ElementIndex = Memory[ LinearIndex ];
}
// each block should output the last element index for a check up and to prevent the kernel from being reduced away ! <img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/wink.gif' class='bbc_emoticon' alt=';)' />
BlockResult[ BlockIndex ] = ElementIndex;
}
}
} // extern c end
Layout friendly:
// test cuda random memory access performance
//
// cuda kernel version 0.02 created on 12 july 2011 by Skybuck Flying
//
extern "C"
{ // extern c begin
__global__ void Kernel( int ElementCount, int BlockCount, int LoopCount, int *Memory, int *BlockResult )
{
int BlockIndex;
int ElementIndex;
int LoopIndex;
int LinearIndex;
// uses 9 registers for sm_10
// alternative ways to calculate BlockIndex
/*
BlockIndex =
(threadIdx.x) +
(threadIdx.y * blockDim.x) +
(threadIdx.z * blockDim.x * blockDim.y) +
(blockIdx.x * blockDim.x * blockDim.y * blockDim.z) +
(blockIdx.y * blockDim.x * blockDim.y * blockDim.z * gridDim.x) +
(blockIdx.z * blockDim.x * blockDim.y * blockDim.z * gridDim.x * gridDim.y);
*/
// uses 8 registers for sm_10
int LinearDimension;
BlockIndex = threadIdx.x;
LinearDimension = blockDim.x;
BlockIndex = BlockIndex + threadIdx.y * LinearDimension;
LinearDimension = LinearDimension * blockDim.y;
BlockIndex = BlockIndex + threadIdx.z * LinearDimension;
LinearDimension = LinearDimension * blockDim.z;
BlockIndex = BlockIndex + blockIdx.x * LinearDimension;
LinearDimension = LinearDimension * gridDim.x;
BlockIndex = BlockIndex + blockIdx.y * LinearDimension;
LinearDimension = LinearDimension * gridDim.y;
BlockIndex = BlockIndex + blockIdx.z * LinearDimension;
LinearDimension = LinearDimension * gridDim.z;
if (BlockIndex < BlockCount)
{
ElementIndex = 0;
for (LoopIndex = 0; LoopIndex < LoopCount; LoopIndex++)
{
LinearIndex = ElementIndex + (BlockIndex * ElementCount);
// get next element index
ElementIndex = Memory[ LinearIndex ];
}
// each block should output the last element index for a check up and to prevent the kernel from being reduced away ! <img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/wink.gif' class='bbc_emoticon' alt=';)' />
BlockResult[ BlockIndex ] = ElementIndex;
}
}
} // extern c end
And it’s ptx is:
Copy * Paste friendly:
.version 2.3
.target sm_20
.address_size 32
// compiled with C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../open64/lib//be.exe
// nvopencc 4.0 built on 2011-05-13
//-----------------------------------------------------------
// Compiling C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000010cc_00000000-11_CudaMemoryTest.cpp3.i (C:/Users/Skybuck/AppData/Local/Temp/ccBI#.a04384)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000010cc_00000000-10_CudaMemoryTest.cudafe2.gpu"
.file 2 "c:\tools\microsoft visual studio 10.0\vc\include\codeanalysis\sourceannotations.h"
.file 3 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\crt/device_runtime.h"
.file 4 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\host_defines.h"
.file 5 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\builtin_types.h"
.file 6 "c:\tools\cuda\toolkit 4.0\v4.0\include\device_types.h"
.file 7 "c:\tools\cuda\toolkit 4.0\v4.0\include\driver_types.h"
.file 8 "c:\tools\cuda\toolkit 4.0\v4.0\include\surface_types.h"
.file 9 "c:\tools\cuda\toolkit 4.0\v4.0\include\texture_types.h"
.file 10 "c:\tools\cuda\toolkit 4.0\v4.0\include\vector_types.h"
.file 11 "c:\tools\cuda\toolkit 4.0\v4.0\include\builtin_types.h"
.file 12 "c:\tools\cuda\toolkit 4.0\v4.0\include\host_defines.h"
.file 13 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\device_launch_parameters.h"
.file 14 "c:\tools\cuda\toolkit 4.0\v4.0\include\crt\storage_class.h"
.file 15 "C:\Tools\Microsoft Visual Studio 10.0\VC\bin/../../VC/INCLUDE\time.h"
.file 16 "O:/CUDA C/Tests/test cuda memory test/version 0.02/CudaMemoryTest.cu"
.file 17 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\common_functions.h"
.file 18 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions.h"
.file 19 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_constants.h"
.file 20 "c:\tools\cuda\toolkit 4.0\v4.0\include\device_functions.h"
.file 21 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_11_atomic_functions.h"
.file 22 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_12_atomic_functions.h"
.file 23 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_13_double_functions.h"
.file 24 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_atomic_functions.h"
.file 25 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_intrinsics.h"
.file 26 "c:\tools\cuda\toolkit 4.0\v4.0\include\surface_functions.h"
.file 27 "c:\tools\cuda\toolkit 4.0\v4.0\include\texture_fetch_functions.h"
.file 28 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions_dbl_ptx3.h"
.entry Kernel (
.param .s32 __cudaparm_Kernel_ElementCount,
.param .s32 __cudaparm_Kernel_BlockCount,
.param .s32 __cudaparm_Kernel_LoopCount,
.param .u32 __cudaparm_Kernel_Memory,
.param .u32 __cudaparm_Kernel_BlockResult)
{
.reg .u32 %r<43>;
.reg .pred %p<5>;
.loc 16 4 0
$LDWbegin_Kernel:
cvt.s32.u32 %r1, %ntid.x;
mov.u32 %r2, %tid.y;
mul.lo.u32 %r3, %r1, %r2;
mov.u32 %r4, %ntid.y;
mul.lo.u32 %r5, %r1, %r4;
cvt.s32.u32 %r6, %tid.x;
add.u32 %r7, %r6, %r3;
mov.u32 %r8, %tid.z;
mul.lo.u32 %r9, %r8, %r5;
mov.u32 %r10, %ntid.z;
mul.lo.u32 %r11, %r10, %r5;
add.u32 %r12, %r7, %r9;
mov.u32 %r13, %ctaid.x;
mul.lo.u32 %r14, %r13, %r11;
mov.u32 %r15, %nctaid.x;
mul.lo.u32 %r16, %r15, %r11;
add.u32 %r17, %r12, %r14;
mov.u32 %r18, %ctaid.y;
mul.lo.u32 %r19, %r18, %r16;
mov.u32 %r20, %nctaid.y;
mul.lo.u32 %r21, %r20, %r16;
add.u32 %r22, %r17, %r19;
mov.u32 %r23, %ctaid.z;
mul.lo.u32 %r24, %r23, %r21;
add.u32 %r25, %r22, %r24;
ld.param.s32 %r26, [__cudaparm_Kernel_BlockCount];
setp.le.s32 %p1, %r26, %r25;
@%p1 bra $Lt_0_2050;
ld.param.s32 %r27, [__cudaparm_Kernel_LoopCount];
mov.u32 %r28, 0;
setp.le.s32 %p2, %r27, %r28;
@%p2 bra $Lt_0_3586;
mov.s32 %r29, %r27;
ld.param.s32 %r30, [__cudaparm_Kernel_ElementCount];
mul.lo.s32 %r31, %r30, %r25;
ld.param.u32 %r32, [__cudaparm_Kernel_Memory];
mov.s32 %r33, 0;
mov.s32 %r34, 0;
mov.s32 %r35, %r29;
$Lt_0_3074:
// Loop body line 4, nesting depth: 1, estimated iterations: unknown
.loc 16 55 0
add.s32 %r36, %r31, %r34;
mul.lo.u32 %r37, %r36, 4;
add.u32 %r38, %r32, %r37;
ld.global.s32 %r34, [%r38+0];
add.s32 %r33, %r33, 1;
setp.ne.s32 %p3, %r27, %r33;
@%p3 bra $Lt_0_3074;
bra.uni $Lt_0_2562;
$Lt_0_3586:
mov.s32 %r34, 0;
$Lt_0_2562:
.loc 16 59 0
ld.param.u32 %r39, [__cudaparm_Kernel_BlockResult];
mul.lo.u32 %r40, %r25, 4;
add.u32 %r41, %r39, %r40;
st.global.s32 [%r41+0], %r34;
$Lt_0_2050:
.loc 16 68 0
exit;
$LDWend_Kernel:
} // Kernel
Layout-friendly:
.version 2.3
.target sm_20
.address_size 32
// compiled with C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../open64/lib//be.exe
// nvopencc 4.0 built on 2011-05-13
//-----------------------------------------------------------
// Compiling C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000010cc_00000000-11_CudaMemoryTest.cpp3.i (C:/Users/Skybuck/AppData/Local/Temp/ccBI#.a04384)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000010cc_00000000-10_CudaMemoryTest.cudafe2.gpu"
.file 2 "c:\tools\microsoft visual studio 10.0\vc\include\codeanalysis\sourceannotations.h"
.file 3 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\crt/device_runtime.h"
.file 4 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\host_defines.h"
.file 5 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\builtin_types.h"
.file 6 "c:\tools\cuda\toolkit 4.0\v4.0\include\device_types.h"
.file 7 "c:\tools\cuda\toolkit 4.0\v4.0\include\driver_types.h"
.file 8 "c:\tools\cuda\toolkit 4.0\v4.0\include\surface_types.h"
.file 9 "c:\tools\cuda\toolkit 4.0\v4.0\include\texture_types.h"
.file 10 "c:\tools\cuda\toolkit 4.0\v4.0\include\vector_types.h"
.file 11 "c:\tools\cuda\toolkit 4.0\v4.0\include\builtin_types.h"
.file 12 "c:\tools\cuda\toolkit 4.0\v4.0\include\host_defines.h"
.file 13 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\device_launch_parameters.h"
.file 14 "c:\tools\cuda\toolkit 4.0\v4.0\include\crt\storage_class.h"
.file 15 "C:\Tools\Microsoft Visual Studio 10.0\VC\bin/../../VC/INCLUDE\time.h"
.file 16 "O:/CUDA C/Tests/test cuda memory test/version 0.02/CudaMemoryTest.cu"
.file 17 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\common_functions.h"
.file 18 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions.h"
.file 19 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_constants.h"
.file 20 "c:\tools\cuda\toolkit 4.0\v4.0\include\device_functions.h"
.file 21 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_11_atomic_functions.h"
.file 22 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_12_atomic_functions.h"
.file 23 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_13_double_functions.h"
.file 24 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_atomic_functions.h"
.file 25 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_intrinsics.h"
.file 26 "c:\tools\cuda\toolkit 4.0\v4.0\include\surface_functions.h"
.file 27 "c:\tools\cuda\toolkit 4.0\v4.0\include\texture_fetch_functions.h"
.file 28 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions_dbl_ptx3.h"
.entry Kernel (
.param .s32 __cudaparm_Kernel_ElementCount,
.param .s32 __cudaparm_Kernel_BlockCount,
.param .s32 __cudaparm_Kernel_LoopCount,
.param .u32 __cudaparm_Kernel_Memory,
.param .u32 __cudaparm_Kernel_BlockResult)
{
.reg .u32 %r<43>;
.reg .pred %p<5>;
.loc 16 4 0
$LDWbegin_Kernel:
cvt.s32.u32 %r1, %ntid.x;
mov.u32 %r2, %tid.y;
mul.lo.u32 %r3, %r1, %r2;
mov.u32 %r4, %ntid.y;
mul.lo.u32 %r5, %r1, %r4;
cvt.s32.u32 %r6, %tid.x;
add.u32 %r7, %r6, %r3;
mov.u32 %r8, %tid.z;
mul.lo.u32 %r9, %r8, %r5;
mov.u32 %r10, %ntid.z;
mul.lo.u32 %r11, %r10, %r5;
add.u32 %r12, %r7, %r9;
mov.u32 %r13, %ctaid.x;
mul.lo.u32 %r14, %r13, %r11;
mov.u32 %r15, %nctaid.x;
mul.lo.u32 %r16, %r15, %r11;
add.u32 %r17, %r12, %r14;
mov.u32 %r18, %ctaid.y;
mul.lo.u32 %r19, %r18, %r16;
mov.u32 %r20, %nctaid.y;
mul.lo.u32 %r21, %r20, %r16;
add.u32 %r22, %r17, %r19;
mov.u32 %r23, %ctaid.z;
mul.lo.u32 %r24, %r23, %r21;
add.u32 %r25, %r22, %r24;
ld.param.s32 %r26, [__cudaparm_Kernel_BlockCount];
setp.le.s32 %p1, %r26, %r25;
@%p1 bra $Lt_0_2050;
ld.param.s32 %r27, [__cudaparm_Kernel_LoopCount];
mov.u32 %r28, 0;
setp.le.s32 %p2, %r27, %r28;
@%p2 bra $Lt_0_3586;
mov.s32 %r29, %r27;
ld.param.s32 %r30, [__cudaparm_Kernel_ElementCount];
mul.lo.s32 %r31, %r30, %r25;
ld.param.u32 %r32, [__cudaparm_Kernel_Memory];
mov.s32 %r33, 0;
mov.s32 %r34, 0;
mov.s32 %r35, %r29;
$Lt_0_3074:
//<loop> Loop body line 4, nesting depth: 1, estimated iterations: unknown
.loc 16 55 0
add.s32 %r36, %r31, %r34;
mul.lo.u32 %r37, %r36, 4;
add.u32 %r38, %r32, %r37;
ld.global.s32 %r34, [%r38+0];
add.s32 %r33, %r33, 1;
setp.ne.s32 %p3, %r27, %r33;
@%p3 bra $Lt_0_3074;
bra.uni $Lt_0_2562;
$Lt_0_3586:
mov.s32 %r34, 0;
$Lt_0_2562:
.loc 16 59 0
ld.param.u32 %r39, [__cudaparm_Kernel_BlockResult];
mul.lo.u32 %r40, %r25, 4;
add.u32 %r41, %r39, %r40;
st.global.s32 [%r41+0], %r34;
$Lt_0_2050:
.loc 16 68 0
exit;
$LDWend_Kernel:
} // Kernel
Bye,
Skybuck.