cuLaunchKernel parameters ?

Hello,

My Delphi cuda 4.0 program tries to run the following ptx file via cuLaunchKernel:

(Everything is working… ptx module is being loaded, kernel function is found and set etc…)

// array of cuda pointer (cudeviceptr) (32 bit)
mKernelParameterValueArray[0] := mCudaMemoryA;
mKernelParameterValueArray[1] := mCudaMemoryB;

// array of host pointer (32 bit)
mKernelParameterPointerArray[0] := @mKernelParameterValueArray[0];
mKernelParameterPointerArray[1] := @mKernelParameterValueArray[1];

// launch
if cuLaunchKernel
(
1,1,1,
16,1,1,
0,
nil,
@mKernelParameterPointerArray[0],
nil
) then
begin
writeln(‘cuLaunchKernel successfull.’);
end else
begin
writeln(‘cuLaunchKernel failed.’);
end;

It returns “successfull”, nut the output is “Hello” but it should be “Hello World”.

After the kernel launch the copy functions seem to fail as well.

I tried debugging with Parallel Nsight but that didn’t work out… break point was not activated… perhaps I will have to re-write the kernel to not include any host code.

I also tried debugging with Visual Profiler… it does show two memory copies before the launch, but then it says there was no kernel launched ?!?

So I think I am doing something wrong with the kernel parameters ?!?

I could try the other way via the extra parameter…

Any idea’s/help is welcome.

(Perhaps later I will give the “deprecated launching methods a try” ;))

I do notice a 64 bit pointer size down below in the PTX, maybe that is giving problems ?
(Maybe the PTX is generated for 64 bit host pointers ??? I think the C/C++ project was 32 bit though…)

Code:

// from internet:

#include <stdio.h>

const int N = 16;
const int blocksize = 16;

global
void hello(char *a, int *b)
{
a[threadIdx.x] += b[threadIdx.x];
}

int main()
{
char a[N] = “Hello \0\0\0\0\0\0”;
int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

char *ad;
int *bd;
const int csize = N*sizeof(char);
const int isize = N*sizeof(int);


printf("%s", a);

cudaMalloc( (void**)&ad, csize ); 
cudaMalloc( (void**)&bd, isize ); 
cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice ); 
cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice ); 

dim3 dimBlock( blocksize, 1 );
dim3 dimGrid( 1, 1 );
hello<<<dimGrid, dimBlock>>>(ad, bd);
cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost ); 
cudaFree( ad );
    // notice how one free is missing <img src='http://forums.nvidia.com/public/style_emoticons/<#EMO_DIR#>/wink.gif' class='bbc_emoticon' alt=';)' />

printf("%s\n", a);
return EXIT_SUCCESS;

}

PTX:

.version 1.4
.target sm_10, map_f64_to_f32
// compiled with C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../open64/lib//be.exe
// nvopencc 4.0 built on 2011-05-13

//-----------------------------------------------------------
// Compiling C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000008cc_00000000-11_kernel.cpp3.i (C:/Users/Skybuck/AppData/Local/Temp/ccBI#.a02776)
//-----------------------------------------------------------

//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
//  -O3	(Optimization level)
//  -g0	(Debug level)
//  -m2	(Report advisories)
//-----------------------------------------------------------

.file	1	"C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000008cc_00000000-10_kernel.cudafe2.gpu"
.file	2	"c:\tools\microsoft visual studio 10.0\vc\include\codeanalysis\sourceannotations.h"
.file	3	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\crt/device_runtime.h"
.file	4	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\host_defines.h"
.file	5	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\builtin_types.h"
.file	6	"c:\tools\cuda\toolkit 4.0\v4.0\include\device_types.h"
.file	7	"c:\tools\cuda\toolkit 4.0\v4.0\include\driver_types.h"
.file	8	"c:\tools\cuda\toolkit 4.0\v4.0\include\surface_types.h"
.file	9	"c:\tools\cuda\toolkit 4.0\v4.0\include\texture_types.h"
.file	10	"c:\tools\cuda\toolkit 4.0\v4.0\include\vector_types.h"
.file	11	"c:\tools\cuda\toolkit 4.0\v4.0\include\builtin_types.h"
.file	12	"c:\tools\cuda\toolkit 4.0\v4.0\include\host_defines.h"
.file	13	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\device_launch_parameters.h"
.file	14	"c:\tools\cuda\toolkit 4.0\v4.0\include\crt\storage_class.h"
.file	15	"c:\Tools\Microsoft Visual Studio 10.0\VC\BIN/../../VC/INCLUDE\time.h"
.file	16	"kernel.cu"
.file	17	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\common_functions.h"
.file	18	"c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions.h"
.file	19	"c:\tools\cuda\toolkit 4.0\v4.0\include\math_constants.h"
.file	20	"c:\tools\cuda\toolkit 4.0\v4.0\include\device_functions.h"
.file	21	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_11_atomic_functions.h"
.file	22	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_12_atomic_functions.h"
.file	23	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_13_double_functions.h"
.file	24	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_atomic_functions.h"
.file	25	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_intrinsics.h"
.file	26	"c:\tools\cuda\toolkit 4.0\v4.0\include\surface_functions.h"
.file	27	"c:\tools\cuda\toolkit 4.0\v4.0\include\texture_fetch_functions.h"
.file	28	"c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions_dbl_ptx1.h"


.entry _Z5helloPcPi (
	.param .u64 __cudaparm__Z5helloPcPi_a,
	.param .u64 __cudaparm__Z5helloPcPi_b)
{
.reg .u16 %rh<5>;
.reg .u64 %rd<8>;
.loc	16	8	0

$LDWbegin__Z5helloPcPi:
.loc 16 10 0
cvt.u64.u16 %rd1, %tid.x;
ld.param.u64 %rd2, [__cudaparm__Z5helloPcPi_a];
add.u64 %rd3, %rd2, %rd1;
ld.global.s8 %rh1, [%rd3+0];
ld.param.u64 %rd4, [__cudaparm__Z5helloPcPi_b];
mul.lo.u64 %rd5, %rd1, 4;
add.u64 %rd6, %rd4, %rd5;
ld.global.s8 %rh2, [%rd6+0];
add.s16 %rh3, %rh1, %rh2;
st.global.s8 [%rd3+0], %rh3;
.loc 16 11 0
exit;
$LDWend__Z5helloPcPi:
} // _Z5helloPcPi

I have a good idea, hopefully it helps, the idea is to check the GPU Computing SDK 4.0 to see if it contains an example of the cuLaunchKernel API call.

So far textpad found the following:

Searching for: cuLaunchKernel
cudaDecodeD3D9\cudaProcessFrame.cpp(113): status = cuLaunchKernel( fpFunc, grid.x, grid.y, grid.z,
cudaDecodeGL\cudaProcessFrame.cpp(113): status = cuLaunchKernel( fpFunc, grid.x, grid.y, grid.z,
matrixMulDrv\matrixMulDrv.cpp(169): cutilDrvSafeCallNoSync(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z,
matrixMulDrv\matrixMulDrv.cpp(179): cutilDrvSafeCallNoSync(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z,
matrixMulDrv\matrixMulDrv.cpp(217): cutilDrvSafeCallNoSync(cuLaunchKernel( matrixMul, grid.x, grid.y, grid.z,
matrixMulDynlinkJIT\cuda_drvapi_dynlink.c(95): tcuLaunchKernel *cuLaunchKernel;
matrixMulDynlinkJIT\cuda_drvapi_dynlink.c(535): GET_PROC(cuLaunchKernel);
matrixMulDynlinkJIT\cuda_drvapi_dynlink_cuda.h(960): * ::cuLaunchKernel
matrixMulDynlinkJIT\cuda_drvapi_dynlink_cuda.h(966): * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
matrixMulDynlinkJIT\cuda_drvapi_dynlink_cuda.h(977): * ::cuLaunchKernel will be a pointer to a size_t which contains the
matrixMulDynlinkJIT\cuda_drvapi_dynlink_cuda.h(1407): typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
matrixMulDynlinkJIT\cuda_drvapi_dynlink_cuda.h(1600): extern tcuLaunchKernel *cuLaunchKernel;
matrixMulDynlinkJIT\matrixMulDynlinkJIT.cpp(264): cutilDrvSafeCallNoSync(cuLaunchKernel( matrixMul, (WC/block_size), (HC/block_size), 1,
simpleTextureDrv\simpleTextureDrv.cpp(170): cutilDrvSafeCallNoSync(cuLaunchKernel( transform, (width/block_size), (height/block_size), 1,
simpleTextureDrv\simpleTextureDrv.cpp(179): cutilDrvSafeCallNoSync(cuLaunchKernel( transform, (width/block_size), (height/block_size), 1,
simpleTextureDrv\simpleTextureDrv.cpp(202): cutilDrvSafeCallNoSync(cuLaunchKernel( transform, (width/block_size), (height/block_size), 1,
simpleTextureDrv\simpleTextureDrv.cpp(211): cutilDrvSafeCallNoSync(cuLaunchKernel( transform, (width/block_size), (height/block_size), 1,
threadMigration\threadMigration.cpp(252): status = cuLaunchKernel( pParams->hcuFunction, 1, 1, 1,
threadMigration\threadMigration.cpp(276): status = cuLaunchKernel( pParams->hcuFunction, 1, 1, 1,
vectorAddDrv\vectorAddDrv.cpp(209): error = cuLaunchKernel( vecAdd_kernel, blocksPerGrid, 1, 1,
vectorAddDrv\vectorAddDrv.cpp(229): error = cuLaunchKernel( vecAdd_kernel, blocksPerGrid, 1, 1,
vectorAddDrv\vectorAddDrv.cpp(258): error = cuLaunchKernel( vecAdd_kernel, blocksPerGrid, 1, 1,

So I am going to give these examples a looksy to see if I can find anything usefull inside of it ! ;) :)

Perhaps even try to make a small example in C/C++ and see if it can run the helloworld example ;)

I guess the “hello world” example is also still to difficult/complex because of the pointers/memory it uses, it’s not a good first debugging example.

So perhaps I should try something really simple first like a kernel with no parameters and perhaps some easy calculations inside… but I am not sure if that would be compiled away… if so then simply passing 1 parameter via call by value and no pointers would be a better example…

For example IBM’s first routine on mainframe is probably the “add-er” ;)

kernel( int a, int b )
{
int c;
c = a + b;
}

This should be easy to debug and see if the parameters are passed correctly… once that works the rest should be more easy to get working as well ;)

Hmmm this is interesting

nvcc:

–machine (-m)
Specify 32 vs 64 bit architecture.
Allowed values for this option: 32,64.
Default value: 64.

^ Could this mean all integers/cuda device pointers are expected to be in 64 bits on host side as well ? hmmm…

Perhaps I should try compiling with -mmachine 32 later on to see if that helps ;)

I also tried:
var
mKernelParameterConfig : array[0…4] of CUKernelParam; // CUKernelParam = 32 bit pointer

mKernelParameterBuffer : array[0..255] of byte;
mKernelParameterBufferSize : longword;

begin

											// Launch with kernel config
											Plongword(@mKernelParameterBuffer[0])^ := mCudaMemoryA;
											Plongword(@mKernelParameterBuffer[4])^ := mCudaMemoryB;

// also tried

											// Launch with kernel config

// Plongword(@mKernelParameterBuffer[0])^ := longword(@mCudaMemoryA);
// Plongword(@mKernelParameterBuffer[4])^ := longword(@mCudaMemoryB);

											mKernelParameterBufferSize := 8;

											mKernelParameterConfig[0] := pointer(CU_LAUNCH_PARAM_BUFFER_POINTER);
											mKernelParameterConfig[1] := @mKernelParameterBuffer[0];
											mKernelParameterConfig[2] := pointer(CU_LAUNCH_PARAM_BUFFER_SIZE);
											mKernelParameterConfig[3] := @mKernelParameterBufferSize;
											mKernelParameterConfig[4] := pointer(CU_LAUNCH_PARAM_END);

											// launch
											if mCudaDriverModule.Kernel.Launch
											(
												1,1,1,
												16,1,1,
												0,
												nil,
												nil,
												@mKernelParameterConfig[0]
											) then
											begin
												writeln('mCudaDriverModule.Kernel.Launch successfull.');
											end else
											begin
												writeln('mCudaDriverModule.Kernel.Launch failed.');
											end;

Successfull is shown, but copy back still just show “Hello” and no “Hello World”.

Perhaps it’s a 64 bit kernel problem, or something else is wrong.

I should give a more simple kernel a try, but it’s not so simple creating a simple kernel because the cuda compiler illiminates simple code.

Maybe visual studio or delphi can be used to debug the cuLaunchKernel call to see what’s going on… or perhaps something deeper like nvcuda.dll.

Perhaps I should first try simple launching an empty kernel to see if at least the call makes it through according to visual profiler…

Another explanation could be:

Maybe something special has to be done to “cuda device pointers” ? Do they have to be passed as if they were inside the kernel ? Or do they need to be translated to host addressing (seems unlikely and weird) ?

Empty kernel seems to execute just fine, at least according to visual profiler… some weird values here and there but nothing to big ;)

So I think the problem can be reduced to “parameter passing problem”.

Though not much going on yet, and the ptx file does say 64 bit pointers…

Now that I have textpad and compiler running nicely it’s apperent what the problem is from the assembly:

.entry _Z10HelloWorldPcPi (
	.param .u64 __cudaparm__Z10HelloWorldPcPi_a,
	.param .u64 __cudaparm__Z10HelloWorldPcPi_b)
{
.reg .u16 %rh<5>;
.reg .u64 %rd<8>;
.loc	16	1	0

$LDWbegin__Z10HelloWorldPcPi:
.loc 16 3 0
cvt.u64.u16 %rd1, %tid.x;
ld.param.u64 %rd2, [__cudaparm__Z10HelloWorldPcPi_a];
add.u64 %rd3, %rd2, %rd1;
ld.global.s8 %rh1, [%rd3+0];
ld.param.u64 %rd4, [__cudaparm__Z10HelloWorldPcPi_b];
mul.lo.u64 %rd5, %rd1, 4;
add.u64 %rd6, %rd4, %rd5;
ld.global.s8 %rh2, [%rd6+0];
add.s16 %rh3, %rh1, %rh2;
st.global.s8 [%rd3+0], %rh3;
.loc 16 4 0
exit;
$LDWend__Z10HelloWorldPcPi:
} // _Z10HelloWorldPcPi

The parameters appear to be 64 bit pointers.

Delphi is only 32 bit and the cuda unit assumes 32 bit as well…

I could now update code to use 64 bit or perhaps specify a nvcc compiler option so it will compile to 32 bit…

I think last idea is probably best… so I give that a try ;)

Ok, the kernel was compiled for a 32 bit machine and now it’s working ! ;) =D

Command line options were:

":\Tools\CUDA\Toolkit 4.0\v4.0\bin\nvcc.exe $File -ptx --machine 32

($file is something textpad specific and is replacement for HelloWorld.cu ;))

.version 1.4
.target sm_10, map_f64_to_f32
// compiled with C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../open64/lib//be.exe
// nvopencc 4.0 built on 2011-05-13

//-----------------------------------------------------------
// Compiling C:/Users/Skybuck/AppData/Local/Temp/tmpxft_00001150_00000000-11_HelloWorld.cpp3.i (C:/Users/Skybuck/AppData/Local/Temp/ccBI#.a06096)
//-----------------------------------------------------------

//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
//  Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
//  -O3	(Optimization level)
//  -g0	(Debug level)
//  -m2	(Report advisories)
//-----------------------------------------------------------

.file	1	"C:/Users/Skybuck/AppData/Local/Temp/tmpxft_00001150_00000000-10_HelloWorld.cudafe2.gpu"
.file	2	"c:\tools\microsoft visual studio 10.0\vc\include\codeanalysis\sourceannotations.h"
.file	3	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\crt/device_runtime.h"
.file	4	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\host_defines.h"
.file	5	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\builtin_types.h"
.file	6	"c:\tools\cuda\toolkit 4.0\v4.0\include\device_types.h"
.file	7	"c:\tools\cuda\toolkit 4.0\v4.0\include\driver_types.h"
.file	8	"c:\tools\cuda\toolkit 4.0\v4.0\include\surface_types.h"
.file	9	"c:\tools\cuda\toolkit 4.0\v4.0\include\texture_types.h"
.file	10	"c:\tools\cuda\toolkit 4.0\v4.0\include\vector_types.h"
.file	11	"c:\tools\cuda\toolkit 4.0\v4.0\include\builtin_types.h"
.file	12	"c:\tools\cuda\toolkit 4.0\v4.0\include\host_defines.h"
.file	13	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\device_launch_parameters.h"
.file	14	"c:\tools\cuda\toolkit 4.0\v4.0\include\crt\storage_class.h"
.file	15	"C:\Tools\Microsoft Visual Studio 10.0\VC\bin/../../VC/INCLUDE\time.h"
.file	16	"O:/CUDA C/test HelloWorld/HelloWorld.cu"
.file	17	"C:\Tools\CUDA\Toolkit 4.0\v4.0\bin/../include\common_functions.h"
.file	18	"c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions.h"
.file	19	"c:\tools\cuda\toolkit 4.0\v4.0\include\math_constants.h"
.file	20	"c:\tools\cuda\toolkit 4.0\v4.0\include\device_functions.h"
.file	21	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_11_atomic_functions.h"
.file	22	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_12_atomic_functions.h"
.file	23	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_13_double_functions.h"
.file	24	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_atomic_functions.h"
.file	25	"c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_intrinsics.h"
.file	26	"c:\tools\cuda\toolkit 4.0\v4.0\include\surface_functions.h"
.file	27	"c:\tools\cuda\toolkit 4.0\v4.0\include\texture_fetch_functions.h"
.file	28	"c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions_dbl_ptx1.h"


.entry _Z10HelloWorldPcPi (
	.param .u32 __cudaparm__Z10HelloWorldPcPi_a,
	.param .u32 __cudaparm__Z10HelloWorldPcPi_b)
{
.reg .u16 %rh<5>;
.reg .u32 %r<8>;
.loc	16	1	0

$LDWbegin__Z10HelloWorldPcPi:
.loc 16 3 0
cvt.u32.u16 %r1, %tid.x;
ld.param.u32 %r2, [__cudaparm__Z10HelloWorldPcPi_a];
add.u32 %r3, %r2, %r1;
ld.global.s8 %rh1, [%r3+0];
ld.param.u32 %r4, [__cudaparm__Z10HelloWorldPcPi_b];
mul24.lo.u32 %r5, %r1, 4;
add.u32 %r6, %r4, %r5;
ld.global.s8 %rh2, [%r6+0];
add.s16 %rh3, %rh1, %rh2;
st.global.s8 [%r3+0], %rh3;
.loc 16 4 0
exit;
$LDWend__Z10HelloWorldPcPi:
} // _Z10HelloWorldPcPi

Once this kernel is executed it will change:

“Hello”

into

“World!”

Bye,
Skybuck =D