Hello,
My Delphi cuda 4.0 program tries to run the following ptx file via cuLaunchKernel:
(Everything is working… ptx module is being loaded, kernel function is found and set etc…)
// Kernel argument values: the two device pointers (CUdeviceptr), described by the
// author as 32 bit.
// NOTE(review): the PTX included in this post declares both kernel parameters as
// .u64 (its header says "Pointer Size:64"). If these values really are 32 bit they
// do not match the parameter layout the kernel expects - confirm that the PTX is
// built for the same pointer size as this host program.
mKernelParameterValueArray[0] := mCudaMemoryA;
mKernelParameterValueArray[1] := mCudaMemoryB;
// Array of host pointers (32 bit), one per kernel argument; each entry holds the
// address of the corresponding value above.
mKernelParameterPointerArray[0] := @mKernelParameterValueArray[0];
mKernelParameterPointerArray[1] := @mKernelParameterValueArray[1];
// Launch the kernel.
// NOTE(review): this cuLaunchKernel is used as a Boolean and takes no function
// handle, so it is presumably a project wrapper around the driver API call - the
// argument meanings noted below assume it keeps the driver API's order; verify
// against the wrapper's declaration.
// NOTE(review): a successful launch result presumably only means the launch was
// accepted; errors raised while the kernel runs would show up on a later call
// (which would be consistent with the failing copies described in the post) - TODO confirm.
if cuLaunchKernel
(
// presumably grid dimensions X, Y, Z (a single block)
1,1,1,
// presumably block dimensions X, Y, Z (16 threads)
16,1,1,
// presumably the dynamic shared memory size in bytes
0,
// presumably the stream handle (nil = default stream)
nil,
// address of the first entry of the argument-pointer array
@mKernelParameterPointerArray[0],
// presumably the "extra" launch options (unused here)
nil
) then
begin
writeln(‘cuLaunchKernel successfull.’);
end else
begin
writeln(‘cuLaunchKernel failed.’);
end;
It returns “successfull”, but the output is just “Hello” when it should be “Hello World”.
After the kernel launch the copy functions seem to fail as well.
I tried debugging with Parallel Nsight but that didn’t work out… break point was not activated… perhaps I will have to re-write the kernel to not include any host code.
I also tried debugging with Visual Profiler… it does show two memory copies before the launch, but then it says there was no kernel launched ?!?
So I think I am doing something wrong with the kernel parameters ?!?
I could try the other way via the extra parameter…
Any ideas/help are welcome.
(Perhaps later I will give the “deprecated” launching methods a try.)
I do notice a 64 bit pointer size down below in the PTX, maybe that is giving problems ?
(Maybe the PTX is generated for 64 bit host pointers ??? I think the C/C++ project was 32 bit though…)
Code:
// from internet:
#include <stdio.h>
#include <stdlib.h>
// Number of elements in each of the two arrays handled by the sample.
static const int N = 16;
// Threads per block; the sample launches one thread per array element.
static const int blocksize = N;
/**
 * Adds b[i] to a[i] in place, where i is the calling thread's threadIdx.x.
 *
 * Launch layout: only threadIdx.x is used as the index, so the kernel is meant
 * to be launched as a single one-dimensional block. There is no bounds check
 * (the signature carries no length), so both arrays must hold at least
 * blockDim.x elements.
 *
 * The sum is stored back into a char, so only its low byte is kept.
 *
 * @param a  device array of characters, updated in place
 * @param b  device array of integer offsets, read only
 */
__global__
void hello(char *a, int *b)
{
    a[threadIdx.x] += b[threadIdx.x];
}
/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param status  result of the CUDA runtime call
 * @param what    short description of the call, used in the message
 * @return true when status is cudaSuccess, false otherwise
 */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Host driver for the sample: prints "Hello ", adds the offsets in b to the
 * characters in a on the GPU (one thread per element), then prints the result.
 *
 * @return EXIT_SUCCESS when every CUDA call succeeded, EXIT_FAILURE otherwise
 */
int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    // Initialised to NULL so the cleanup below is safe even if allocation fails.
    char *ad = NULL;
    int *bd = NULL;
    const size_t csize = N * sizeof(char);
    const size_t isize = N * sizeof(int);

    printf("%s", a);

    // Each step runs only if every previous step succeeded.
    bool ok = cudaOk(cudaMalloc((void**)&ad, csize), "cudaMalloc(ad)")
           && cudaOk(cudaMalloc((void**)&bd, isize), "cudaMalloc(bd)")
           && cudaOk(cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice), "cudaMemcpy(ad <- a)")
           && cudaOk(cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice), "cudaMemcpy(bd <- b)");

    if (ok) {
        dim3 dimBlock(blocksize, 1);
        dim3 dimGrid(1, 1);
        hello<<<dimGrid, dimBlock>>>(ad, bd);

        // A launch returns no status itself: configuration errors are read with
        // cudaGetLastError(), and errors raised while the kernel runs are
        // reported by the blocking copy that follows.
        ok = cudaOk(cudaGetLastError(), "hello launch")
          && cudaOk(cudaMemcpy(a, ad, csize, cudaMemcpyDeviceToHost), "cudaMemcpy(a <- ad)");
    }

    // Release both device buffers (the original sample leaked bd).
    // cudaFree on a null pointer performs no operation.
    cudaFree(ad);
    cudaFree(bd);

    if (!ok)
        return EXIT_FAILURE;

    printf("%s\n", a);
    return EXIT_SUCCESS;
}
PTX:
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../open64/lib//be.exe
// nvopencc 4.0 built on 2011-05-13
//-----------------------------------------------------------
// Compiling C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000008cc_00000000-11_kernel.cpp3.i (C:/Users/Skybuck/AppData/Local/Temp/ccBI#.a02776)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "C:/Users/Skybuck/AppData/Local/Temp/tmpxft_000008cc_00000000-10_kernel.cudafe2.gpu"
.file 2 "c:\tools\microsoft visual studio 10.0\vc\include\codeanalysis\sourceannotations.h"
.file 3 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\crt/device_runtime.h"
.file 4 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\host_defines.h"
.file 5 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\builtin_types.h"
.file 6 "c:\tools\cuda\toolkit 4.0\v4.0\include\device_types.h"
.file 7 "c:\tools\cuda\toolkit 4.0\v4.0\include\driver_types.h"
.file 8 "c:\tools\cuda\toolkit 4.0\v4.0\include\surface_types.h"
.file 9 "c:\tools\cuda\toolkit 4.0\v4.0\include\texture_types.h"
.file 10 "c:\tools\cuda\toolkit 4.0\v4.0\include\vector_types.h"
.file 11 "c:\tools\cuda\toolkit 4.0\v4.0\include\builtin_types.h"
.file 12 "c:\tools\cuda\toolkit 4.0\v4.0\include\host_defines.h"
.file 13 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\device_launch_parameters.h"
.file 14 "c:\tools\cuda\toolkit 4.0\v4.0\include\crt\storage_class.h"
.file 15 "c:\Tools\Microsoft Visual Studio 10.0\VC\BIN/../../VC/INCLUDE\time.h"
.file 16 "kernel.cu"
.file 17 "C:\Tools\CUDA\Toolkit 4.0\v4.0\bin\/../include\common_functions.h"
.file 18 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions.h"
.file 19 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_constants.h"
.file 20 "c:\tools\cuda\toolkit 4.0\v4.0\include\device_functions.h"
.file 21 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_11_atomic_functions.h"
.file 22 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_12_atomic_functions.h"
.file 23 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_13_double_functions.h"
.file 24 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_atomic_functions.h"
.file 25 "c:\tools\cuda\toolkit 4.0\v4.0\include\sm_20_intrinsics.h"
.file 26 "c:\tools\cuda\toolkit 4.0\v4.0\include\surface_functions.h"
.file 27 "c:\tools\cuda\toolkit 4.0\v4.0\include\texture_fetch_functions.h"
.file 28 "c:\tools\cuda\toolkit 4.0\v4.0\include\math_functions_dbl_ptx1.h"
// Kernel entry point _Z5helloPcPi (presumably the mangled name of
// hello(char*, int*) from kernel.cu, which is file 16 in the .file list).
// Both parameters are declared as 64-bit values (.u64), in line with the
// "Pointer Size:64" noted in the options header of this PTX.
// NOTE(review): a host that supplies 32-bit argument values would not match
// this parameter layout - confirm the pointer size the host program uses.
.entry _Z5helloPcPi (
.param .u64 __cudaparm__Z5helloPcPi_a,
.param .u64 __cudaparm__Z5helloPcPi_b)
{
// 16-bit registers hold the byte values and their sum; 64-bit registers hold
// the thread index and the addresses.
.reg .u16 %rh<5>;
.reg .u64 %rd<8>;
.loc 16 8 0
$LDWbegin__Z5helloPcPi:
.loc 16 10 0
// %rd1 = thread index %tid.x widened to 64 bits
cvt.u64.u16 %rd1, %tid.x;
// %rd2 = parameter a; %rd3 = a + tid (one byte per element)
ld.param.u64 %rd2, [__cudaparm__Z5helloPcPi_a];
add.u64 %rd3, %rd2, %rd1;
// %rh1 = signed byte loaded from global memory at a + tid
ld.global.s8 %rh1, [%rd3+0];
// %rd4 = parameter b; %rd6 = b + tid * 4 (four bytes per element)
ld.param.u64 %rd4, [__cudaparm__Z5helloPcPi_b];
mul.lo.u64 %rd5, %rd1, 4;
add.u64 %rd6, %rd4, %rd5;
// Only one signed byte is loaded from b + tid * 4 (the lowest-addressed byte
// of the four); the result is stored back as a single byte below.
ld.global.s8 %rh2, [%rd6+0];
// %rh3 = %rh1 + %rh2, written back as one byte to a + tid
add.s16 %rh3, %rh1, %rh2;
st.global.s8 [%rd3+0], %rh3;
.loc 16 11 0
exit;
$LDWend__Z5helloPcPi:
} // _Z5helloPcPi