Hi!
I have the following very simple PTX code (file test.ptx):
.version 1.4
.target sm_10
// Kernel `test`: stores the 32-bit value 0 into the global-memory word whose
// address is passed in parameter C.
// C and the address register are 64 bits wide because the host program passes
// sizeof(void*) bytes for this parameter, which is 8 on a 64-bit host build.
// NOTE(review): a 32-bit host build would need .u32 here instead - confirm
// which host ABI this module is loaded from.
.entry test ( .param .u64 C )
{
    .reg .u64 %rd<1>;            // %rd0: destination address read from C
    .reg .s32 %r<1>;             // %r0: value to store
    ld.param.u64 %rd0, [C];
    mov.s32 %r0, 0;
    st.global.s32 [%rd0], %r0;   // first element of the array <- 0
    ret;
}
This PTX code stores 0 into the first element of the array whose address is passed as the input parameter.
The main program, cuTest.cpp, is the following:
#include "stdio.h"
#include "stdlib.h"
#include "malloc.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
/* Rounds the lvalue `off` up, in place, to the next multiple of `align`.
   The mask arithmetic is only correct when `align` is a power of two. */
#define ALIGN_UP(off, align) ((off) = (((off) + (align) - 1) & ~((align) - 1)))
int main ( int argc , char** argv ) {
int rt_version;
int N = 8;
int nDevice = atoi ( argv [ 1 ] );
printf ( "Device number = %d\n", nDevice );
char* file_name = argv [ 2 ];
char* func_name = argv [ 3 ];
cudaRuntimeGetVersion ( &rt_version );
printf ( "Runtime version = %d.%d\n", rt_version/1000, rt_version%100 );
int* A = (int*) malloc ( N * sizeof ( int ) );
for ( int i = 0; i < N; i++ )
A [ i ] = -1;
printf ( "Before ->\n" );
for ( int i = 0; i < N; i++ )
printf ( "%d\n", A [ i ] );
CUdeviceptr dA = cuMemAlloc ( &dA, N * sizeof ( int ) );
cuMemcpyHtoD(dA, A, N * sizeof ( int ) );
CUresult result = cuInit(0);
int deviceCount = 0;
cuDeviceGetCount ( &deviceCount );
printf ( "deviceCount = %d\n", deviceCount );
CUdevice cuDevice = 0;
cuDeviceGet ( &cuDevice, nDevice );
CUcontext cuContext;
cuCtxCreate ( &cuContext, 0, cuDevice );
CUmodule cuModule;
result = cuModuleLoad ( &cuModule, file_name );
printf ( "ModuleLoad result = %d\n", result );
CUfunction cuFunc;
result = cuModuleGetFunction ( &cuFunc, cuModule, func_name );
printf ( "GetFunction result = %d\n", result );
int offset = 0;
void* ptr;
ptr = (void*)(size_t)dA;
ALIGN_UP(offset, __alignof(ptr));
result = cuParamSetv ( cuFunc, offset, &ptr, sizeof (ptr) );
printf ( "ParamSetv result = %d\n", result );
offset += sizeof(ptr);
result = cuParamSetSize ( cuFunc, offset );
printf ( "ParamSetSize result = %d\n", result );
int threadsPerBlock = N;
int blocksPerGrid = 1;
result = cuFuncSetBlockShape ( cuFunc, threadsPerBlock, 1, 1);
printf ( "FuncSetBlockShape result = %d\n", result );
result = cuLaunchGrid ( cuFunc, blocksPerGrid, 1 );
printf ( "LaunchGrid result = %d\n", result );
cuMemcpyDtoH( A, dA, N * sizeof ( int ) );
printf ( "After ->\n" );
for ( int i = 0; i < N; i++ )
printf ( "%d\n", A [ i ] );
}
The program initializes the input array with -1
and expects 0 in its first element after the kernel has executed.
We compile it with
nvcc -L/usr/lib64 -lcuda -o cuTest cuTest.cpp
and run it as
./cuTest 1 test.ptx test
The actual result is wrong:
Device number = 1
Runtime version = 3.20
Before ->
-1
-1
-1
-1
-1
-1
-1
-1
deviceCount = 2
ModuleLoad result = 0
GetFunction result = 0
ParamSetv result = 0
ParamSetSize result = 0
FuncSetBlockShape result = 0
LaunchGrid result = 0
After ->
-2063597568
-2063597568
-2063597568
-2063597568
-2063597568
-2063597568
-2063597568
-2063597568
That is, the array contains seemingly random values,
and the first element is not 0.
Moreover, when I comment out the fragment of the main program that prepares the parameters
/*
int offset = 0;
void* ptr;
ptr = (void*)(size_t)dA;
ALIGN_UP(offset, __alignof(ptr));
result = cuParamSetv ( cuFunc, offset, &ptr, sizeof (ptr) );
printf ( "ParamSetv result = %d\n", result );
offset += sizeof(ptr);
result = cuParamSetSize ( cuFunc, offset );
printf ( "ParamSetSize result = %d\n", result );
*/
the result is the same.
Can anybody explain to me what is wrong with my code?
Thanks.