CUDA Driver API + ptx-code problem

Hi !

I have the following very simple ptx-code ( test.ptx file ):

.version 1.4

 .target sm_10

.entry test ( .param .s32 C )

 {

  .reg .s32 %p<1>;

ld.param.s32 %p0, [C];

  st.global.s32 [%p0], 0;

  ret;

 }

In fact, this ptx-code assigns 0 to the first element of array given as an input parameter.

The main program cuTest.cpp is the following

#include "stdio.h"

#include "malloc.h"

#include "cuda.h"

#include "cuda_runtime_api.h"

#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

int main ( int argc , char** argv ) {

int rt_version;

int N = 8;

int nDevice = atoi ( argv [ 1 ] );

printf ( "Device number = %d\n", nDevice );

char* file_name = argv [ 2 ];

char* func_name = argv [ 3 ];

cudaRuntimeGetVersion ( &rt_version );

printf ( "Runtime version = %d.%d\n", rt_version/1000, rt_version%100 );

int* A = (int*) malloc ( N * sizeof ( int ) );

for ( int i = 0; i < N; i++ )

A [ i ] = -1;

printf ( "Before ->\n" );

for ( int i = 0; i < N; i++ )

printf ( "%d\n", A [ i ] );

CUdeviceptr dA = cuMemAlloc ( &dA, N * sizeof ( int ) );

cuMemcpyHtoD(dA, A, N * sizeof ( int ) );

CUresult result = cuInit(0);

int deviceCount = 0;

cuDeviceGetCount ( &deviceCount );

printf ( "deviceCount = %d\n", deviceCount );

CUdevice cuDevice = 0;

cuDeviceGet ( &cuDevice, nDevice );

CUcontext cuContext;

cuCtxCreate ( &cuContext, 0, cuDevice );

CUmodule cuModule;

result = cuModuleLoad ( &cuModule, file_name );

printf ( "ModuleLoad result = %d\n", result );

CUfunction cuFunc;

result = cuModuleGetFunction ( &cuFunc, cuModule, func_name );

printf ( "GetFunction result = %d\n", result );

int offset = 0;

void* ptr;

ptr = (void*)(size_t)dA;

ALIGN_UP(offset, __alignof(ptr));

result = cuParamSetv ( cuFunc, offset, &ptr, sizeof (ptr) );

printf ( "ParamSetv result = %d\n", result );

offset += sizeof(ptr);

result = cuParamSetSize ( cuFunc, offset );

printf ( "ParamSetSize result = %d\n", result );

int threadsPerBlock = N;

int blocksPerGrid = 1;

result = cuFuncSetBlockShape ( cuFunc, threadsPerBlock, 1, 1);

printf ( "FuncSetBlockShape result = %d\n", result );

result = cuLaunchGrid ( cuFunc, blocksPerGrid, 1 );

printf ( "LaunchGrid result = %d\n", result );

cuMemcpyDtoH( A, dA, N * sizeof ( int ) );

printf ( "After ->\n" );

for ( int i = 0; i < N; i++ )

printf ( "%d\n", A [ i ] );

}

Given program initializes the input array by -1

and expects 0 in the first element of it after executing.

We compile it as

nvcc -L/usr/lib64 -lcuda -o cuTest cuTest.cpp

and run it as

./cuTest 1 test.ptx test

The actual result is wrong:

Device number = 1

Runtime version = 3.20

Before ->

-1

-1

-1

-1

-1

-1

-1

-1

deviceCount = 2

ModuleLoad result = 0

GetFunction result = 0

ParamSetv result = 0

ParamSetSize result = 0

FuncSetBlockShape result = 0

LaunchGrid result = 0

After ->

-2063597568

-2063597568

-2063597568

-2063597568

-2063597568

-2063597568

-2063597568

-2063597568
  • we have some random values in the array

without 0 in the first element.

Moreover, when I have commented the fragment in the main program which prepares parameters

/*

int offset = 0;

void* ptr;

ptr = (void*)(size_t)dA;

ALIGN_UP(offset, __alignof(ptr));

result = cuParamSetv ( cuFunc, offset, &ptr, sizeof (ptr) );

printf ( "ParamSetv result = %d\n", result );

offset += sizeof(ptr);

result = cuParamSetSize ( cuFunc, offset );

printf ( "ParamSetSize result = %d\n", result );

*/

the result is the same.

Can anybody explain me what is wrong with my code ?

Thanks.

You need to do cuInit() and cuCtxCreate() before you can do cuMemcpyTo

Yes, you are right.

So I have modified the main program as

#include "stdio.h"

#include "malloc.h"

#include "cuda.h"

#include "cuda_runtime_api.h"

#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

int main ( int argc , char** argv ) {

int rt_version;

int N = 8;

int nDevice = atoi ( argv [ 1 ] );

 printf ( "Device number = %d\n", nDevice );

char* file_name = argv [ 2 ];

 char* func_name = argv [ 3 ];

CUresult result = cuInit(0);

int deviceCount = 0;

 cuDeviceGetCount ( &deviceCount );

 printf ( "deviceCount = %d\n", deviceCount );

CUdevice cuDevice = 0;

 cuDeviceGet ( &cuDevice, nDevice );

CUcontext cuContext;

 cuCtxCreate ( &cuContext, 0, cuDevice );

cudaRuntimeGetVersion ( &rt_version );

 printf ( "Runtime version = %d.%d\n", rt_version/1000, rt_version%100 );

int* A = (int*) malloc ( N * sizeof ( int ) );

 for ( int i = 0; i < N; i++ )

  A [ i ] = -1;

 printf ( "Before ->\n" );

 for ( int i = 0; i < N; i++ )

  printf ( "%d\n", A [ i ] );

CUdeviceptr dA = cuMemAlloc ( &dA, N * sizeof ( int ) );

cuMemcpyHtoD(dA, A, N * sizeof ( int ) );

CUmodule cuModule;

 result = cuModuleLoad ( &cuModule, file_name );

 printf ( "ModuleLoad result = %d\n", result );

CUfunction cuFunc;

 result = cuModuleGetFunction ( &cuFunc, cuModule, func_name );

 printf ( "GetFunction result = %d\n", result );

int offset = 0;

 void* ptr;

ptr = (void*)(size_t)dA;

 ALIGN_UP(offset, __alignof(ptr));

result = cuParamSetv ( cuFunc, offset, &ptr, sizeof (ptr) );

 printf ( "ParamSetv result = %d\n", result );

 offset += sizeof(ptr);

result = cuParamSetSize ( cuFunc, offset );

 printf ( "ParamSetSize result = %d\n", result );

int threadsPerBlock = N;

 int blocksPerGrid   = 1;

result = cuFuncSetBlockShape ( cuFunc, threadsPerBlock, 1, 1);

 printf ( "FuncSetBlockShape result = %d\n", result );

result = cuLaunchGrid ( cuFunc, blocksPerGrid, 1 );

 printf ( "LaunchGrid result = %d\n", result );

cuMemcpyDtoH( A, dA, N * sizeof ( int ) );

printf ( "After ->\n" );

 for ( int i = 0; i < N; i++ )

  printf ( "%d\n", A [ i ] );

}

But after running of it we have

$ ./cuTest 1 test.ptx test

Device number = 1

deviceCount = 2

Runtime version = 3.20

Before ->

-1

-1

-1

-1

-1

-1

-1

-1

ModuleLoad result = 0

GetFunction result = 0

ParamSetv result = 0

ParamSetSize result = 0

FuncSetBlockShape result = 0

LaunchGrid result = 0

After ->

-1

-1

-1

-1

-1

-1

-1

-1

Again the result is wrong: there are no any modifications

in the input array.

And again when I have commented the fragment in the main program which prepares parameters,

the result is the same.

Any assumptions, please ?

Hi All,

I have the same problem. I am trying to write, compile and load some simple ptx files. I don’t know why but also if I do the “right things” I haven’t any result in output. I will appreciate an example because I have done some programs like the previous user but I haven’t the right output also if he whole program seems fine. I haven’t any error during the compilation phase when I use nvcc -fatbin -arch=compute_20 -code=arch_20 file.ptx . After that I use the procedure to load the module like the previous user.

The only difference is that I use cuLaunchKernel(function, 1, 1, 0, 1, 1, 1, 0, NULL, kernel_param, NULL) and I build kernel_param using void *kernel_param = { &d_x, &d_y, &d_z } where d_x, d_y and d_z are CUdeviceptr . I was studying - among many things - the vectorAddDrv and they use this method. I have seen that if you write your code in Cuda C and after the compilation phase open the generated ptx file there are always many things that you don’t wish to write in your ptx file.

In the ptx guide - for example - there are also instructions like .loc that the compiler is using but you don’t know how to use it in your code. I am saying this because I have no other idea because a simple program like this doesn’t work. I need to program in ptx but if I can’t succeed for simple examples like this then I am lost.

Here some tips which you can try out:

  1. Try compiling with nvcc for 32 bit machine.

  2. Try adding following line if it’s not already there in ptx:

    .address_size 32

Hi Skybuck,


Thanks for your reply. I created the filter.ptx file and the main.cu file.
I wrote the filter.ptx file putting inside it address_size 64.

I used two comands to compile the files:

  1. nvcc -fatbin -arch=compute_20 -code=sm_20 -m=32 filter.ptx
  2. nvcc -v -keep -arch=compute_20 -code=sm_20 -m=64 main.cu .

The whole thing was fine but as you know the program wasn’t working in the right way.


I have tried to follow our advices 1) and 2).
I did also sudo apt-get install libc6-dev-i386.
Now I have the following challenge when I do my 2):

/usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.5/libstdc++.so when searching for -lstdc++
/usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.5/libstdc++.a when searching for -lstdc++
/usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.5/libstdc++.so when searching for -lstdc++
/usr/bin/ld: skipping incompatible /usr/lib/gcc/x86_64-linux-gnu/4.4.5/libstdc++.a when searching for -lstdc++
/usr/bin/ld: cannot find -lstdc++

I will try to figure out what I will need to do.


FILTER.PTX FILE


.version 2.3
.target sm_20
.address_size 64

.entry addition ( .param .u64 add_x, .param .u64 add_y, .param .u64 add_z )
{
.reg .u64 %rd<5>;
.reg .f32 %f<5>;

ld	.param		.u64 	%rd1, 	[add_x];
ld	.param		.u64 	%rd2, 	[add_y];
ld	.param		.u64 	%rd3, 	[add_z];

ldu	.global		.f32 	%f1,   	[%rd1];
ldu	.global		.f32 	%f2, 	[%rd2];

add			.f32 	%f3,    %f1,      %f2;

st	.global		.f32 	[%rd3], %f3;

exit;

}


MAIN.CU FILE


/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// CUDA HEADERS
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#include “cuda.h”

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// C/C++ HEADERS
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

#include
#include
#include
#include <math.h>
#include <stdio.h>

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// NAMESPACES
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

using namespace std;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// MAIN PROGRAM
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

int main(){

cuInit(0);

CUdevice cuDevice;
cuDeviceGet(&cuDevice, 0);

CUcontext cuContext;
cuCtxCreate(&cuContext, 0, cuDevice);

CUmodule module;
cuModuleLoad(&module,“filter.fatbin”);
cout << "Module: " << cudaGetErrorString(cudaGetLastError()) << endl;

CUfunction function_filter;
cuModuleGetFunction(&function_filter,module,“addition”);
cout << "Function: " << cudaGetErrorString(cudaGetLastError()) << endl;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// HOST DATA
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

float * h_x = (float *) malloc(sizeof(float));
float * h_y = (float *) malloc(sizeof(float));
float * h_z = (float *) malloc(sizeof(float));

*h_x = 1.0;
*h_y = 2.0;
*h_z = 0.0;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// DEVICE DATA
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

CUdeviceptr d_x;
CUdeviceptr d_y;
CUdeviceptr d_z;

cuMemAlloc( &d_x , sizeof(float) );
cuMemAlloc( &d_y , sizeof(float) );
cuMemAlloc( &d_z , sizeof(float) );

cuMemcpyHtoD( d_x , h_x , sizeof(float) );
cuMemcpyHtoD( d_y , h_y , sizeof(float) );
cuMemcpyHtoD( d_z , h_z , sizeof(float) );

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

cout << "Before of Ptx Kernel: " << *h_x << " + " << *h_y << " = " << *h_z << endl;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// KERNEL DATA
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

void *kernel_param = { &d_x, &d_y, &d_z };

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// LAUNCH
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

cuLaunchKernel(function_filter, 1, 1, 0, 1, 1, 1, 0, NULL, kernel_param, NULL);
cout << "Launch: " << cudaGetErrorString(cudaGetLastError()) << endl;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// DEVICE TO HOST COPY
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

cuMemcpyDtoH( h_x , d_x , sizeof(float) );
cuMemcpyDtoH( h_y , d_y , sizeof(float) );
cuMemcpyDtoH( h_z , d_z , sizeof(float) );

cout << "Dopo del Ptx Kernel: " << *h_x << " + " << *h_y << " = " << *h_z << endl;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// END PROGRAM
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

cout << “Bye bye” << endl;
return 0;

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


OUTPUT


Module: no error
Function: no error

Before of Ptx Kernel: 1 + 2 = 0

Launch: no error

After of Ptx Kernel: 1 + 2 = 0

Bye bye


Any help is welcomed.
Thanks.

You should make sure the host code has the same “bitness” as the gpu code… especially when it comes to parameters.

For example what is a void * on the host side ? is it a 32 bit pointer or a 64 bit pointer ?

Try sizeof(void *) or something like that.

Once you are sure the parameters are of the correct size then you can look into other problems if there are any…

Also if you believe the bitness is correct, then try writing the kernel in cuda c and compile it, run it, see if that works.

Then compare it to yours to see what was different.

Hi Skybuck,

I agree, I already checked that.
The sizeof of a void * is 8 bytes so 64 bits.
I thought the same thing like you.
I also wrote a code in cuda c.


KERNEL CUDA C


global void addition( CUdeviceptr d_x, CUdeviceptr d_y, CUdeviceptr d_z){
((float) d_z) = ((float) d_x) + ((float) d_y);
}


I launched this kernel with addition <<< 1 , 1 >>> ( d_x , d_y , d_z )
This kernel, compiled, did the right thing, 1 + 2 = 3.
The ptx generated by nvcc for this code was the following.


PTX GENERATED BY NVCC FOR THE CUDA C KERNEL


.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12

//-----------------------------------------------------------
// Compiling main.cpp3.i (/tmp/ccBI#.DaLxx9)
//-----------------------------------------------------------

//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
//  Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
//  -O3	(Optimization level)
//  -g0	(Debug level)
//  -m2	(Report advisories)
//-----------------------------------------------------------

.file	1	"<command-line>"
.file	2	"main.cudafe2.gpu"
.file	3	"/usr/lib/gcc/x86_64-linux-gnu/4.4.5/include/stddef.h"
.file	4	"/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file	5	"/usr/local/cuda/bin/../include/host_defines.h"
.file	6	"/usr/local/cuda/bin/../include/builtin_types.h"
.file	7	"/usr/local/cuda/bin/../include/device_types.h"
.file	8	"/usr/local/cuda/bin/../include/driver_types.h"
.file	9	"/usr/local/cuda/bin/../include/surface_types.h"
.file	10	"/usr/local/cuda/bin/../include/texture_types.h"
.file	11	"/usr/local/cuda/bin/../include/vector_types.h"
.file	12	"/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file	13	"/usr/local/cuda/bin/../include/crt/storage_class.h"
.file	14	"/usr/include/bits/types.h"
.file	15	"/usr/include/time.h"
.file	16	"/usr/local/cuda/bin/../include/cuda.h"
.file	17	"main.cu"
.file	18	"/usr/local/cuda/bin/../include/common_functions.h"
.file	19	"/usr/local/cuda/bin/../include/math_functions.h"
.file	20	"/usr/local/cuda/bin/../include/math_constants.h"
.file	21	"/usr/local/cuda/bin/../include/device_functions.h"
.file	22	"/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file	23	"/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file	24	"/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file	25	"/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
.file	26	"/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
.file	27	"/usr/local/cuda/bin/../include/surface_functions.h"
.file	28	"/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file	29	"/usr/local/cuda/bin/../include/math_functions_dbl_ptx3.h"


.entry _Z15addition_normalyyy (
	.param .u64 __cudaparm__Z15addition_normalyyy_d_x,
	.param .u64 __cudaparm__Z15addition_normalyyy_d_y,
	.param .u64 __cudaparm__Z15addition_normalyyy_d_z)
{
.reg .u64 %rd<5>;
.reg .f32 %f<5>;
.loc	17	27	0

$LDWbegin__Z15addition_normalyyy:
.loc 17 29 0
ld.param.u64 %rd1, [__cudaparm__Z15addition_normalyyy_d_x];
ld.global.f32 %f1, [%rd1+0];
ld.param.u64 %rd2, [__cudaparm__Z15addition_normalyyy_d_y];
ld.global.f32 %f2, [%rd2+0];
add.f32 %f3, %f1, %f2;
ld.param.u64 %rd3, [__cudaparm__Z15addition_normalyyy_d_z];
st.global.f32 [%rd3+0], %f3;
.loc 17 31 0
exit;
$LDWend__Z15addition_normalyyy:
} // _Z15addition_normalyyy


Now,this is a challenge.
I can understand almost anything in ptx.
But not the .loc instruction.
In the manual there isn’t a real explanation for the numbers after .loc .
.loc is an instruction to “merge” the code inside the original file.
So the numbers are probably indications for the linker.
If I change this ptx it willn’t probably work anymore - for the numbers after .loc for the linker.
Also changing the ptx I need jump some compilation steps when I use nvcc.
I have to figure out how to “trick” the nvcc to link the ptx directly at the final code.
You can also see that nvcc is using a lot of .file not present in my code but I thinks that can be normal.


I also tried to add all the .file at my ptx code.
Nothing changed, it didn’t work.


If you have other ideas let me know.
I can modify the ptx generated by nvcc but a lot of things can go wrong.
Linking phase, numbers after the .loc instructions, …
Figure out what creates the challenge after the modification will be incredibly hard.