Constant memory when having more than one file external does not work

Hi! I’m trying to use the constant memory. I have found some posts about it but I can’t get it to work. My problem is that I use several files. Hopefully someone can help me. This is what I have so far:

In a header file I have a struct
typedef struct{
int val1;
.
} Struct1;

In a .cpp file I have the host code
.
cudaMemcpyToSymbol(&d_struct1, &h_struct1, sizeof(Struct1), cudaMemcpyHostToDevice);
.

In a .cu file I intend to use the struct like
.
int fromConstMem = d_struct1.val1;
.

I don’t know where to put constant Struct1 d_struct1[1]; I have read something about wrapper functions without understanding it. I’ve also read about putting device in front of constant, use “d_struct” instead of &d_struct in the host code and use d_struct instead of d_struct[1] when declaring. The programming guide doesn’t give me any satisfying help. Any explanation is appreciated!

Thank you!

CUDA does not have a linker on the device side, so everything needs to go into the same .cu file (or files [font=“Courier New”]#include[/font]d from it).

Put [font=“Courier New”]constant Struct1 d_struct1;[/font] into the same .cu file where the kernel is that uses it. Also move the code to initialize it into a host function in the same file. Then call this host function from where the initialization previously was. (Other initialization methods using the driver API are possible, but more complicated.)

You indeed have to use [font=“Courier New”]cudaMemcpyToSymbol(d_struct1, &h_struct1, sizeof(Struct1), cudaMemcpyHostToDevice);[/font] (without the “&” in front of d_struct1), as it is not possible to take the address of a device variable on the host (other than using the runtime library functions dedicated to that task).

CUDA does not have a linker on the device side, so everything needs to go into the same .cu file (or files [font=“Courier New”]#include[/font]d from it).

Put [font=“Courier New”]constant Struct1 d_struct1;[/font] into the same .cu file where the kernel is that uses it. Also move the code to initialize it into a host function in the same file. Then call this host function from where the initialization previously was. (Other initialization methods using the driver API are possible, but more complicated.)

You indeed have to use [font=“Courier New”]cudaMemcpyToSymbol(d_struct1, &h_struct1, sizeof(Struct1), cudaMemcpyHostToDevice);[/font] (without the “&” in front of d_struct1), as it is not possible to take the address of a device variable on the host (other than using the runtime library functions dedicated to that task).

Try

[codebox]#include <stdio.h>

#include “cutil_inline.h”

typedef struct{

char test [7];

int val1;

} Struct1;

Struct1 hoststruct={“foobar”,33};

device constant Struct1 conststruct;

global void testkernel( Struct1 *b )

{

b->val1 = conststruct.val1;

memcpy(b->test,conststruct.test,sizeof(b->test));

}

int main()

{

Struct1 *devicestruct, check;

cudaError_t cerr;

cudaMalloc( &devicestruct, sizeof(*devicestruct) );

cerr = cudaMemcpyToSymbol( "conststruct", &hoststruct, sizeof(hoststruct), 0, cudaMemcpyHostToDevice );

if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));

testkernel <<< 1, 1 >>> ( devicestruct );

cutilCheckMsg( "Kernel execution failed" );

cudaMemcpy( &check , devicestruct, sizeof(check), cudaMemcpyDeviceToHost );

printf( "%s %s, %6d %6d\n", hoststruct.test, check.test, hoststruct.val1, check.val1 );

printf( "%s\n", hoststruct.val1 == check.val1 ? "Passed" : "Failed" );

cudaFree( devicestruct );

}[/codebox]

The string introduces some nasty alignment issues, but the compiler (3.1) takes perfectly good care of that.

If you want to use a .cpp file, create a C (wrapper) function in you .cu file with the kernel which calls the kernel and can be called from your .cpp.

Using device constant is a bit over the top, as constant will do fine, but problems have been reported (using earlier versions of the compiler, I think).

Using a .cpp will cause some declaration hassles, but you need .cu to get the constant and global compiled correctly.

Jan

Try

[codebox]#include <stdio.h>

#include “cutil_inline.h”

typedef struct{

char test [7];

int val1;

} Struct1;

Struct1 hoststruct={“foobar”,33};

device constant Struct1 conststruct;

global void testkernel( Struct1 *b )

{

b->val1 = conststruct.val1;

memcpy(b->test,conststruct.test,sizeof(b->test));

}

int main()

{

Struct1 *devicestruct, check;

cudaError_t cerr;

cudaMalloc( &devicestruct, sizeof(*devicestruct) );

cerr = cudaMemcpyToSymbol( "conststruct", &hoststruct, sizeof(hoststruct), 0, cudaMemcpyHostToDevice );

if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));

testkernel <<< 1, 1 >>> ( devicestruct );

cutilCheckMsg( "Kernel execution failed" );

cudaMemcpy( &check , devicestruct, sizeof(check), cudaMemcpyDeviceToHost );

printf( "%s %s, %6d %6d\n", hoststruct.test, check.test, hoststruct.val1, check.val1 );

printf( "%s\n", hoststruct.val1 == check.val1 ? "Passed" : "Failed" );

cudaFree( devicestruct );

}[/codebox]

The string introduces some nasty alignment issues, but the compiler (3.1) takes perfectly good care of that.

If you want to use a .cpp file, create a C (wrapper) function in you .cu file with the kernel which calls the kernel and can be called from your .cpp.

Using device constant is a bit over the top, as constant will do fine, but problems have been reported (using earlier versions of the compiler, I think).

Using a .cpp will cause some declaration hassles, but you need .cu to get the constant and global compiled correctly.

Jan

I see this as a little bit of an advantage for my purposes, because it allows to split my CUDA code into separate modules, where each one has the full 64kb of constant memory available to itself.

I see this as a little bit of an advantage for my purposes, because it allows to split my CUDA code into separate modules, where each one has the full 64kb of constant memory available to itself.

Wow! Thank you all for fast and good advices. I don’t have more time today but the code actually compiles at the moment. Huge step forward! The wrap around method seems like the easiest way to start with. Will continue to investigate tomorrow.

Wow! Thank you all for fast and good advices. I don’t have more time today but the code actually compiles at the moment. Huge step forward! The wrap around method seems like the easiest way to start with. Will continue to investigate tomorrow.

by the way i have a question about the constant memory and “allocation in the same file”.

I am executing a kernel every frame, and if i point to the constant memory every frame (see following code) - it works

// constant memory

	__constant__ Sphere sphereListConstant[SPHERE_LIST_SIZE];

	// --------------------------------------------------------------------------

	// main kernel which calls the ray tracer functions

	__global__ void g_CUDARTKernel(uchar4 *pixelBuffer, Core* core, int width, int height, float pixelSize, World* world)

	{

		// world content pointers point to the content in constant memory.

		// no solution found yet to do this just once - probably not supported by constant memory.

		// but this method is just a little bit less efficient than passing the pointers as parameters.

		// world object seems to lose constant pointer informations every iteration

		world->_sphereList = sphereListConstant;

	...

		}

The problem: in this code sample, the pointers are set every frame (inefficient) - to point to the constant memory.

But I just want to do this ONCE in the beginning.

I tried to do that with another kernel which i run at the beginning, but the pointer information is empty when i execute the main kernel.

Does anybody know, why this happens?

Thanks

by the way i have a question about the constant memory and “allocation in the same file”.

I am executing a kernel every frame, and if i point to the constant memory every frame (see following code) - it works

// constant memory

	__constant__ Sphere sphereListConstant[SPHERE_LIST_SIZE];

	// --------------------------------------------------------------------------

	// main kernel which calls the ray tracer functions

	__global__ void g_CUDARTKernel(uchar4 *pixelBuffer, Core* core, int width, int height, float pixelSize, World* world)

	{

		// world content pointers point to the content in constant memory.

		// no solution found yet to do this just once - probably not supported by constant memory.

		// but this method is just a little bit less efficient than passing the pointers as parameters.

		// world object seems to lose constant pointer informations every iteration

		world->_sphereList = sphereListConstant;

	...

		}

The problem: in this code sample, the pointers are set every frame (inefficient) - to point to the constant memory.

But I just want to do this ONCE in the beginning.

I tried to do that with another kernel which i run at the beginning, but the pointer information is empty when i execute the main kernel.

Does anybody know, why this happens?

Thanks

[quote name=‘Beteigeuze’ post=‘1106259’ date=‘Aug 19 2010, 10:51 PM’]

by the way i have a question about the constant memory and “allocation in the same file”.

I am executing a kernel every frame, and if i point to the constant memory every frame (see following code) - it works

[codebox]#include

#include “cutil_inline.h”

//#include “cuprintf.cu”

typedef struct{

char test [7];

int val1;

} Struct1;

typedef struct {

Struct1 *stru1;

int somedummies[4];

} Struct2;

Struct1 hoststruct={“foobar”,33};

constant Struct1 conststruct;

global void testkernel( Struct1 *b, Struct2 *c )

{

//cuPrintf("Kernel: pointer %p\n",c->stru1);

b->val1 = c->stru1->val1;							// use pointer in struct2 to get values in constant struct1

memcpy(b->test,c->stru1->test,sizeof(b->test));

}

int main()

{

Struct1 *devicestruct=0, check;

Struct2 *test=0;

void *devptr;

cudaError_t cerr;

cudaMalloc( &devicestruct, sizeof(hoststruct) );

cerr = cudaMemcpyToSymbol( "conststruct", &hoststruct, sizeof(hoststruct), 0, cudaMemcpyHostToDevice );

if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));

cudaMalloc( &test, sizeof(*test) );

cerr = cudaGetSymbolAddress (&devptr, "conststruct");			// cannot do this directly to &test->stru1, need cudaMemcpy() !!

if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));

printf("Host: pointer to constant struct: %p\n",devptr);

cudaMemcpy(&test->stru1,&devptr,sizeof(test->stru1),cudaMemcpyHostToDevice);

//cudaPrintfInit();

testkernel <<< 1, 1 >>> ( devicestruct, test );

testkernel <<< 1, 1 >>> ( devicestruct, test );		// repeat to show pointer is retained

cutilCheckMsg( "Kernel execution failed" );

cudaThreadSynchronize();

//cudaPrintfDisplay(stdout, true);

//cudaPrintfEnd();

cudaMemcpy( &check , devicestruct, sizeof(check), cudaMemcpyDeviceToHost );

printf( "%s %s, %6d %6d\n", hoststruct.test, check.test, hoststruct.val1, check.val1 );

printf( "%s\n", hoststruct.val1 == check.val1 ? "Passed" : "Failed" );

cudaFree( devicestruct );

cudaFree( test );

}[/codebox]

[quote name=‘Beteigeuze’ post=‘1106259’ date=‘Aug 19 2010, 10:51 PM’]

by the way i have a question about the constant memory and “allocation in the same file”.

I am executing a kernel every frame, and if i point to the constant memory every frame (see following code) - it works

[codebox]#include

#include “cutil_inline.h”

//#include “cuprintf.cu”

typedef struct{

char test [7];

int val1;

} Struct1;

typedef struct {

Struct1 *stru1;

int somedummies[4];

} Struct2;

Struct1 hoststruct={“foobar”,33};

constant Struct1 conststruct;

global void testkernel( Struct1 *b, Struct2 *c )

{

//cuPrintf("Kernel: pointer %p\n",c->stru1);

b->val1 = c->stru1->val1;							// use pointer in struct2 to get values in constant struct1

memcpy(b->test,c->stru1->test,sizeof(b->test));

}

int main()

{

Struct1 *devicestruct=0, check;

Struct2 *test=0;

void *devptr;

cudaError_t cerr;

cudaMalloc( &devicestruct, sizeof(hoststruct) );

cerr = cudaMemcpyToSymbol( "conststruct", &hoststruct, sizeof(hoststruct), 0, cudaMemcpyHostToDevice );

if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));

cudaMalloc( &test, sizeof(*test) );

cerr = cudaGetSymbolAddress (&devptr, "conststruct");			// cannot do this directly to &test->stru1, need cudaMemcpy() !!

if( cerr != cudaSuccess ) puts(cudaGetErrorString( cerr ));

printf("Host: pointer to constant struct: %p\n",devptr);

cudaMemcpy(&test->stru1,&devptr,sizeof(test->stru1),cudaMemcpyHostToDevice);

//cudaPrintfInit();

testkernel <<< 1, 1 >>> ( devicestruct, test );

testkernel <<< 1, 1 >>> ( devicestruct, test );		// repeat to show pointer is retained

cutilCheckMsg( "Kernel execution failed" );

cudaThreadSynchronize();

//cudaPrintfDisplay(stdout, true);

//cudaPrintfEnd();

cudaMemcpy( &check , devicestruct, sizeof(check), cudaMemcpyDeviceToHost );

printf( "%s %s, %6d %6d\n", hoststruct.test, check.test, hoststruct.val1, check.val1 );

printf( "%s\n", hoststruct.val1 == check.val1 ? "Passed" : "Failed" );

cudaFree( devicestruct );

cudaFree( test );

}[/codebox]

Without driver API, I think there is also the possibility to use cudaGetSymbolAdress() ine the file that declares the constant variable, allowing to manage constant memory from any class or function that have access the returned pointer.

Without driver API, I think there is also the possibility to use cudaGetSymbolAdress() ine the file that declares the constant variable, allowing to manage constant memory from any class or function that have access the returned pointer.

I’m surprised your code works at all - you are modifying a structure in constant memory, which means it is undefined whether you receive (and modify) the cache or the copy in memory. I guess this also is what happens when you try to modify the world pointer from a kernel. Constant memory should only be modified from the host.

That’s actually what I thought of - I had forgotten that cudaGetSymbolAdress() is part of the runtime API, not driver API.

I’m surprised your code works at all - you are modifying a structure in constant memory, which means it is undefined whether you receive (and modify) the cache or the copy in memory. I guess this also is what happens when you try to modify the world pointer from a kernel. Constant memory should only be modified from the host.

That’s actually what I thought of - I had forgotten that cudaGetSymbolAdress() is part of the runtime API, not driver API.

i have a very similar question, but it has to do with global memory . I am trying to do some memory allocation

[codebox]

///// simpleClass.cu//////////////

class simpleClass {

int *array;

}

device simpleClass *test;

//////////////////////////////////////

/////////Kernel.cu /////////////

#include "simpleClass.cu

host_function()

{

 simpleClass *temp;

 int  *temp_array;

cudaMalloc(&temp,(size_t)(sizeof(simpleClass)*100));

 cudaMemcpyToSymbol(test,&temp,sizeof(simpleClass*),0,cudaMemcpyHostToDevice);

for (i=0;i<100;i++

 {

      cudaMalloc(&temp_array,(size_t)(sizeof(int)*100));

      cudaMemcpyToSymbol(test->array,&temp_array,sizeof(int*),0,cudaMemcpyHostToDevice);

  }

}

[/codebox]

On the SECOND cudaMemcpyToSymbol , i get an invalid device symbol error . Any suggestions on how i could implement this?

i have a very similar question, but it has to do with global memory . I am trying to do some memory allocation

[codebox]

///// simpleClass.cu//////////////

class simpleClass {

int *array;

}

device simpleClass *test;

//////////////////////////////////////

/////////Kernel.cu /////////////

#include "simpleClass.cu

host_function()

{

 simpleClass *temp;

 int  *temp_array;

cudaMalloc(&temp,(size_t)(sizeof(simpleClass)*100));

 cudaMemcpyToSymbol(test,&temp,sizeof(simpleClass*),0,cudaMemcpyHostToDevice);

for (i=0;i<100;i++

 {

      cudaMalloc(&temp_array,(size_t)(sizeof(int)*100));

      cudaMemcpyToSymbol(test->array,&temp_array,sizeof(int*),0,cudaMemcpyHostToDevice);

  }

}

[/codebox]

On the SECOND cudaMemcpyToSymbol , i get an invalid device symbol error . Any suggestions on how i could implement this?

[codebox]

///// simpleClass.cu//////////////

class simpleClass {

public:

 int *array;

};

device simpleClass *test;

//////////////////////////////////////

/////////Kernel.cu /////////////

#include “simpleClass.cu”

host_function()

{

 simpleClass *temp;

 int  *temp_array;

cudaMalloc(&temp,(size_t)(sizeof(simpleClass)*100));

 cudaMemcpyToSymbol(test,&temp,sizeof(simpleClass*),0,cudaMemcpyHostToDevice);

cudaMalloc(&temp_array, (size_t)(sizeof(int)*100));

 cudaMemcpy(&(temp->array), &temp_array, sizeof(temp->array), cudaMemcpyHostToDevice);

}

[/codebox]

or, for a statically allocated class

[codebox]

/////////Kernel.cu /////////////

#include “simpleClass.cu”

device simpleClass stat;

host_function()

{

 int  *temp_array;

cudaMalloc(&temp_array, (size_t)(sizeof(int)*100));

 cudaMemcpyToSymbol(stat, &temp_array, sizeof(stat.array), offsetof(simpleClass, array), cudaMemcpyHostToDevice);

}

[/codebox]