CUDA program crashes at function call

Show us some more code: how is the world object created? If the above code works, world obviously is a host object. How do you create the device object? How do you call your kernel with the device object? How do you invoke the _host device code on the host (obviously there has to be some host code that does the equivalent of the code at the top level of your kernel).

If you want a way to access two separate sets of global variables (one on the host, one on the device) while reusing the same code on the host and the device, you can use my evil construct

[font=“Courier New”]

#ifdef CUDA_ARCH

define host__device device

#else

define host__device

#endif

[/font]

but make sure you understand all of it’s implications. Here is some sample code:

#include <stdio.h>

#ifdef __CUDA_ARCH__

# define __host__device__ __device__

#else

# define __host__device__

#endif

__host__device__ int var = 1;

__device__ int result;

__host__ __device__ int getvar(void)

{

	return var;

}

__global__ void kernel(void)

{

	result = getvar();

}

int main(void)

{

	int r;

	var = 2;

	printf("Host:   var = %d\n", getvar());

	kernel<<<1, 1>>>();

	cudaMemcpyFromSymbol(&r, result, sizeof(r));

	printf("Device: var = %d\n", r);

	return 0;

}

DISCLAIMER: Nvidia probably hasn’t made this the default for a good reason. Use this only if you want to reuse code for running on either host or device that uses global variables, and if you understand all it’s implications. I am not liable if you melt you device and/or host by using this code.

Show us some more code: how is the world object created? If the above code works, world obviously is a host object. How do you create the device object? How do you call your kernel with the device object? How do you invoke the _host device code on the host (obviously there has to be some host code that does the equivalent of the code at the top level of your kernel).

If you want a way to access two separate sets of global variables (one on the host, one on the device) while reusing the same code on the host and the device, you can use my evil construct

[font=“Courier New”]

#ifdef CUDA_ARCH

define host__device device

#else

define host__device

#endif

[/font]

but make sure you understand all of it’s implications. Here is some sample code:

#include <stdio.h>

#ifdef __CUDA_ARCH__

# define __host__device__ __device__

#else

# define __host__device__

#endif

__host__device__ int var = 1;

__device__ int result;

__host__ __device__ int getvar(void)

{

	return var;

}

__global__ void kernel(void)

{

	result = getvar();

}

int main(void)

{

	int r;

	var = 2;

	printf("Host:   var = %d\n", getvar());

	kernel<<<1, 1>>>();

	cudaMemcpyFromSymbol(&r, result, sizeof(r));

	printf("Device: var = %d\n", r);

	return 0;

}

DISCLAIMER: Nvidia probably hasn’t made this the default for a good reason. Use this only if you want to reuse code for running on either host or device that uses global variables, and if you understand all it’s implications. I am not liable if you melt you device and/or host by using this code.

tera, you are very keen-eyed!
indeed, programming the whole day made me week to find this problem ;)
all my member variables are on the device now, but the world object is still allocated in the host - i am so stupid ;)
I hope i can fix it now and don’t have to use your evil-method ;) but thanks anyway for it - maybe i will use it someday! i could have used it for example before i restructured all my code - haha ;)

thanks both to you, oneill and tera to figure out my problem

tera, you are very keen-eyed!
indeed, programming the whole day made me week to find this problem ;)
all my member variables are on the device now, but the world object is still allocated in the host - i am so stupid ;)
I hope i can fix it now and don’t have to use your evil-method ;) but thanks anyway for it - maybe i will use it someday! i could have used it for example before i restructured all my code - haha ;)

thanks both to you, oneill and tera to figure out my problem

usually i don’t post full code, but because i am in a good mood and this problem took so much time i post my full worl object and how it can be used for both - cpu and gpu. a good sideeffect: now i know everything about memory de-/allocation ;)

#ifndef __CUDA_WORLD__

#define __CUDA_WORLD__

#include "RGBColor.cu"

#include "ShadeRec.cu"

#include "Pinhole.cu"

#include "Ambient.cu"

#include "PointLight.cu"

#include "ViewPlane.cu"

#include "Matte.cu"

#include "Sphere.cu"

#include <cutil_inline.h>

namespace CUDARTracer

{

	class World

	{	

	private:

		World*			_devicePointer;

	public:

		Ambient*		_ambientLight;

		Pinhole*		_pinhole;

		ViewPlane*		_viewPlane;

		PointLight*		_pointLightList;

		Matte*			_matteList;

		Sphere*			_sphereList;

		bool			isDeviceCopy;

	public:

		// --------------------------------------------------------------------------

		// constructor

		World()

		{

			// is host world

			_devicePointer = NULL;

			isDeviceCopy = false;

			_ambientLight = new Ambient();

			_pinhole	  = new Pinhole();

			_viewPlane	  = new ViewPlane();

			// allocate host memory

			_pointLightList = (PointLight*) malloc(sizeof(PointLight) * POINTLIGHT_LIST_SIZE);

			_matteList		= (Matte*) malloc(sizeof(Matte) * MATTE_LIST_SIZE);

			_sphereList		= (Sphere*) malloc(sizeof(Sphere) * SPHERE_LIST_SIZE);

			// initialise objects

			for (int i = 0; i < POINTLIGHT_LIST_SIZE; ++i)

				_pointLightList[i].PointLight::PointLight();

			for (int i = 0; i < MATTE_LIST_SIZE; ++i)

				_matteList[i].Matte::Matte();

			for (int i = 0; i < SPHERE_LIST_SIZE; ++i)

				_sphereList[i].Sphere::Sphere();

		}

		// --------------------------------------------------------------------------

		// device copy. Does not allocate host memory anymore

		World(World* world)

		{

			isDeviceCopy = true;

			// allocate device memory

			cutilSafeCall( cudaMalloc( (void**) &_ambientLight, sizeof(Ambient)));

			cutilSafeCall( cudaMalloc( (void**) &_pinhole, sizeof(Pinhole)));

			cutilSafeCall( cudaMalloc( (void**) &_viewPlane, sizeof(ViewPlane)));

			cutilSafeCall( cudaMalloc( (void**) &_pointLightList, sizeof(PointLight) * POINTLIGHT_LIST_SIZE));

			cutilSafeCall( cudaMalloc( (void**) &_matteList, sizeof(Matte) * MATTE_LIST_SIZE));

			cutilSafeCall( cudaMalloc( (void**) &_sphereList, sizeof(Sphere) * SPHERE_LIST_SIZE));

			// copy host memory to device

			cutilSafeCall( cudaMemcpy( _ambientLight, world->_ambientLight,

				sizeof(Ambient), cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _pinhole, world->_pinhole,

				sizeof(Pinhole), cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _viewPlane, world->_viewPlane,

				sizeof(ViewPlane), cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _pointLightList, world->_pointLightList,

				sizeof(PointLight) * POINTLIGHT_LIST_SIZE, cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _matteList, world->_matteList,

				sizeof(Matte) * MATTE_LIST_SIZE, cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _sphereList, world->_sphereList,

				sizeof(Sphere) * SPHERE_LIST_SIZE, cudaMemcpyHostToDevice));

			// is copy of the world members stored in device memory

			cutilSafeCall( cudaMalloc( (void**) &_devicePointer, sizeof(World)));

			cutilSafeCall( cudaMemcpy( _devicePointer, this,

				sizeof(World), cudaMemcpyHostToDevice));

		}

		// --------------------------------------------------------------------------

		// destructor

		~World()

		{

			// if it is the host world

			if (!isDeviceCopy)

			{

				delete _ambientLight;

				delete _pinhole;

				delete _viewPlane;

				// call destructor of objects

				for (int i = 0; i < POINTLIGHT_LIST_SIZE; ++i)

					_pointLightList[i].PointLight::~PointLight();

				for (int i = 0; i < MATTE_LIST_SIZE; ++i)

					_matteList[i].Matte::~Matte();

				for (int i = 0; i < SPHERE_LIST_SIZE; ++i)

					_sphereList[i].Sphere::~Sphere();

				free(_pointLightList);

				free(_matteList);

				free(_sphereList);

			}

			else	// if it is the device world

			{

				cutilSafeCall(cudaFree(_ambientLight));

				cutilSafeCall(cudaFree(_pinhole));

				cutilSafeCall(cudaFree(_viewPlane));

				cutilSafeCall(cudaFree(_pointLightList));

				cutilSafeCall(cudaFree(_matteList));

				cutilSafeCall(cudaFree(_sphereList));

				cutilSafeCall(cudaFree(_devicePointer));

			}

		}

		// --------------------------------------------------------------------------

		// returns a pointer to the world in the device memory

		World* getDevicePointer()

		{

			return _devicePointer;

		}

...

Initialising:

World* world;						// holds the whole scenary in the host memory

			World* worldDevice;					// copy of world members in device memory - world object itself is on host

	// create the scenary for the ray tracer

	world = new World();

	world->build();

		// copy world information to device

		worldDevice = new World(world);

CPU usage:

core.render_scene(world);

GPU usage:

CUDARTKernel(worldDevice->getDevicePointer());

usually i don’t post full code, but because i am in a good mood and this problem took so much time i post my full worl object and how it can be used for both - cpu and gpu. a good sideeffect: now i know everything about memory de-/allocation ;)

#ifndef __CUDA_WORLD__

#define __CUDA_WORLD__

#include "RGBColor.cu"

#include "ShadeRec.cu"

#include "Pinhole.cu"

#include "Ambient.cu"

#include "PointLight.cu"

#include "ViewPlane.cu"

#include "Matte.cu"

#include "Sphere.cu"

#include <cutil_inline.h>

namespace CUDARTracer

{

	class World

	{	

	private:

		World*			_devicePointer;

	public:

		Ambient*		_ambientLight;

		Pinhole*		_pinhole;

		ViewPlane*		_viewPlane;

		PointLight*		_pointLightList;

		Matte*			_matteList;

		Sphere*			_sphereList;

		bool			isDeviceCopy;

	public:

		// --------------------------------------------------------------------------

		// constructor

		World()

		{

			// is host world

			_devicePointer = NULL;

			isDeviceCopy = false;

			_ambientLight = new Ambient();

			_pinhole	  = new Pinhole();

			_viewPlane	  = new ViewPlane();

			// allocate host memory

			_pointLightList = (PointLight*) malloc(sizeof(PointLight) * POINTLIGHT_LIST_SIZE);

			_matteList		= (Matte*) malloc(sizeof(Matte) * MATTE_LIST_SIZE);

			_sphereList		= (Sphere*) malloc(sizeof(Sphere) * SPHERE_LIST_SIZE);

			// initialise objects

			for (int i = 0; i < POINTLIGHT_LIST_SIZE; ++i)

				_pointLightList[i].PointLight::PointLight();

			for (int i = 0; i < MATTE_LIST_SIZE; ++i)

				_matteList[i].Matte::Matte();

			for (int i = 0; i < SPHERE_LIST_SIZE; ++i)

				_sphereList[i].Sphere::Sphere();

		}

		// --------------------------------------------------------------------------

		// device copy. Does not allocate host memory anymore

		World(World* world)

		{

			isDeviceCopy = true;

			// allocate device memory

			cutilSafeCall( cudaMalloc( (void**) &_ambientLight, sizeof(Ambient)));

			cutilSafeCall( cudaMalloc( (void**) &_pinhole, sizeof(Pinhole)));

			cutilSafeCall( cudaMalloc( (void**) &_viewPlane, sizeof(ViewPlane)));

			cutilSafeCall( cudaMalloc( (void**) &_pointLightList, sizeof(PointLight) * POINTLIGHT_LIST_SIZE));

			cutilSafeCall( cudaMalloc( (void**) &_matteList, sizeof(Matte) * MATTE_LIST_SIZE));

			cutilSafeCall( cudaMalloc( (void**) &_sphereList, sizeof(Sphere) * SPHERE_LIST_SIZE));

			// copy host memory to device

			cutilSafeCall( cudaMemcpy( _ambientLight, world->_ambientLight,

				sizeof(Ambient), cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _pinhole, world->_pinhole,

				sizeof(Pinhole), cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _viewPlane, world->_viewPlane,

				sizeof(ViewPlane), cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _pointLightList, world->_pointLightList,

				sizeof(PointLight) * POINTLIGHT_LIST_SIZE, cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _matteList, world->_matteList,

				sizeof(Matte) * MATTE_LIST_SIZE, cudaMemcpyHostToDevice));

			cutilSafeCall( cudaMemcpy( _sphereList, world->_sphereList,

				sizeof(Sphere) * SPHERE_LIST_SIZE, cudaMemcpyHostToDevice));

			// is copy of the world members stored in device memory

			cutilSafeCall( cudaMalloc( (void**) &_devicePointer, sizeof(World)));

			cutilSafeCall( cudaMemcpy( _devicePointer, this,

				sizeof(World), cudaMemcpyHostToDevice));

		}

		// --------------------------------------------------------------------------

		// destructor

		~World()

		{

			// if it is the host world

			if (!isDeviceCopy)

			{

				delete _ambientLight;

				delete _pinhole;

				delete _viewPlane;

				// call destructor of objects

				for (int i = 0; i < POINTLIGHT_LIST_SIZE; ++i)

					_pointLightList[i].PointLight::~PointLight();

				for (int i = 0; i < MATTE_LIST_SIZE; ++i)

					_matteList[i].Matte::~Matte();

				for (int i = 0; i < SPHERE_LIST_SIZE; ++i)

					_sphereList[i].Sphere::~Sphere();

				free(_pointLightList);

				free(_matteList);

				free(_sphereList);

			}

			else	// if it is the device world

			{

				cutilSafeCall(cudaFree(_ambientLight));

				cutilSafeCall(cudaFree(_pinhole));

				cutilSafeCall(cudaFree(_viewPlane));

				cutilSafeCall(cudaFree(_pointLightList));

				cutilSafeCall(cudaFree(_matteList));

				cutilSafeCall(cudaFree(_sphereList));

				cutilSafeCall(cudaFree(_devicePointer));

			}

		}

		// --------------------------------------------------------------------------

		// returns a pointer to the world in the device memory

		World* getDevicePointer()

		{

			return _devicePointer;

		}

...

Initialising:

World* world;						// holds the whole scenary in the host memory

			World* worldDevice;					// copy of world members in device memory - world object itself is on host

	// create the scenary for the ray tracer

	world = new World();

	world->build();

		// copy world information to device

		worldDevice = new World(world);

CPU usage:

core.render_scene(world);

GPU usage:

CUDARTKernel(worldDevice->getDevicePointer());