Passing an object to the device by using zero-copy memory

Hi,

I’m trying [1] to pass an object to the device by using pinned memory.
But I’m not able to compile and link these files together.
My source looks like the following:

MyClass.cpp

#include <cuda_runtime.h>
// Minimal value holder callable from both host and device code,
// used for the zero-copy / pinned-memory example.
class MyClass
{
public:
	int value;

	// Default-construct with value == 0.
	__device__ __host__ MyClass() : value(0) {
	}
	// Construct with an explicit initial value.
	__device__ __host__ MyClass(int v) : value(v) {
	}
	// Replace the stored value.
	__device__ __host__ void setValue(int v) {
		value = v;
	}
	// Read back the stored value.
	__device__ __host__ int getValue() {
		return value;
	}
	// Trivial destructor; nothing to release.
	__device__ __host__ ~MyClass() {
	}
};

MyClass.h

#ifndef MYCLASS_H_
#define MYCLASS_H_

class MyClass {
public:
	// Stored value. Must be declared here so that sizeof(MyClass) in
	// translation units that only see this header matches the definition
	// in MyClass.cpp — the original header omitted it (an ODR violation),
	// which also made cudaHostAlloc(sizeof(MyClass)) allocate the wrong size.
	int value;

	__device__ __host__ MyClass();
	__device__ __host__ MyClass(int v);
	__device__ __host__ void setValue(int v);
	__device__ __host__ int getValue();
	// Non-virtual: the out-of-line definition is non-virtual, and a vtable
	// pointer set up by a host-side constructor would be a host address —
	// invalid when the object is dereferenced on the device.
	__device__ __host__ ~MyClass();
};

#endif /* MYCLASS_H_ */

passing_object_to_kernel.cu

// Single-thread kernel: prints the object's current value via cuPrintf,
// then increments it in place through the zero-copy mapping.
__global__ void device_method(MyClass *d_object) {

	int current = d_object->getValue();
	cuPrintf("Device object value: %d\n", current);
	d_object->setValue(current + 1);
}

int main(void) {

	MyClass *host_object;
	MyClass *device_object;

	// Allow pinned host allocations to be mapped into the device address space.
	checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost));

	// Allocate the object in pinned, mapped (zero-copy) host memory.
	checkCuda(
			cudaHostAlloc((void**) &host_object, sizeof(MyClass),
					cudaHostAllocWriteCombined | cudaHostAllocMapped));

	// BUG FIX: the original did "host_object = new MyClass(1);" here, which
	// overwrote the pinned pointer with a plain heap pointer — leaking the
	// pinned allocation and handing cudaHostGetDevicePointer an unmapped
	// address. Initialize the pinned object in place instead.
	host_object->setValue(1);
	printf("Host object value: %d\n", host_object->getValue());

	// Obtain the device-side alias of the pinned allocation.
	checkCuda(cudaHostGetDevicePointer(&device_object, host_object, 0));

	cudaPrintfInit();

	// Single-thread launch; the kernel updates the object through zero-copy.
	device_method<<<1, 1>>>(device_object);
	// Catch launch-configuration errors, then wait for completion:
	// kernel launches are asynchronous, so without a sync the host may
	// read host_object before the device has written it.
	checkCuda(cudaGetLastError());
	checkCuda(cudaDeviceSynchronize());

	cudaPrintfDisplay();
	cudaPrintfEnd();

	printf("Host object value: %d (after gpu execution)\n",
			host_object->getValue());

	// Release the pinned allocation (the original leaked it).
	checkCuda(cudaFreeHost(host_object));

	return 0;
}

I have tried the following compilation steps and got this Undefined symbols error:

nvcc -c src/MyClass.cpp -o bin/MyClass.o
nvcc -g -O2 -dc -gencode arch=compute_20,code=sm_20 -I/usr/local/cuda/include src/passing_object_to_kernel.cu -o bin/passing_object_to_kernel.o
nvcc -I./src -L./bin/ bin/MyClass.o bin/passing_object_to_kernel.o -o PassingObjectToKernel
Undefined symbols for architecture i386:
  "MyClass::getValue()", referenced from:
      _main in passing_object_to_kernel.o
  "MyClass::MyClass(int)", referenced from:
      _main in passing_object_to_kernel.o
  "___cudaRegisterLinkedBinary_59_tmpxft_000040ee_00000000_6_passing_object_to_kernel_cpp1_ii_51e07f2f", referenced from:
      global constructors keyed to _Z8cuPrintfPKcin passing_object_to_kernel.o
ld: symbol(s) not found for architecture i386
collect2: ld returned 1 exit status

Thanks for your help!!

[1] https://github.com/millecker/applications/tree/master/CUDA/PinnedMemory/PassingObjectToKernel

...
checkCuda(cudaHostAlloc((void**) &host_object, sizeof(MyClass),cudaHostAllocWriteCombined | cudaHostAllocMapped));

host_object = new MyClass(1);
...

Why do You call the constructor after cudaHostAlloc, both on the same ‘host_object’ object? Is this supposed to be so?

MK

P.S.
Isn’t this zero-copy memory maybe? I can’t find any calls to ‘cudaMemcpy’ between host and device objects in Your code…

Hi,
yes you are right, I’m using zero-copy memory or pinned memory.

I thought it was best to allocate the memory for the object first and then initialize it.
You are right, I should remove executing the constructor!

Finally I got the example working.
The C++ MyClass must be defined inline within the .cu file.
There seems to be no way to include only the header and link MyClass.o in separately.

#include <assert.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "util/cuPrintf.cu"

// Minimal value holder callable from both host and device code; lives in
// pinned, device-mapped host memory in this example.
class MyClass {
public:
	int value;

	// Default-construct with value == 0.
	__device__ __host__ MyClass() : value(0) {
	}
	// Construct with an explicit initial value.
	__device__ __host__ MyClass(int v) : value(v) {
	}
	// Replace the stored value.
	__device__ __host__ void setValue(int v) {
		value = v;
	}
	// Read back the stored value.
	__device__ __host__ int getValue() {
		return value;
	}
	// Trivial destructor; nothing to release.
	__device__ __host__ ~MyClass() {
	}
};

// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
// Wrap any CUDA runtime API call to surface failures in debug builds.
// Returns the status unchanged so calls can stay inline; in release
// builds this compiles down to a pass-through.
inline cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
	if (cudaSuccess != result) {
		fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
		assert(result == cudaSuccess);
	}
#endif
	return result;
}

// Single-thread kernel: prints the object's current value via cuPrintf,
// then increments it in place through the zero-copy mapping.
__global__ void device_method(MyClass *d_object) {

	int current = d_object->getValue();
	cuPrintf("Device object value: %d\n", current);
	d_object->setValue(current + 1);
}

int main(void) {

	// Check that the device supports mapping host memory at all.
	cudaDeviceProp prop;
	int whichDevice;
	checkCuda(cudaGetDevice(&whichDevice));
	checkCuda(cudaGetDeviceProperties(&prop, whichDevice));
	if (prop.canMapHostMemory != 1) {
		printf("Device cannot map memory \n");
		return 0;
	}

	MyClass *host_object;
	MyClass *device_object;

	// Runtime must be placed into a state enabling zero-copy buffers,
	// before any other CUDA call creates the context.
	checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost));

	// Allocate the object in pinned, mapped (zero-copy) host memory.
	checkCuda(
			cudaHostAlloc((void**) &host_object, sizeof(MyClass),
					cudaHostAllocWriteCombined | cudaHostAllocMapped));

	// Initialize the pinned object in place (no separate heap object).
	host_object->setValue(1);
	printf("Host object value: %d\n", host_object->getValue());

	// Obtain the device-side alias of the pinned allocation.
	checkCuda(cudaHostGetDevicePointer(&device_object, host_object, 0));

	// Initialize cuPrintf; it returns a cudaError_t, so check it too.
	checkCuda(cudaPrintfInit());

	// Launch a kernel with a single thread.
	device_method<<<1, 1>>>(device_object);
	// Catch launch-configuration errors, then wait for completion:
	// the launch is asynchronous, so without a sync the host may read
	// host_object before the device has written the new value.
	checkCuda(cudaGetLastError());
	checkCuda(cudaDeviceSynchronize());

	// Display the device's output.
	cudaPrintfDisplay();
	// Clean up after cuPrintf.
	cudaPrintfEnd();

	printf("Host object value: %d (after gpu execution)\n",
			host_object->getValue());

	// Release the pinned allocation (the original leaked it).
	checkCuda(cudaFreeHost(host_object));

	return 0;
}

How can I create an atomic block or some transactions in CUDA?

I want to have the following method atomic.
Only one thread within a block should be able to execute it.

__device__ __host__ void setValue(int v) {
		value = v;
	}

There is an atomic exchange function, ‘atomicExch’. It may help but You should read more about it first. It can be tricky to use.

MK

Thanks, I know about the atomic operations of CUDA.
But I do not just want to exchange or increment a single variable.

I want a whole block or method to be atomic.
Is this possible? (something like synchronized in Java)

e.g.,

atomic {
...
}

There is no such function/operator/directive in CUDA C that can do this, but you can build a mutex lock — try this link. Be wary though: on some GPUs a wrongly written mutex lock can deadlock the device, resulting in the need for a hard reset.

MK