Passing an object to the device by using zero-copy memory

Hi,

I’m trying [1] to pass an object to the device by using pinned memory.
But I’m not able to compile and link these files together.
My source looks like the following:

MyClass.cpp

#include <cuda_runtime.h>
// Minimal value holder callable from both host and device code,
// used for the zero-copy / pinned-memory example.
class MyClass
{
public:
	int value;

	// Default-construct with value == 0.
	__device__ __host__ MyClass() : value(0) {
	}
	// Construct with an explicit initial value.
	__device__ __host__ MyClass(int v) : value(v) {
	}
	// Replace the stored value.
	__device__ __host__ void setValue(int v) {
		value = v;
	}
	// Read back the stored value.
	__device__ __host__ int getValue() {
		return value;
	}
	// Trivial destructor; nothing to release.
	__device__ __host__ ~MyClass() {
	}
};

MyClass.h

#ifndef MYCLASS_H_
#define MYCLASS_H_

class MyClass {
public:
	// Stored value. Must be declared here so that sizeof(MyClass) in
	// translation units that only see this header matches the definition
	// in MyClass.cpp — the original header omitted it (an ODR violation),
	// which also made cudaHostAlloc(sizeof(MyClass)) allocate the wrong size.
	int value;

	__device__ __host__ MyClass();
	__device__ __host__ MyClass(int v);
	__device__ __host__ void setValue(int v);
	__device__ __host__ int getValue();
	// Non-virtual: the out-of-line definition is non-virtual, and a vtable
	// pointer set up by a host-side constructor would be a host address —
	// invalid when the object is dereferenced on the device.
	__device__ __host__ ~MyClass();
};

#endif /* MYCLASS_H_ */

passing_object_to_kernel.cu

// Single-thread kernel: prints the object's current value via cuPrintf,
// then increments it in place through the zero-copy mapping.
__global__ void device_method(MyClass *d_object) {

	int current = d_object->getValue();
	cuPrintf("Device object value: %d\n", current);
	d_object->setValue(current + 1);
}

int main(void) {

	MyClass *host_object;
	MyClass *device_object;

	// Allow pinned host allocations to be mapped into the device address space.
	checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost));

	// Allocate the object in pinned, mapped (zero-copy) host memory.
	checkCuda(
			cudaHostAlloc((void**) &host_object, sizeof(MyClass),
					cudaHostAllocWriteCombined | cudaHostAllocMapped));

	// BUG FIX: the original did "host_object = new MyClass(1);" here, which
	// overwrote the pinned pointer with a plain heap pointer — leaking the
	// pinned allocation and handing cudaHostGetDevicePointer an unmapped
	// address. Initialize the pinned object in place instead.
	host_object->setValue(1);
	printf("Host object value: %d\n", host_object->getValue());

	// Obtain the device-side alias of the pinned allocation.
	checkCuda(cudaHostGetDevicePointer(&device_object, host_object, 0));

	cudaPrintfInit();

	// Single-thread launch; the kernel updates the object through zero-copy.
	device_method<<<1, 1>>>(device_object);
	// Catch launch-configuration errors, then wait for completion:
	// kernel launches are asynchronous, so without a sync the host may
	// read host_object before the device has written it.
	checkCuda(cudaGetLastError());
	checkCuda(cudaDeviceSynchronize());

	cudaPrintfDisplay();
	cudaPrintfEnd();

	printf("Host object value: %d (after gpu execution)\n",
			host_object->getValue());

	// Release the pinned allocation (the original leaked it).
	checkCuda(cudaFreeHost(host_object));

	return 0;
}

I have tried the following compilation steps and got this Undefined symbols error:

nvcc -c src/MyClass.cpp -o bin/MyClass.o
nvcc -g -O2 -dc -gencode arch=compute_20,code=sm_20 -I/usr/local/cuda/include src/passing_object_to_kernel.cu -o bin/passing_object_to_kernel.o
nvcc -I./src -L./bin/ bin/MyClass.o bin/passing_object_to_kernel.o -o PassingObjectToKernel
Undefined symbols for architecture i386:
  "MyClass::getValue()", referenced from:
      _main in passing_object_to_kernel.o
  "MyClass::MyClass(int)", referenced from:
      _main in passing_object_to_kernel.o
  "___cudaRegisterLinkedBinary_59_tmpxft_000040ee_00000000_6_passing_object_to_kernel_cpp1_ii_51e07f2f", referenced from:
      global constructors keyed to _Z8cuPrintfPKcin passing_object_to_kernel.o
ld: symbol(s) not found for architecture i386
collect2: ld returned 1 exit status

Thanks for your help!!

[1] https://github.com/millecker/applications/tree/master/CUDA/PinnedMemory/PassingObjectToKernel

...
checkCuda(cudaHostAlloc((void**) &host_object, sizeof(MyClass),cudaHostAllocWriteCombined | cudaHostAllocMapped));

host_object = new MyClass(1);
...

Why do You call the constructor after cudaHostAlloc, both on the same ‘host_object’ object? Is this supposed to be so?

MK

P.S.
Isn’t this zero-copy memory maybe? I can’t find any calls to ‘cudaMemcpy’ between host and device objects in Your code…

Hi,
yes you are right, I’m using zero-copy memory or pinned memory.

I thought it was best to allocate the memory for the object first and then initialize it.
You are right, I should remove executing the constructor!

Finally I got the example working.
The C++ MyClass must be defined inline within the .cu file.
There seems to be no way to include only the header and link MyClass.o in separately.

#include <assert.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "util/cuPrintf.cu"

// Minimal value holder callable from both host and device code; lives in
// pinned, device-mapped host memory in this example.
class MyClass {
public:
	int value;

	// Default-construct with value == 0.
	__device__ __host__ MyClass() : value(0) {
	}
	// Construct with an explicit initial value.
	__device__ __host__ MyClass(int v) : value(v) {
	}
	// Replace the stored value.
	__device__ __host__ void setValue(int v) {
		value = v;
	}
	// Read back the stored value.
	__device__ __host__ int getValue() {
		return value;
	}
	// Trivial destructor; nothing to release.
	__device__ __host__ ~MyClass() {
	}
};

// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
// Wrap any CUDA runtime API call to surface failures in debug builds.
// Returns the status unchanged so calls can stay inline; in release
// builds this compiles down to a pass-through.
inline cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
	if (cudaSuccess != result) {
		fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
		assert(result == cudaSuccess);
	}
#endif
	return result;
}

// Single-thread kernel: prints the object's current value via cuPrintf,
// then increments it in place through the zero-copy mapping.
__global__ void device_method(MyClass *d_object) {

	int current = d_object->getValue();
	cuPrintf("Device object value: %d\n", current);
	d_object->setValue(current + 1);
}

int main(void) {

	// Check that the device supports mapping host memory at all.
	cudaDeviceProp prop;
	int whichDevice;
	checkCuda(cudaGetDevice(&whichDevice));
	checkCuda(cudaGetDeviceProperties(&prop, whichDevice));
	if (prop.canMapHostMemory != 1) {
		printf("Device cannot map memory \n");
		return 0;
	}

	MyClass *host_object;
	MyClass *device_object;

	// Runtime must be placed into a state enabling zero-copy buffers,
	// before any other CUDA call creates the context.
	checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost));

	// Allocate the object in pinned, mapped (zero-copy) host memory.
	checkCuda(
			cudaHostAlloc((void**) &host_object, sizeof(MyClass),
					cudaHostAllocWriteCombined | cudaHostAllocMapped));

	// Initialize the pinned object in place (no separate heap object).
	host_object->setValue(1);
	printf("Host object value: %d\n", host_object->getValue());

	// Obtain the device-side alias of the pinned allocation.
	checkCuda(cudaHostGetDevicePointer(&device_object, host_object, 0));

	// Initialize cuPrintf; it returns a cudaError_t, so check it too.
	checkCuda(cudaPrintfInit());

	// Launch a kernel with a single thread.
	device_method<<<1, 1>>>(device_object);
	// Catch launch-configuration errors, then wait for completion:
	// the launch is asynchronous, so without a sync the host may read
	// host_object before the device has written the new value.
	checkCuda(cudaGetLastError());
	checkCuda(cudaDeviceSynchronize());

	// Display the device's output.
	cudaPrintfDisplay();
	// Clean up after cuPrintf.
	cudaPrintfEnd();

	printf("Host object value: %d (after gpu execution)\n",
			host_object->getValue());

	// Release the pinned allocation (the original leaked it).
	checkCuda(cudaFreeHost(host_object));

	return 0;
}

How can I create an atomic block or some transactions in CUDA?

I want to have the following method atomic.
Only one thread within a block should be able to execute it.

__device__ __host__ void setValue(int v) {
		value = v;
	}

There is an atomic exchange function, ‘atomicExch’. It may help but You should read more about it first. It can be tricky to use.

MK

Thanks, I know about the atomic operations of CUDA.
But I do not just want to exchange or increment a single variable.

I want a whole block or method to be atomic.
Is this possible? (something like synchronized in Java)

e.g.,

atomic {
...
}

There is no such function/operator/directive in CUDA C that can do this, but you can build a mutex lock — try this link. Be wary though: on some GPUs a wrongly written mutex lock can deadlock the device, resulting in the need for a hard reset.

MK