Hi,
I’m trying [1] to pass an object to the device by using pinned memory.
But I’m not able to compile and link these files together.
My source looks like the following:
MyClass.cpp
#include <cuda_runtime.h>
// Host/device value holder (definition as written in MyClass.cpp).
// NOTE(review): this file *re-defines* MyClass instead of providing
// out-of-line bodies (MyClass::MyClass(...) etc.) for the class declared in
// MyClass.h — and the two definitions disagree: the header omits the
// `int value` member and declares a *virtual* destructor. Any translation
// unit that sees both violates the One Definition Rule, and sizeof(MyClass)
// differs between them.
// NOTE(review): because this is a .cpp, nvcc presumably hands it to the host
// compiler as plain C++ (the __device__ qualifiers expand to nothing for
// host passes), so no device code is generated here — consistent with the
// "Undefined symbols" link errors reported below. Compile it as CUDA
// (rename to .cu or use `nvcc -x cu`) or define the class inline in the .cu.
class MyClass
{
public:
// Payload read and written from both host and device code.
int value;
// Default-construct with value 0.
__device__ __host__ MyClass() {
value = 0;
}
// Construct with an explicit initial value.
__device__ __host__ MyClass(int v) {
value = v;
}
// Overwrite the stored value.
__device__ __host__ void setValue(int v) {
value = v;
}
// Read the stored value.
__device__ __host__ int getValue() {
return value;
}
// Trivial destructor (non-virtual here, unlike the header's declaration).
__device__ __host__ ~MyClass() {
}
};
MyClass.h
#ifndef MYCLASS_H_
#define MYCLASS_H_
// Declaration of the host/device value holder.
// NOTE(review): this declaration does not match the definition in
// MyClass.cpp: it omits the `int value` data member and declares the
// destructor `virtual` while the .cpp one is non-virtual. Beyond the ODR
// violation, a virtual destructor gives the class a vtable, and the vtable
// pointer of an object constructed on the host is a host address — invoking
// virtual functions on such an object from device code is undefined
// behavior. For an object shared via mapped pinned memory, keep the class
// free of virtual functions.
class MyClass {
public:
__device__ __host__ MyClass();
__device__ __host__ MyClass(int v);
__device__ __host__ void setValue(int v);
__device__ __host__ int getValue();
__device__ __host__ virtual ~MyClass();
};
#endif /* MYCLASS_H_ */
passing_object_to_kernel.cu
// Reads the object's current value, prints it from the device via cuPrintf,
// and stores value + 1 back through the mapped pointer.
// Intended for a <<<1, 1>>> launch: with more than one thread, every thread
// would read and write the same object unsynchronized.
__global__ void device_method(MyClass *d_object) {
    int current = d_object->getValue();
    cuPrintf("Device object value: %d\n", current);
    d_object->setValue(current + 1);
}
// Demonstrates passing an object to a kernel through mapped (zero-copy)
// pinned memory: the pinned host allocation is directly visible to the
// device through its device-side alias, so no cudaMemcpy is needed.
int main(void) {
    MyClass *host_object;
    MyClass *device_object;

    // Must be set before allocating so zero-copy buffers can be mapped.
    checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost));

    // Allocate the object in pinned + mapped host memory.
    checkCuda(
        cudaHostAlloc((void**) &host_object, sizeof(MyClass),
                      cudaHostAllocWriteCombined | cudaHostAllocMapped));

    // BUG FIX: the original did `host_object = new MyClass(1);` here, which
    // leaked the pinned buffer and replaced it with an ordinary heap object
    // that is NOT mapped to the device. Initialize the pinned buffer in
    // place instead.
    host_object->setValue(1);
    printf("Host object value: %d\n", host_object->getValue());

    // Device-side alias of the same pinned buffer.
    checkCuda(cudaHostGetDevicePointer(&device_object, host_object, 0));

    cudaPrintfInit();
    device_method<<<1, 1>>>(device_object);
    // Launch-configuration errors only surface via cudaGetLastError().
    checkCuda(cudaGetLastError());
    cudaPrintfDisplay();
    cudaPrintfEnd();

    // BUG FIX: kernel launches are asynchronous — synchronize before the
    // host reads the value the kernel wrote into the mapped buffer.
    checkCuda(cudaDeviceSynchronize());
    printf("Host object value: %d (after gpu execution)\n",
           host_object->getValue());

    // BUG FIX: release the pinned allocation (was leaked before).
    checkCuda(cudaFreeHost(host_object));
    return 0;
}
I have tried the following compilation steps and got this Undefined symbols error:
nvcc -c src/MyClass.cpp -o bin/MyClass.o
nvcc -g -O2 -dc -gencode arch=compute_20,code=sm_20 -I/usr/local/cuda/include src/passing_object_to_kernel.cu -o bin/passing_object_to_kernel.o
nvcc -I./src -L./bin/ bin/MyClass.o bin/passing_object_to_kernel.o -o PassingObjectToKernel
Undefined symbols for architecture i386:
"MyClass::getValue()", referenced from:
_main in passing_object_to_kernel.o
"MyClass::MyClass(int)", referenced from:
_main in passing_object_to_kernel.o
"___cudaRegisterLinkedBinary_59_tmpxft_000040ee_00000000_6_passing_object_to_kernel_cpp1_ii_51e07f2f", referenced from:
global constructors keyed to _Z8cuPrintfPKcin passing_object_to_kernel.o
ld: symbol(s) not found for architecture i386
collect2: ld returned 1 exit status
Thanks for your help!!
[1] https://github.com/millecker/applications/tree/master/CUDA/PinnedMemory/PassingObjectToKernel
...
checkCuda(cudaHostAlloc((void**) &host_object, sizeof(MyClass),cudaHostAllocWriteCombined | cudaHostAllocMapped));
host_object = new MyClass(1);
...
Why do You call the constructor after cudaHostAlloc, both on the same ‘host_object’ object? Is this supposed to be so?
MK
P.S.
Isn’t this zero-copy memory maybe? I can’t find any calls to ‘cudaMemcpy’ between host and device objects in Your code…
Hi,
yes you are right, I’m using zero-copy memory or pinned memory.
I thought it was best to allocate the memory for the object first and then initialize it.
You are right, I should remove executing the constructor!
Finally I got the example working.
The C++ MyClass must be defined inline within the .cu file.
No way to include only the header and link the MyClass.o later.
#include <assert.h>
#include <stdio.h>

#include <cuda_runtime.h>

#include "util/cuPrintf.cu"
// Simple value holder callable from both host and device code. Defined
// entirely inside the .cu file so nvcc generates device versions of every
// member function alongside the host ones.
class MyClass {
public:
    int value;  // the wrapped payload

    // Default-construct with value 0.
    __device__ __host__ MyClass() : value(0) {}

    // Construct with an explicit initial value.
    __device__ __host__ MyClass(int v) : value(v) {}

    // Overwrite the stored value.
    __device__ __host__ void setValue(int v) { value = v; }

    // Read the stored value.
    __device__ __host__ int getValue() { return value; }

    // Trivial destructor.
    __device__ __host__ ~MyClass() {}
};
// Convenience wrapper for checking CUDA runtime API results; can be wrapped
// around any runtime API call. Passes the error code through unchanged so it
// composes with expressions. In release builds (neither DEBUG nor _DEBUG
// defined) the check compiles away entirely. In debug builds a failure
// prints the error string to stderr and aborts via assert() — which
// requires <assert.h> to be in scope.
inline cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
// Reads the object's value through the mapped (zero-copy) pointer, prints
// it from the device, and writes back value + 1.
// Expects a <<<1, 1>>> launch: multiple threads would race on the object.
__global__ void device_method(MyClass *d_object) {
    const int observed = d_object->getValue();
    cuPrintf("Device object value: %d\n", observed);
    d_object->setValue(observed + 1);
}
// Working zero-copy example: allocates MyClass in pinned + mapped memory,
// lets a single-thread kernel increment it through the device-side alias,
// and reads the result back on the host without any cudaMemcpy.
int main(void) {
    // Check that the device supports mapping host memory.
    cudaDeviceProp prop;
    int whichDevice;
    checkCuda(cudaGetDevice(&whichDevice));
    checkCuda(cudaGetDeviceProperties(&prop, whichDevice));
    if (prop.canMapHostMemory != 1) {
        printf("Device cannot map memory \n");
        return 0;
    }

    MyClass *host_object;
    MyClass *device_object;

    // Runtime must be placed into a state enabling zero-copy buffers,
    // before the allocation below.
    checkCuda(cudaSetDeviceFlags(cudaDeviceMapHost));

    // Allocate the object in pinned + mapped memory.
    // NOTE: write-combined memory makes host reads (the getValue calls
    // below) slow; drop cudaHostAllocWriteCombined if the host reads often.
    checkCuda(
        cudaHostAlloc((void**) &host_object, sizeof(MyClass),
                      cudaHostAllocWriteCombined | cudaHostAllocMapped));

    // Initialize the value in place (no constructor call — see thread).
    host_object->setValue(1);
    printf("Host object value: %d\n", host_object->getValue());

    // Device-side alias of the same pinned buffer.
    checkCuda(cudaHostGetDevicePointer(&device_object, host_object, 0));

    // Initialize cuPrintf and launch a kernel with a single thread.
    cudaPrintfInit();
    device_method<<<1, 1>>>(device_object);
    // BUG FIX: launches are asynchronous and report configuration errors
    // only through cudaGetLastError(); check it, then synchronize before
    // the host reads the value the kernel wrote into the mapped buffer.
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());

    // Display the device's output, then clean up after cuPrintf.
    cudaPrintfDisplay();
    cudaPrintfEnd();

    printf("Host object value: %d (after gpu execution)\n",
           host_object->getValue());

    // BUG FIX: release the pinned allocation (was leaked before).
    checkCuda(cudaFreeHost(host_object));
    return 0;
}
How can I create an atomic block or some transactions in CUDA?
I want to have the following method atomic.
Only one thread within a block should be able to execute it.
__device__ __host__ void setValue(int v) {
value = v;
}
There is an atomic exchange function, ‘atomicExch’. It may help but You should read more about it first. It can be tricky to use.
MK
Thanks, I know about the atomic operations of CUDA.
But I do not want to just exchange or increment a single variable.
I want a whole block or method to be atomic.
Is this possible? (something like synchronized in Java)
e.g.,
atomic {
...
}
There is no such function/operator/directive in CUDA C, but you can build a mutex lock — try this link. Be wary, though: on some GPUs a wrongly written mutex lock can deadlock the device, resulting in the need for a hard reset.
MK