I usually use CUDA/C.
but now, I am trying to use CUDA/C++.
Using CUDA/C++ is so hard to me…
because, there is no enough direction…
How to use class in CUDA/C++?
It means how to generate object using class on device memory?
Please, let me know.
this is my code. but there are a few errors.
#ifndef __GPU__attribute_handler_HPP__
#define __GPU__attribute_handler_HPP__
#include <cuda_runtime.h>
class gpu_attribute_handler {
size_t firstnode_id;
size_t lastnode_id;
char * attribute_ptr;
bool boundtype;
bool writefile;
__device__ gpu_attribute_handler() {
firstnode_id = 0;
__device__ ~gpu_attribute_handler() {
__device__ void set(int *temp, size_t SIZE) {
cudaMalloc((void**)&attribute_ptr, SIZE);
cudaMemcpy(attribute_ptr, temp, SIZE, cudaMemcpyHostToDevice);
__device__ void get_result(int *temp, size_t SIZE) {
cudaMemcpy(temp, attribute_ptr, SIZE, cudaMemcpyDeviceToHost);
template<typename valuetype>
__device__ valuetype get_value(size_t nodenumber) {
valuetype * value_ptr = (valuetype *)attribute_ptr;
return value_ptr[nodenumber - firstnode_id];
template<typename valuetype>
__device__ void set_value(size_t nodenumber, valuetype value) {
valuetype * value_ptr = (valuetype *)attribute_ptr;
value_ptr[nodenumber - firstnode_id] = value;
template<typename valuetype>
__device__ void add_value(size_t nodenumber, valuetype value) {
valuetype * value_ptr = (valuetype *)attribute_ptr;
value_ptr[nodenumber - firstnode_id] += value;
__device__ void set_boundtype(bool _boundtype) {
boundtype = _boundtype;
__device__ void set_writefile(bool _writefile) {
writefile = _writefile;
#include "cuda_runtime.h"
#include "GPU_attribute_handler.cuh"
#include <stdio.h>
#include <stdlib.h>
#pragma warning(disable : 4996)
#define SIZE 5
__global__ void addKernel(gpu_attribute_handler a, gpu_attribute_handler b, gpu_attribute_handler c) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < SIZE) {
int sum = a.get_value<float>(tid) + b.get_value<float>(tid);
c.set_value<float>(tid, sum);
class test {
gpu_attribute_handler dev_a, dev_b, dev_c;
cudaError_t err = cudaSuccess;
int *a, *b, *c;
test() {
~test() {
void allocate() {
a = (int*)malloc(SIZE * sizeof(int));
b = (int*)malloc(SIZE * sizeof(int));
c = (int*)malloc(SIZE * sizeof(int));
void initialization() {
for (int i = 0; i < SIZE; i++) {
a[i] = i + 1;
b[i] = i + 1;
void copyData() {
dev_a.set(a, SIZE);
dev_b.set(b, SIZE);
dev_c.set(c, SIZE);
void operation() {
addKernel << <256, 256 >> > (dev_a, dev_b, dev_c);
void syncToHost() {
dev_c.get_result(c, SIZE);
void printValues() {
for (int i = 0; i < SIZE; i++) {
printf("%d = %d\n", i+1, c[i]);
int main() {
test go;
These are error messages.
calling a host function(“cudaMalloc”) from a device function(“gpu_attribute_handler::set”) is not allowed /* line 25 /
identifier “cudaMalloc” is undefined in device code / line 25 */
Same as line 26 and line 30 (cudaMalloc and cudaMemcpy)