Hi all, I’m struggling with the following problem: I need to initialize a class on the GPU and fill it with data using a kernel, in order to do it in parallel. The only documentation I was able to find explains how to initialize the class and fill it with data on the host, moving it to the device at the end, which is not what I need.

I show here a simplified example of the code:

```
#include <iostream>
// Fixed-capacity dictionary mapping a float key to a float array.
// All entry slots and their value buffers are allocated up front in the
// constructor, so add_entry/get_value never allocate.
class Dict {
private:
// One slot: a key plus an owned buffer of value_size floats.
struct KeyValuePair {
float key;
float *value;      // owned, allocated in the ctor, freed in the dtor
int value_size;    // capacity of `value`
};
KeyValuePair *data;    // owned array of max_size slots
int max_size;
public:
// Pre-allocates max_dict_size slots, each holding max_array_size floats.
Dict(int max_dict_size, int max_array_size) : max_size(max_dict_size) {
data = new KeyValuePair[max_size];
for (int i = 0; i < max_size; i++) {
data[i].value = new float[max_array_size];
// BUG FIX: was `max_size`. The value buffer holds max_array_size
// elements, so recording max_size here causes a heap buffer overrun
// in add_entry/get_value whenever max_dict_size > max_array_size.
data[i].value_size = max_array_size; }}
// Rule of Three: this class owns raw pointers and has a user-written
// destructor, so the implicit copy operations would double-free.
// Forbid copying explicitly (no caller in this file copies a Dict).
Dict(const Dict&) = delete;
Dict& operator=(const Dict&) = delete;
~Dict() { for (int i = 0; i < max_size; i++) {
delete[] data[i].value;}
delete[] data;
}
// Stores `key` and copies value_size floats from `value` into slot
// `position`. ROBUSTNESS FIX: reject every out-of-range position
// (negative or >= max_size), not only position == max_size.
void add_entry(float key, float value[], int position) {
if (position < 0 || position >= max_size){return;}
data[position].key = key;
int value_size = data[position].value_size;
for (int i = 0; i < value_size; i++){data[position].value[i] = value[i];} }
// Linear search for `key`; copies the matching slot's buffer into `value`
// (caller must supply at least value_size floats of space).
// NOTE(review): exact float == comparison — works here because keys are
// stored and looked up from the same literals, but fragile in general.
void get_value(float key, float value[]) {
for (int i = 0; i < max_size; i++) {
if (data[i].key == key){
for (int j = 0; j < data[i].value_size; j++) {value[j] = data[i].value[j];}
}}}
};
// Returns the larger of two floats (host-side stand-in for a device helper).
float cuda_max(float i, float j){
return (i > j) ? i : j; }
// Returns the smaller of two floats (host-side stand-in for a device helper).
float cuda_min(float i, float j){
return (i > j) ? j : i; }
int main() {
// Problem sizes: 6 withdrawal rates + 5 injection rates = 11 filling levels.
// constexpr so the arrays below are standard fixed-size arrays —
// the original used runtime ints, making them non-standard VLAs in C++.
constexpr int wthdrt_sz = 6;
constexpr int njctn_sz = 5;
constexpr int cnjtn_sz = wthdrt_sz + njctn_sz;

Dict my_dict(cnjtn_sz, cnjtn_sz);

float fllng_lvl[cnjtn_sz] = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f,9.f,10.f};
const float max_vol = 10.f;
const float min_vol = 0.f;
float wthdrt[wthdrt_sz] = {5.f,4.f,3.f,2.f,1.f,0.f};
float njctn[njctn_sz] = {0.f,1.f,2.f,3.f,4.f};

// One dictionary entry per filling level (0..10, i.e. cnjtn_sz iterations —
// same count as the original's `idx < max_vol+1` float comparison).
for (int fllng_lvl_idx = 0; fllng_lvl_idx < cnjtn_sz; fllng_lvl_idx++) {
const float fllng_lvl_slc = fllng_lvl[fllng_lvl_idx];

// Reachable levels: withdrawals clamped at min_vol, injections at max_vol.
float rl[cnjtn_sz];
for (int i = 0; i < cnjtn_sz; i++) {
if (i < wthdrt_sz) { rl[i] = cuda_max(min_vol, fllng_lvl_slc - wthdrt[i]); }
else               { rl[i] = cuda_min(max_vol, fllng_lvl_slc + njctn[i - wthdrt_sz]); }
}

// In-place duplicate removal: on a match, shift the tail left by one.
int temp = cnjtn_sz;
int count_db = 0;
for (int i = 0; i < temp; i++) {
for (int j = i + 1; j < temp; j++) {
if (rl[i] == rl[j]) {
// BUG FIX: the original looped k < temp and read rl[k+1], reading
// rl[temp] — one past the live range and out of the array entirely
// on the first removal (temp == cnjtn_sz). Stop at temp-1; the last
// live slot is overwritten by the -1 sentinel fill below anyway.
for (int k = j; k < temp - 1; k++) { rl[k] = rl[k + 1]; }
count_db++; j--; temp--;
}
}
}
// Pad the removed slots with a -1 sentinel so readers can skip them.
for (int i = cnjtn_sz - count_db; i < cnjtn_sz; i++) rl[i] = -1.f;

my_dict.add_entry(fllng_lvl_slc, rl, fllng_lvl_idx);
}

/* This part displays the result and is not important. */
float result[cnjtn_sz];
for (int i = 0; i < cnjtn_sz; i++) {
my_dict.get_value(fllng_lvl[i], result);
std::cout << fllng_lvl[i] << std::endl;
// Inner index renamed to j — the original shadowed the outer `i`.
for (int j = 0; j < cnjtn_sz; j++) { if (result[j] > -1) { std::cout << result[j] << " "; } }
std::cout << "\n" << std::endl;
}
return 0;
}
```

The previous code performs everything on the host, while I want to construct the Dict class on the device and then fill it with the resulting computation in parallel using a CUDA kernel.

For reference I have consulted the following material:

CUDA : How to allocate memory for data member of a class - Stack Overflow

How to pass a C++ class with array of pointers to CUDA? - Stack Overflow

c - Memory allocation on GPU for dynamic array of structs - Stack Overflow

Additional question: Is it a good idea? Or are there reasons to always create the class on the host and move it to the device after it has been filled?

Thank you for your support.