Hello Community.
I was searching a solution to the following problem and I found something in the forum but still I cannot make it work (The Official NVIDIA Forums | NVIDIA ).
So the idea is to have an object for each execution unit that does all the calculations and store the results.
The objects may be also different each other.
The fact is that I don’t allocate correctly the arrays of my structure but from the other posts in the forum I don’t get how should I do that.
Could somebody go trough my code here and tell me how to fix it, please?
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
#include <curand_kernel.h>
using namespace std;
struct test{
float *d1_vals; //an 1D array
float **d2_vals; //an 2D array
int rows;
int columns;
void init(int x, int y); //init assign x to rows, y to columns, allocate d2_vals and d1_vals on the HOST
__device__ void foo(); // device function to fill the arrays
void print_r(); //a print function
};
//allocate the arrays of the structure
void test::init(int x, int y)
{
rows=x;
columns=y;
d1_vals=new float[rows];
d2_vals=new float*[rows];
for(int i=0;i<rows;i++)
d2_vals[i]=new float[columns];
}
//print the arrays
void test::print_r()
{
for(int j=0;j<rows;++j)
{
cout<<"d1"<<d1_vals[j]<<endl;
for(int k=0;j<columns;++k)
cout<<"d2"<<d2_vals[j][k]<<endl;
}
}
//foo simply fill d2_vals with numbers position_x*position_y and d1_vals with position_x, it have to be called after init
__device__ void test::foo()
{
for(int j=0;j<rows;++j)
{
d1_vals[j]=j;
for(int k=0;j<columns;++k)
d2_vals[j][k]=j*k;
}
}
__global__ void fillvals(struct test *a)
{
const int id=blockIdx.x;
a[id].foo();
}
int main()
{
int num_obj=2, num_rows=2, num_col=3;
struct test *somestruct = (struct test*)malloc(num_obj*sizeof(struct test));
for(int h=0;h<num_obj;h++)
somestruct[h].init(num_rows,num_col);
struct test *to_device;
cudaMalloc ( (void **)&to_device, num_obj*sizeof(struct test));
cout<<"before the kernel"<<endl;
for(int k=0;k<num_obj;k++)
somestruct[k].print_r();
cudaMemcpy(to_device,somestruct,num_obj*sizeof(struct test),cudaMemcpyHostToDevice);
fillvals<<<num_obj,1>>>(somestruct);
cudaMemcpy(somestruct,to_device,num_obj*sizeof(struct test),cudaMemcpyDeviceToHost);
cout<<"after the kernel"<<endl;
for(int k=0;k<num_obj;k++)
somestruct[k].print_r();
}