I want to create around 1000 objects in host, then copy them to device, and create 1000 threads to run the 1000 object functions in parallel. How can I do this? I am trying the following code. It can be compiled, but when I copy the results from device to host, there is an error: cudaErrorIllegalAddress
#include <cuda_runtime.h>
#include <iostream>
using namespace std;
class AA{
double* a_d;
double* a_h;
unsigned int mem_size;
int n;
public:
AA(double* a_h_,int n_){
a_h=a_h_;
n=n_;
mem_size = sizeof(double) * n;
}
void InitializeOnGPU(){
cudaError_t error;
error = cudaMalloc((void **) &a_d, mem_size);
error = cudaMemcpy(a_d, a_h, mem_size, cudaMemcpyHostToDevice);
//cudaError_t error2;
//error2 = cudaMemcpy(a_h, a_d, mem_size, cudaMemcpyDeviceToHost);
}
__device__ void RunOnGPU(){
for (int ic=0;ic<n;ic++){
a_d[ic]=a_d[ic]*2.0;
}
}
void CopySolutionFromGPU2CPU(){
cudaError_t error;
error = cudaMemcpy(a_h, a_d, mem_size, cudaMemcpyDeviceToHost);
}
void PrintSolutions(){
for(int ic=0;ic<n;ic++){
cout<<a_h[ic]<<',';
}
cout<<endl;
}
};
__global__ void RunObjFun(AA** AAs_d){
int bx = blockIdx.x;
int tx = threadIdx.x;
int Id_t=blockDim.x*bx+tx;
AAs_d[Id_t]->RunOnGPU();
}
int main(){
int n=3;
double* a_h=new double[n];
a_h[0]=1;a_h[1]=2;a_h[2]=3;
int ncase=10;
AA** AAs_h=new AA*[ncase];
for (int ic=0;ic<ncase;ic++){
AAs_h[ic]=new AA(a_h,n);
AAs_h[ic]->InitializeOnGPU();
}
AA** AAs_d;
unsigned int mem_size_As = sizeof(AA)*ncase;
cudaError_t error;
error = cudaMalloc((void **) &AAs_d, mem_size_As);
error = cudaMemcpy(AAs_d, AAs_h, mem_size_As, cudaMemcpyHostToDevice);
RunObjFun<<<2,5>>>(AAs_d);
for (int ic=0;ic<ncase;ic++){
AAs_h[ic]->CopySolutionFromGPU2CPU();
AAs_h[ic]->PrintSolutions();
}
return 0;
}