Hello, I’m new to CUDA programming and I’m trying to figure out how to copy a complex object from the device to the host.
I have an object CUDAPopulation that needs to do some computation on the device.
CUDAPopulation.h:
class CUDAPopulation {
public:
...
CUDAGenome **individuals;
CUDAPopulation(unsigned int popSize, unsigned int genNum, Objective obj = MAXIMIZE);
...
private:
// Performs fitness-proportionate selection to get an individual from the population.
__device__ CUDAGenome *select();
// Scales the individuals' scores to fitnesses.
__device__ void scale();
// Implements ascending odd-even transposition sort on the individuals of the population.
__device__ void sort();
};
// Perform an avaluation on the elements of the given population.
__global__ void evaluate(CUDAPopulation *pop);
// Perform an evolution step (selection, crossover, mutation, replacement) on the given population.
__global__ void step(CUDAPopulation *pop);
I allocate the object on the device like so:
...
// Create the population object on the host.
CUDAPopulation *population = new CUDAPopulation(popSize, genNumber, CUDAPopulation::MINIMIZE);
// Allocate the population on the device.
CUDAPopulation *d_pop;
cudaMalloc(&d_pop, sizeof(CUDAPopulation));
cudaMemcpy(d_pop, population, sizeof(CUDAPopulation), cudaMemcpyHostToDevice);
...
Then I dinamically allocate individuals calling a kernel that helps me implement polymorphism in CUDA code and run the computation:
dim3 gridSize(popSize);
dim3 blockSize(checksNumber);
createCUDAPathGenomePopulation<<<gridSize, 1>>>(d_pop, d_checks, checksNumber);
for (unsigned int i = 0; i < genNumber; i++) {
evaluate<<<gridSize, blockSize>>>(d_pop);
cudaDeviceSynchronize();
step<<<gridSize, blockSize>>>(d_pop);
cudaDeviceSynchronize();
}
createCUDAPathGenomePopulation just initializes the individuals’ field in CUDAPopulation with a subclass of CUDAGenome called CUDAPathGenome.
Everything is ok so far, the problem comes when I try to copy the CUDAPathGenomes to the host after the computation to check the results:
...
// Copy the population object.
CUDAPopulation *pop = (CUDAPopulation *) malloc(sizeof(CUDAPopulation));
cudaMemcpy(pop, d_pop, sizeof(CUDAPopulation), cudaMemcpyDeviceToHost);
cudaCheckError();
// Copy the address of the populations' individuals.
CUDAGenome **tmpIndividuals = (CUDAGenome **) malloc(popSize * sizeof(CUDAGenome *));
cudaMemcpy(tmpIndividuals, pop->individuals, popSize * sizeof(CUDAGenome *), cudaMemcpyDeviceToHost);
cudaCheckError();
// Copy the population's individuals.
CUDAPathGenome **individuals = (CUDAPathGenome **) malloc(popSize * sizeof(CUDAPathGenome *));
for (unsigned int i = 0; i < popSize; i++) {
individuals[i] = (CUDAPathGenome *) malloc(sizeof(CUDAPathGenome));
cudaMemcpy(individuals[i], tmpIndividuals[i], sizeof(CUDAPathGenome), cudaMemcpyDeviceToHost);
cudaCheckError();
}
...
CUDAPathGenome internally has some dynamically allocated arrays that get copied later.
When I call “cudaMemcpy(individuals[i], tmpIndividuals[i], sizeof(CUDAPathGenome), cudaMemcpyDeviceToHost);”, on line 16, I get an “invalid argument” error and the program stops (that’s what cudaCheckError() does).
I can’t figure out why I get the error and what I am doing wrong.
I’m using CUDA 9.0 on a 920m GPU (3.5 compute capability).
Thanks