I realised that I don’t need dev_inputs vector to be in device memory, but just the inputs themselves.
The method will be:
extern "C" void** InputsToDevice(void** host_inputs, unsigned* host_inputSizes, VectorType* host_types, unsigned numberInputs)
{
size_t size = numberInputs * sizeof(void*);
void** dev_inputs;
dev_inputs = new void*[numberInputs];
for (unsigned i=0; i < numberInputs; i++){
if (host_types[i] == FLOAT){
size = host_inputSizes[i] * sizeof(float);
} else {
size = (((host_inputSizes[i] - 1)/ BITS_PER_UNSIGNED) + 1) * sizeof(unsigned);
}
cudaMalloc((void**)&(dev_inputs[i]), size);
cudaMemcpy(dev_inputs[i], host_inputs[i], size, cudaMemcpyHostToDevice);
}
checkCUDAError("Inputs To Device");
return dev_inputs;
}
But I got another method that copies data to the device and I got the same (or very similar) problem there.
extern "C" struct_Layer* LayerHostToDevice(struct_Layer* h_layer, VectorType inputType, VectorType outputType){
struct_Layer* d_layer;
cudaMalloc((void**)&d_layer, sizeof(struct_Layer));
size_t size = sizeof(unsigned);
cudaMemcpy(&(d_layer->numberInputLayers), &(h_layer->numberInputLayers), size, cudaMemcpyHostToDevice);
cudaMemcpy(&(d_layer->totalWeighsPerOutput), &(h_layer->totalWeighsPerOutput), size, cudaMemcpyHostToDevice);
cudaMemcpy(&(d_layer->outputSize), &(h_layer->outputSize), size, cudaMemcpyHostToDevice);
cudaMemcpy(&(d_layer->functionType), &(h_layer->functionType), sizeof(FunctionType), cudaMemcpyHostToDevice);
size = h_layer->numberInputLayers * sizeof(unsigned);
cudaMalloc((void**)&(d_layer->inputLayerSize), size); //Now crashes here!!! I think the reason is the same :(
cudaMemcpy(d_layer->inputLayerSize, h_layer->inputLayerSize, size, cudaMemcpyHostToDevice);
size = h_layer->numberInputLayers * sizeof(void*);
cudaMalloc((void**)d_layer->inputNeurons, size);
if (outputType == FLOAT){
size = sizeof(float) * h_layer->outputSize * h_layer->totalWeighsPerOutput;
} else {
size = sizeof(unsigned char) * h_layer->outputSize * h_layer->totalWeighsPerOutput;
}
cudaMalloc((void**)&(d_layer->weighs), size);
cudaMemcpy(d_layer->weighs, h_layer->weighs, size, cudaMemcpyHostToDevice);
if (outputType == FLOAT){
size = sizeof(float) * h_layer->outputSize;
} else {
size = sizeof(unsigned) * (((h_layer->outputSize - 1)/ BITS_PER_UNSIGNED) + 1);
}
cudaMalloc((void**)&(d_layer->outputNeurons), size);
cudaMemcpy(d_layer->outputNeurons, h_layer->outputNeurons, size, cudaMemcpyHostToDevice);
size = h_layer->outputSize * sizeof(float);
cudaMalloc((void**)&(d_layer->thresholds), size);
cudaMemcpy(d_layer->thresholds, h_layer->thresholds, size, cudaMemcpyHostToDevice);
checkCUDAError("Layer Host To Device");
return d_layer;
}
Any help will be great, but I think I’m going to start a new topic since the title of this one is not appropriate anymore.
Thanks for the help, everybody.