Hi, I want to allocate a struct with some fields that are vectors in device memory but I’m not sure how to do that.
The declaration of the struct:
typedef struct {
unsigned numberInputLayers;
unsigned* inputLayerSize;
unsigned totalWeighsPerOutput;
void** inputNeurons;
unsigned outputSize;
void* outputNeurons;
float* thresholds;
void* weighs;
FunctionType functionType;
} struct_Layer;
And the crashing method is:
extern "C" struct_Layer* LayerHostToDevice(struct_Layer* h_layer, VectorType inputType, VectorType outputType){
struct_Layer* d_layer;
cudaMalloc((void**)&d_layer, sizeof(struct_Layer));
size_t size = sizeof(unsigned);
cudaMemcpy(&(d_layer->numberInputLayers), &(h_layer->numberInputLayers), size, cudaMemcpyHostToDevice);
cudaMemcpy(&(d_layer->totalWeighsPerOutput), &(h_layer->totalWeighsPerOutput), size, cudaMemcpyHostToDevice);
cudaMemcpy(&(d_layer->outputSize), &(h_layer->outputSize), size, cudaMemcpyHostToDevice);
cudaMemcpy(&(d_layer->functionType), &(h_layer->functionType), sizeof(FunctionType), cudaMemcpyHostToDevice);
size = h_layer->numberInputLayers * sizeof(unsigned);
cudaMalloc((void**)&(d_layer->inputLayerSize), size); ////////////CRASHES HERE//////////////////
cudaMemcpy(d_layer->inputLayerSize, h_layer->inputLayerSize, size, cudaMemcpyHostToDevice);
size = h_layer->numberInputLayers * sizeof(void*);
cudaMalloc((void**)&(d_layer->inputNeurons), size);
if (outputType == FLOAT){
size = sizeof(float) * h_layer->outputSize * h_layer->totalWeighsPerOutput;
} else {
size = sizeof(unsigned char) * h_layer->outputSize * h_layer->totalWeighsPerOutput;
}
cudaMalloc((void**)&(d_layer->weighs), size);
cudaMemcpy(d_layer->weighs, h_layer->weighs, size, cudaMemcpyHostToDevice);
if (outputType == FLOAT){
size = sizeof(float) * h_layer->outputSize;
} else {
size = sizeof(unsigned) * (((h_layer->outputSize - 1)/ BITS_PER_UNSIGNED) + 1);
}
cudaMalloc((void**)&(d_layer->outputNeurons), size);
cudaMemcpy(d_layer->outputNeurons, h_layer->outputNeurons, size, cudaMemcpyHostToDevice);
size = h_layer->outputSize * sizeof(float);
cudaMalloc((void**)&(d_layer->thresholds), size);
cudaMemcpy(d_layer->thresholds, h_layer->thresholds, size, cudaMemcpyHostToDevice);
checkCUDAError("Layer Host To Device");
return d_layer;
}
I was told I cannot do that here
But what I’m thinking is maybe I don’t have to allocate the struct in device memory in the first place.
cudaMalloc((void**)&d_layer, sizeof(struct_Layer));
The kernel is declared like this:
global void LayerCalculationKernel(struct_Layer* layer)
but can I do the following?
declare the kernel like this:
global void LayerCalculationKernel(struct_Layer layer)
and calling it like this:
//These two lines are going to be in different places, but I write it like this just as an example
struct_Layer* dev_layer = LayerHostToDevice(h_layer, inputType, outputType);
LayerCalculationKernel(*dev_layer);
and allocate in device memory just the vectors of the struct, while the rest of it is in host memory?
If not, how can I do this kind of allocations?
Any suggestion will be great