Normally I use new/delete to allocate heap memory in device code for some variables, but it's too slow. After reading this blog (Using Shared Memory in CUDA C/C++ | NVIDIA Technical Blog) I saw that shared memory can be around 100x faster, and Fermi (microarchitecture) - Wikipedia says shared memory/L1 cache takes 10-20 cycles to access while global memory takes 600-800 cycles. So I think the main reason the function takes so much time is the heap allocation with new/delete.
Currently I'm using:
__device__ void GetNetworkOutput(float* rollingdata, Network* net, int* output_ids, uint8_t* result_id) {
    // Per-call scratch buffers on the device heap: this is the slow part
    float* outputs   = new float[net->num_neurons];
    float* values    = new float[net->num_neurons];
    bool*  completed = new bool[net->num_connections];
    if (outputs == nullptr || values == nullptr || completed == nullptr) {
        printf("Memory allocation failed\n");
        return;
    }

    // Input neurons (type 0) take the rolling data; everything else starts at zero
    for (int i = 0; i < net->num_neurons; ++i) {
        if (net->Neurons[i].type == 0) {
            values[i]  = rollingdata[i];
            outputs[i] = rollingdata[i];
        } else {
            values[i]  = 0.0f;
            outputs[i] = 0.0f;
        }
    }

    // First pass: propagate every connection that starts at an input neuron
    bool finished = false;
    for (int i = 0; i < net->num_connections; ++i) {
        if (net->Neurons[net->Connections[i].from].type == 0) {
            values[net->Connections[i].to] += net->Connections[i].weight * values[net->Connections[i].from];
            completed[i] = true;
        } else {
            completed[i] = false;
        }
    }

    // Keep sweeping the connection list until every connection has fired
    while (!finished) {
        finished = true;
        for (int i = 0; i < net->num_connections; ++i) {
            if (!completed[i]) {
                finished = false;
                if (values[net->Connections[i].from] != 0.0f) {
                    // Only fire once every incoming connection of the source neuron is ready
                    bool inputs_ready = true;
                    for (int j = 0; j < net->Neurons[net->Connections[i].from].connected_num; ++j) {
                        if (outputs[net->Neurons[net->Connections[i].from].incoming_connections[j]] == 0.0f) {
                            inputs_ready = false;
                        }
                    }
                    if (inputs_ready) {
                        if (outputs[net->Connections[i].from] == 0.0f) {
                            outputs[net->Connections[i].from] = activationFunction(values[net->Connections[i].from] + net->Neurons[net->Connections[i].from].bias);
                        }
                        values[net->Connections[i].to] += net->Connections[i].weight * values[net->Connections[i].from];
                        completed[i] = true;
                    }
                }
            }
        }
    }

    // Apply the activation to the output neurons (type 2)
    for (int i = 0; i < net->num_neurons; ++i) {
        if (net->Neurons[i].type == 2) {
            outputs[i] = activationFunction(values[i] + net->Neurons[i].bias);
        }
    }

    // Pick the output neuron with the largest activation
    float biggest = -1500.0f;
    for (uint8_t index = 0; index < 3; ++index) {
        int val = output_ids[index];
        *result_id = (outputs[val] > biggest) ? index : *result_id;
        biggest    = (outputs[val] > biggest) ? outputs[val] : biggest;
    }

    delete[] completed;
    delete[] values;
    delete[] outputs;
}
__global__ void EvaluateNetworks(float* __restrict__ rollingdata, Network* __restrict__ d_networks, int input_num, uint8_t stepsPerThread, int datasize, TradesForPopulation* tradeee, int pop_idx) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    Network* net = &d_networks[pop_idx];
    Trades* trade = &tradeee->jaja[pop_idx];
    if (net == nullptr || net->output_ids == nullptr || net->Connections == nullptr) {
        printf("Error: Null pointer in network\n");
        return;
    }
    if (trade == nullptr) {
        printf("Error: Null pointer in trades - %i\n", pop_idx);
        return;
    }

    // Each thread evaluates stepsPerThread consecutive time steps
    for (int i = idx * stepsPerThread; i < (idx + 1) * stepsPerThread; ++i) {
        if (i >= datasize) return;
        int start  = i * input_num;
        int finish = (i + 1) * input_num;

        // Per-step copy of the input window, again through the device heap
        float* data_chnk = new float[input_num];
        if (data_chnk == nullptr) {
            printf("Memory allocation failed for data_chnk\n");
            return;
        }
        for (int j = start; j < finish; ++j) {
            data_chnk[j - start] = rollingdata[j];
        }

        uint8_t result_id = 5;
        GetNetworkOutput(data_chnk, net, net->output_ids, &result_id);
        trade->traades[i] = result_id;
        delete[] data_chnk;
    }
}
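Side note on data_chnk: since it is just a contiguous window into rollingdata, and GetNetworkOutput only reads from it, the per-step allocation and copy could probably be dropped entirely. A sketch of the loop body with that change:

    // data_chnk covers rollingdata[start .. finish), so a pointer offset
    // replaces the allocation and the copy loop
    float* data_chnk = rollingdata + start;
    uint8_t result_id = 5;
    GetNetworkOutput(data_chnk, net, net->output_ids, &result_id);
    trade->traades[i] = result_id;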
I tried to replace new with extern __shared__, but it gave me an "an illegal memory access was encountered" error. I looked into the documentation, and 1. Introduction — CUDA C++ Programming Guide says you have to specify the required shared memory size at the kernel launch via <<<blocks, threadsperblock, requiredmemsize>>>. The problem is that I'm launching the kernel in a loop, and I would need to copy memory from device to host to compute that size, which slows the whole process down. Is there any way to use extern __shared__ instead of heap memory without specifying the required shared memory size at the kernel call? I want to use shared memory for the data_chnk, values, outputs and completed variables.
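For reference, this is roughly the shape of the extern __shared__ version I was attempting: a single dynamically sized buffer partitioned by hand (only one extern __shared__ array is allowed per kernel), with each thread carving out its own slice, since shared memory is shared by the whole block. The layout and offsets here are my own sketch, not tested code:

    // Only valid if the launch supplies enough bytes via the third <<<>>>
    // parameter; launching without it is what produces the
    // illegal-memory-access error.
    extern __shared__ unsigned char smem[];

    __device__ void GetNetworkOutputShared(float* rollingdata, Network* net, int* output_ids, uint8_t* result_id) {
        int n = net->num_neurons;
        int c = net->num_connections;
        float* base = reinterpret_cast<float*>(smem);
        // Per-thread slices: float arrays first (for alignment), bools after
        float* outputs   = base + threadIdx.x * n;
        float* values    = base + blockDim.x * n + threadIdx.x * n;
        bool*  completed = reinterpret_cast<bool*>(base + 2 * blockDim.x * n)
                         + threadIdx.x * c;
        // (data_chnk would need one more slice of input_num floats per thread)
        // ... then the same body as GetNetworkOutput, minus new/delete ...
    }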
Here's the loop where I call the kernel:
for (int i = 0; i < population_size; ++i) {
    EvaluateNetworks<<<evaulation_blocks, threadsPerBlock>>>(d_rolling_data, d_networks, input_num, stepsPerThread, (sizeff - ws), d_traades_for_population, i);
}
gpuErrchk(cudaDeviceSynchronize());
To specify the required shared memory size, I would have to copy d_networks[i].num_connections and d_networks[i].num_neurons to the host, calculate the required size, and do that before every single call.
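One workaround I'm considering: copy each network's sizes to the host once, before the loop, and then pass the byte count as the third launch parameter. A rough sketch (h_neurons, h_connections and smem are my own names; the layout matches the partitioning above, two float arrays and one bool array per thread, plus the input window):

    #include <vector>

    // Fetch the per-network sizes once, up front, instead of before every launch
    std::vector<int> h_neurons(population_size), h_connections(population_size);
    for (int i = 0; i < population_size; ++i) {
        cudaMemcpy(&h_neurons[i], &d_networks[i].num_neurons, sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&h_connections[i], &d_networks[i].num_connections, sizeof(int), cudaMemcpyDeviceToHost);
    }

    for (int i = 0; i < population_size; ++i) {
        size_t smem = threadsPerBlock * (2 * h_neurons[i] * sizeof(float)    // outputs + values
                                         + h_connections[i] * sizeof(bool)  // completed
                                         + input_num * sizeof(float));      // data_chnk
        EvaluateNetworks<<<evaulation_blocks, threadsPerBlock, smem>>>(d_rolling_data, d_networks, input_num, stepsPerThread, (sizeff - ws), d_traades_for_population, i);
    }
    gpuErrchk(cudaDeviceSynchronize());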
Also, one more question: do you really think this will speed the function up by something like 50x-100x?