Hi, I’m trying to use shared memory broadcasting in a kernel, but I think it is not working properly.
The kernel is this:
// Problem sizes and buffers passed to the kernel below (informal declarations).
input_size = 2048
output_size = 2048
float inputs[input_size]
float results[output_size]
// One weight per (input, output) pair.
float weighs[input_size * output_size]
/**
 * Adds to each output neuron the weighted sum of all inputs:
 *
 *     results[o] += sum over i of inputs[i] * weighs[i * output_size + o]
 *
 * Launch layout: 1D grid of 1D blocks, one thread per output neuron, so
 * gridDim.x * blockDim.x must be >= output_size. Surplus threads still help
 * stage the inputs and then do nothing.
 * Dynamic shared memory: input_size * sizeof(float) bytes (third launch
 * argument).
 *
 * @param inputs      input values, input_size floats
 * @param input_size  number of inputs
 * @param output_size number of output neurons
 * @param weighs      weights, indexed as weighs[i * output_size + o]
 * @param results     accumulated in place, output_size floats
 */
__global__
void SumFloatsConnectionsKernel(float* inputs, unsigned input_size, unsigned output_size, float* weighs, float* results)
{
    extern __shared__ float sdata[];

    // Stage the whole input vector in shared memory. Each thread starts at its
    // own threadIdx.x and advances by blockDim.x, so the block covers every
    // element whatever the block size is. The increment must live inside the
    // loop, otherwise a thread never leaves it.
    for (unsigned pos = threadIdx.x; pos < input_size; pos += blockDim.x) {
        sdata[pos] = inputs[pos];
    }

    // Every thread of the block reaches this barrier (it is outside the bounds
    // check below), so all of sdata is written before anyone reads it.
    __syncthreads();

    unsigned outputNeuron = blockIdx.x * blockDim.x + threadIdx.x;
    if (outputNeuron < output_size) {
        float result = 0.0f;
        for (unsigned i = 0; i < input_size; i++) {
            // All threads read the same sdata[i] in a given iteration, while
            // adjacent threads read adjacent elements of weighs.
            result += sdata[i] * weighs[(i * output_size) + outputNeuron];
        }
        results[outputNeuron] += result;
    }
}
First, the inputs are loaded into shared memory. Then, some calculations are made. I expected sdata[i] to be broadcast to every thread in the block, but it seems that it isn’t.
The reason why I think that (maybe it’s not a good reason) is that this other code runs faster, even though its kernel uses just 4 bytes of shared memory:
//...
// Each launch requests room for a single float of dynamic shared memory.
shared_mem_size = sizeof(float);
// One kernel launch per input element; the loop index i is passed as the
// kernel's input_id argument.
for (unsigned i=0; i < input_size; i++) {
SumFloatsConnectionsKernel3<<< grid_size, block_size, shared_mem_size >>>((float*)inputPtr, input_size, i, output_size, (float*)weighs, results);
}
//...
/**
 * Adds the contribution of a single input to every output neuron:
 *
 *     results[o] += inputs[input_id] * weighs[o * input_size + input_id]
 *
 * Launch layout: 1D grid of 1D blocks, one thread per output neuron, so
 * gridDim.x * blockDim.x must be >= output_size.
 * Dynamic shared memory: sizeof(float) bytes (third launch argument).
 *
 * @param inputs      input values
 * @param input_size  number of inputs (row length used to index weighs)
 * @param input_id    index of the input handled by this launch
 * @param output_size number of output neurons
 * @param weighs      weights, indexed as weighs[o * input_size + input_id]
 * @param results     accumulated in place, output_size floats
 */
__global__
void SumFloatsConnectionsKernel3(float* inputs, unsigned input_size, unsigned input_id, unsigned output_size, float* weighs, float* results)
{
    extern __shared__ float sdata[];
    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Shared memory is private to each block, so the first thread of EVERY
    // block must load the input. Testing the global index (idx == 0) would
    // initialise sdata[0] in block 0 only and leave it undefined elsewhere.
    if (threadIdx.x == 0) sdata[0] = inputs[input_id];
    __syncthreads();

    // NOTE(review): weighs is indexed here as idx * input_size + input_id,
    // whereas SumFloatsConnectionsKernel uses i * output_size + outputNeuron.
    // The two kernels therefore read transposed layouts, and adjacent threads
    // here are input_size elements apart. Confirm which layout is intended.
    if (idx < output_size) {
        results[idx] += sdata[0] * weighs[(idx * input_size) + input_id];
    }
}
I think that the second code is using shared memory broadcasting and the first is not.
How can I “force” my code to use it?
Thank you