Hi,

I need to optimize the running time of a simple 13-layer neural network (MLP).

The code performs multiplication, addition, and ReLU operations: I multiply weights by the input, add the bias, and then apply a ReLU function.

// One thread per output neuron: computes dot(in, weights row) + bias, then ReLU.
//
// Launch with a 1D grid covering at least output_size threads, e.g.
//   threads = 256; blocks = (output_size + threads - 1) / threads;
//
// in      : device pointer, input activations, length input_size
// weights : device pointer, row-major [output_size x input_size]
// bias    : device pointer, length output_size
// out     : device pointer, length output_size (written)
__global__ void kernel_applyweightandbias(float *in, float *weights, float *bias,
                                          float *out, const int input_size,
                                          const int output_size) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < output_size) {
        float sum = bias[tid];
        // NOTE(review): weights[tid*input_size + i] means adjacent threads read
        // addresses input_size apart (uncoalesced). A transposed weight layout
        // would coalesce these loads — requires changing the host-side layout.
        for (int i = 0; i < input_size; i++) {
            sum += in[i] * weights[tid * input_size + i];
        }
        // ReLU on the register value, single global store. The original stored
        // out[tid]=sum, then re-read out[tid] from global memory and
        // conditionally stored 0 — up to two extra global accesses per thread.
        out[tid] = fmaxf(sum, 0.0f);
    }
}

I tried to use half-precision floats, but performance did not improve.

// Half-precision variant of the layer kernel: one thread per output neuron.
// Requires SM53+ for the half arithmetic intrinsics (__hfma, __hle) — the
// Jetson Nano (sm_53) qualifies.
//
// in      : device pointer, input activations, length input_size
// weights : device pointer, row-major [output_size x input_size]
// bias    : device pointer, length output_size
// out     : device pointer, length output_size (written)
__global__ void kernel_applyweightandbias(half *in, half *weights, half *bias,
                                          half *out, const int input_size,
                                          const int output_size) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < output_size) {
        // Declare and initialize inside the guard (the original declared an
        // uninitialized half outside it).
        half sum = bias[tid];
        for (int i = 0; i < input_size; i++) {
            // Fused multiply-add in half precision.
            sum = __hfma(in[i], weights[tid * input_size + i], sum);
        }
        // ReLU: clamp negatives to zero, then a single global store.
        // NOTE(review): scalar half ops run at the same rate as float on many
        // parts; packing pairs into half2 (__hfma2) is what doubles throughput.
        if (__hle(sum, __float2half(0.0f))) {
            sum = __float2half(0.0f);
        }
        out[tid] = sum;
    }
}

Any idea how to optimize this code?

Please note that the target hardware is an NVIDIA Jetson Nano with compute capability sm_53.