Hi,

I am using cuda to activate neural networks. It is a simple artificial network type (ann) , and I activate it layer by layer. I do it in 2 phases. First I calculate the sumproducts of [weight] x [input value] , and then I sum all the sumproducts and apply activation function. I use the shared memory and so my kernels are limited to certain array sizes, but this limitation is not the problem. The problem is, kernels are too slow and I want to accelerate them.

In both I have a loop, that calculates the sum and so I was wondering if unrolling the loop (that will increase kernel size) will increase speed? Both loops just iterate 8 times to do a parallel sum operation.

Here are my kernels (still under development), i run these 2 kernels per each layer of the ann:

- Kernel to multiply weights x inputs:

```
// params:
// block.x -> destination node number multiplied by [num_anns] /* division is needed */
// block.y -> offset ( should range from 0 to _MAX_THREADS_PER_KERNEL_ , because of _MAX_THREADS_PER_KERNEL_ limitation of sumproduct calculation )
// thread.x -> link number (ranges from 0 to _MAX_THREADS_PER_KERNEL_ )
extern "C" __global__ void act_layer(uint cur_layer,uint num_outputs,uint num_offsets) {// run: one thread per link
__shared__ __device__ node_value_t tmp[_MAX_THREADS_PER_KERNEL_];
uint i_ann,dst_num,val_idx,weight_idx,offset=0,pow,prev_pow,aux;
node_value_t sum=0.0f,weight,value,tmpval1,tmpval2,*data;
node_value_t *sumproducts;
ann_t *a;
ann_layer_t *al;
i32_t *idxptr;
sumproducts=(node_value_t*) d_gpu_mem.sumproducts_base;
idxptr=(i32_t*) d_gpu_mem.ann_index_list;
data=(node_value_t*) d_gpu_mem.ad_base;
a=(ann_t*) d_gpu_mem.anns_base;
dst_num=blockIdx.x%num_outputs;
aux=blockIdx.x/num_outputs;
i_ann=idxptr[aux];
a=&a[i_ann];
al=&a->layers[cur_layer];
offset=blockIdx.y;
/// here the sum begins
val_idx=offset*_MAX_THREADS_PER_KERNEL_+threadIdx.x;
weight_idx=(dst_num*al->num_inputs)+val_idx;
value=0.0f;weight=0.0f;
if ((val_idx<al->num_inputs)&&(threadIdx.x<_MAX_THREADS_PER_KERNEL_)) {
value=data[al->ia_inputs+val_idx];
weight=data[al->ia_weights+weight_idx];
}
__syncthreads();
tmp[threadIdx.x]=value*weight;
__syncthreads();
prev_pow=0;pow=2;
for(pow=2;pow<=512;pow=pow<<1) {
aux=pow>>1;
if (aux==0) aux=1;
if ( (!(threadIdx.x&(pow-1))) || (prev_pow==threadIdx.x) ) {
if ((threadIdx.x+aux)<blockDim.x) {
tmpval1=tmp[threadIdx.x+aux];
}
}
__syncthreads();
if ( (!(threadIdx.x&(pow-1))) || (prev_pow==threadIdx.x) ) {
if ((threadIdx.x+aux)<blockDim.x) {
tmpval2=tmp[threadIdx.x];
}
}
__syncthreads();
if ( (!(threadIdx.x&(pow-1))) || (prev_pow==threadIdx.x) ) {
if ((threadIdx.x+aux)<blockDim.x) {
tmp[threadIdx.x]=tmpval1+tmpval2;
}
}
prev_pow=pow;
}
__syncthreads();
if (threadIdx.x==0) sum=sum+tmp[0];
__syncthreads();
if (threadIdx.x==0) {
aux=i_ann*(num_outputs*num_offsets)+dst_num*num_offsets+offset;
sumproducts[aux]=sum; // stores corresponding sumproduct, the output of the whole kernel.
}
}
```

- Kernel that sums suproducts and applies activation function

```
extern "C" __global__ void merge_subproducts(uint cur_layer,uint num_outputs,uint num_offsets) { // run: one thread per sumproduct, number of blocks = 1
__shared__ __device__ node_value_t tmp[_MAX_THREADS_PER_KERNEL_];
node_value_t *sumproducts,sum=0.0f,*data,tmpval1,tmpval2;
i32_t *idxptr;
uint i_ann,pow,prev_pow,aux,dst_num,src_idx;
ann_t *a;
ann_layer_t *al;
sumproducts=(node_value_t*) d_gpu_mem.sumproducts_base;
a=(ann_t*) d_gpu_mem.anns_base;
idxptr=(i32_t*) d_gpu_mem.ann_index_list;
data=(node_value_t*) d_gpu_mem.ad_base;
dst_num=blockIdx.x;
i_ann=idxptr[blockIdx.y];
a=&a[i_ann];
al=&a->layers[cur_layer];
/// get all sumproducts
src_idx=i_ann*(num_outputs*num_offsets)+dst_num*num_offsets+threadIdx.x;
__syncthreads();
tmp[threadIdx.x]=sumproducts[src_idx];
__syncthreads();
prev_pow=0;pow=2;
for(pow=2;pow<=512;pow=pow<<1) {
aux=pow>>1;
if (aux==0) aux=1;
if ( (!(threadIdx.x&(pow-1))) || (prev_pow==threadIdx.x) ) {
if ((threadIdx.x+aux)<blockDim.x) {
tmpval1=tmp[threadIdx.x+aux];
}
}
__syncthreads();
if ( (!(threadIdx.x&(pow-1))) || (prev_pow==threadIdx.x) ) {
if ((threadIdx.x+aux)<blockDim.x) {
tmpval2=tmp[threadIdx.x];
}
}
__syncthreads();
if ( (!(threadIdx.x&(pow-1))) || (prev_pow==threadIdx.x) ) {
if ((threadIdx.x+aux)<blockDim.x) {
tmp[threadIdx.x]=tmpval1+tmpval2;
}
}
prev_pow=pow;
}
__syncthreads();
if (threadIdx.x==0) sum=tmp[0];
__syncthreads();
if (threadIdx.x==0) {
sum=sum+data[al->ia_biases_o+dst_num]; // add the Bias
if (sum>20.0f) sum=20.0f; // limit ranges to prevent 'infinite'-type errors
if (sum<-20.0f) sum=-20.0f;
data[al->ia_sum_outputs+dst_num]=sum;
// activation function
// data[al->ia_outputs+dst_num]=1/(1+exp(-sum)); // standard sigmoid
data[al->ia_outputs+dst_num]=2*(1/(1+exp(-sum)))-1; // adjusted sigmoid to fit tangent from -1 to +1
}
}
```

Will appreciate any comments on improving my code

Regards