tips for optimising my neural net kernel

Hi, I wrote a kernel that calculates the state/output of a neural net layer. A layer can be connected to (get inputs from) several other layers, with each of which it is fully connected. Another special feature is that the state of a neuron can be represented by just a bit (in this case, a weight is a byte instead of a float). I'd like you to read the code and, if possible, give some advice.

I read the reduction pdf, so some of the code is taken from there.

Each block corresponds to an output neuron.

Here’s the structure I use for the layer:

// VectorType: storage format of a neuron-state vector.
// FLOAT -> one float per neuron; BIT and SIGN -> one bit per neuron, packed
// into unsigned words (BIT treats a clear bit as 0, SIGN treats it as -1).
typedef enum {FLOAT, BIT, SIGN} VectorType;

// FunctionType: activation function applied to each output neuron's
// weighted sum (only the step functions are implemented in Func below).
typedef enum {BINARY_STEP, BIPOLAR_STEP, REAL, IDENTITY, SIGMOID, BIPOLAR_SIGMOID, ANOTHER_FUNCTION} FunctionType;

// Device-side description of one fully connected layer. A layer may draw
// its inputs from several source layers; the weights for all of them are
// stored back to back in one flat array, totalWeighsPerOutput per output.
typedef struct {

	// Number of source layers feeding this layer.
	unsigned numberInputLayers;

	// Neuron count of each source layer (numberInputLayers entries).
	unsigned* inputLayerSize;

	// Total weights per output neuron (sum over all input layers; for
	// bit-packed inputs presumably rounded up to whole words — TODO confirm
	// against the allocation code, which is not visible here).
	unsigned totalWeighsPerOutput;

	// numberInputLayers device pointers; each points to float[] (FLOAT)
	// or to bit-packed unsigned[] (BIT/SIGN) neuron states.
	void** inputNeurons;

	// Number of output neurons of this layer.
	unsigned outputSize;

	// Output states: float[] or bit-packed unsigned[], matching outputType.
	void* outputNeurons;

	// One activation threshold per output neuron.
	float* thresholds;

	// Weights: float[] for FLOAT inputs, or unsigned char[] biased by 128
	// for BIT/SIGN inputs (the kernel subtracts 128 when reading them).
	void* weighs;

	// Activation function applied to each output neuron.
	FunctionType functionType;

} struct_Layer;

Here’s a function for implementing different activation functions (although I’ve not implemented many of them):

// Applies the layer's activation function to a neuron's net input.
// number       : weighted input sum minus the neuron's threshold.
// functionType : which activation to apply.
// Returns the activated value. REAL and ANOTHER_FUNCTION are still
// unimplemented and fall back to identity, as before; SIGMOID and
// BIPOLAR_SIGMOID are now implemented (previously they also fell through
// to identity despite being declared in FunctionType).
__device__ float Func(float number, FunctionType functionType) {

	switch (functionType) {

		case BINARY_STEP:
			// Heaviside step: output in {0, 1}.
			return (number > 0.0f) ? 1.0f : 0.0f;

		case BIPOLAR_STEP:
			// Sign step: output in {-1, 1}.
			return (number > 0.0f) ? 1.0f : -1.0f;

		case SIGMOID:
			// Logistic sigmoid: output in (0, 1).
			return 1.0f / (1.0f + expf(-number));

		case BIPOLAR_SIGMOID:
			// Bipolar sigmoid (tanh-shaped): output in (-1, 1).
			return 2.0f / (1.0f + expf(-number)) - 1.0f;

		case IDENTITY:
		default:
			return number;

	}

}

And here’s the kernel:

// Computes the states of all output neurons of one layer.
//
// Launch layout: one block per output neuron (gridDim.x == outputSize),
// blockSize threads per block (power of two, <= 512), and
// blockSize * sizeof(float) bytes of dynamic shared memory.
//
// Each thread accumulates a strided partial dot product of inputs and
// weights; partials are summed with a shared-memory tree reduction (after
// the classic NVIDIA reduction slides), then thread 0 applies threshold
// and activation and writes the result.
template <unsigned int blockSize, VectorType inputType, VectorType outputType>
__global__ void LayerCalculationKernel(struct_Layer* layer)
{
	extern __shared__ float sdata[];

	unsigned tid = threadIdx.x;
	unsigned outputNeuron = blockIdx.x;
	// Start of this output neuron's weights inside the flat weight array.
	unsigned weighsOffset = (outputNeuron * layer->totalWeighsPerOutput);

	float result = 0.0f;

	for (unsigned input = 0; input < layer->numberInputLayers; input++) {

		unsigned elementsToRead;
		if (inputType == FLOAT) {
			// One float per input neuron.
			elementsToRead = layer->inputLayerSize[input];
		} else {
			// Bit-packed inputs: whole unsigned words, rounded up.
			elementsToRead = ((layer->inputLayerSize[input] - 1) / BITS_PER_UNSIGNED) + 1;
		}

		// Block-stride loop over this source layer's elements.
		for (unsigned i = tid; i < elementsToRead; i += blockSize) {

			if (inputType == FLOAT) {
				result += ((float**)(layer->inputNeurons))[input][i]
				        * ((float*)layer->weighs)[weighsOffset + i];
			}

			if (inputType == BIT || inputType == SIGN) {
				// FIX: `mask` was declared inside the else-branch above and
				// was therefore out of scope here; it must be (re)set once
				// per processed word. Also hoist the word load out of the
				// bit loop instead of re-reading global memory 32 times.
				unsigned mask = 0x80000000;
				unsigned inputWord = ((unsigned**)(layer->inputNeurons))[input][i];

				for (unsigned j = 0; j < BITS_PER_UNSIGNED; j++) {
					// Packed weights are bytes biased by 128.
					int weigh = ((unsigned char*)layer->weighs)
						[weighsOffset + (i * BITS_PER_UNSIGNED) + j] - 128;
					if (inputWord & mask) {
						result += weigh;       // bit set: input is +1 (BIT and SIGN)
					} else if (inputType == SIGN) {
						result -= weigh;       // bit clear: input is -1 (SIGN only)
					}
					mask >>= 1;
				}
			}
		}

		// Advance to the weights of the next source layer.
		if (inputType == FLOAT) {
			weighsOffset += elementsToRead;
		} else {
			weighsOffset += elementsToRead * BITS_PER_UNSIGNED;
		}
	}

	sdata[tid] = result;
	__syncthreads();

	// Shared-memory tree reduction.
	if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
	if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
	if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }

	if (tid < 32) {
		// FIX: the last warp needs a volatile view of shared memory plus
		// explicit warp syncs. Without them the compiler may keep sdata in
		// registers, and the implicit warp-synchronous execution assumed by
		// the old reduction slides no longer holds on Volta and later.
		volatile float* vsmem = sdata;
		if (blockSize >= 64) { vsmem[tid] += vsmem[tid + 32]; __syncwarp(); }
		if (blockSize >= 32) { vsmem[tid] += vsmem[tid + 16]; __syncwarp(); }
		if (blockSize >= 16) { vsmem[tid] += vsmem[tid +  8]; __syncwarp(); }
		if (blockSize >=  8) { vsmem[tid] += vsmem[tid +  4]; __syncwarp(); }
		if (blockSize >=  4) { vsmem[tid] += vsmem[tid +  2]; __syncwarp(); }
		if (blockSize >=  2) { vsmem[tid] += vsmem[tid +  1]; __syncwarp(); }
	}

	if (tid == 0) {
		float activation = sdata[0] - layer->thresholds[outputNeuron];

		if (outputType == FLOAT) {
			((float*)(layer->outputNeurons))[outputNeuron] =
				Func(activation, layer->functionType);
		}

		if (outputType == BIT) {
			// Packed output: 32 output neurons (i.e. 32 blocks) share each
			// output word, so set/clear this neuron's bit atomically.
			unsigned mask = (unsigned)(0x80000000 >> (outputNeuron % BITS_PER_UNSIGNED));
			unsigned* word = &((unsigned*)(layer->outputNeurons))[outputNeuron / BITS_PER_UNSIGNED];
			if (activation > 0.0f) {
				atomicOr(word, mask);
			} else {
				atomicAnd(word, ~mask);
			}
		}
	}
}

Just in case, here’s the function that is called from C/C++:

extern "C" void LayerCalculation(struct_Layer* d_layer, unsigned threads, VectorType inputType, VectorType outputType){

	dim3 dimBlock(threads, 1, 1);

	dim3 dimGrid(d_layer->outputSize, 1, 1);	

	int smemSize = threads * sizeof(float);

	switch (inputType) {

		case FLOAT:

			if (outputType == FLOAT){

				switch (threads)

				{

					case 512:

						LayerCalculationKernel<512, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 256:

						LayerCalculationKernel<256, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 128:

						LayerCalculationKernel<128, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 64:

						LayerCalculationKernel< 64, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 32:

						LayerCalculationKernel< 32, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 16:

						LayerCalculationKernel< 16, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  8:

						LayerCalculationKernel<  8, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  4:

						LayerCalculationKernel<  4, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  2:

						LayerCalculationKernel<  2, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  1:

						LayerCalculationKernel<  1, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

				}

			} else {

				switch (threads)

				{

					case 512:

						LayerCalculationKernel<512, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 256:

						LayerCalculationKernel<256, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 128:

						LayerCalculationKernel<128, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 64:

						LayerCalculationKernel< 64, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 32:

						LayerCalculationKernel< 32, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 16:

						LayerCalculationKernel< 16, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  8:

						LayerCalculationKernel<  8, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  4:

						LayerCalculationKernel<  4, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  2:

						LayerCalculationKernel<  2, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  1:

						LayerCalculationKernel<  1, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

				}

			}

			break;

		case BIT:

			if (outputType == FLOAT){

				switch (threads)

				{

					case 512:

						LayerCalculationKernel<512, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 256:

						LayerCalculationKernel<256, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 128:

						LayerCalculationKernel<128, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 64:

						LayerCalculationKernel< 64, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 32:

						LayerCalculationKernel< 32, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 16:

						LayerCalculationKernel< 16, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  8:

						LayerCalculationKernel<  8, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  4:

						LayerCalculationKernel<  4, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  2:

						LayerCalculationKernel<  2, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  1:

						LayerCalculationKernel<  1, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

				}

			} else {

				switch (threads)

				{

					case 512:

						LayerCalculationKernel<512, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 256:

						LayerCalculationKernel<256, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 128:

						LayerCalculationKernel<128, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 64:

						LayerCalculationKernel< 64, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 32:

						LayerCalculationKernel< 32, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 16:

						LayerCalculationKernel< 16, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  8:

						LayerCalculationKernel<  8, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  4:

						LayerCalculationKernel<  4, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  2:

						LayerCalculationKernel<  2, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  1:

						LayerCalculationKernel<  1, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

				}

			}

			break;

		case SIGN:

			if (outputType == FLOAT){

				switch (threads)

				{

					case 512:

						LayerCalculationKernel<512, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 256:

						LayerCalculationKernel<256, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 128:

						LayerCalculationKernel<128, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 64:

						LayerCalculationKernel< 64, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 32:

						LayerCalculationKernel< 32, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 16:

						LayerCalculationKernel< 16, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  8:

						LayerCalculationKernel<  8, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  4:

						LayerCalculationKernel<  4, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  2:

						LayerCalculationKernel<  2, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  1:

						LayerCalculationKernel<  1, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

				}

			} else {

				switch (threads)

				{

					case 512:

						LayerCalculationKernel<512, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 256:

						LayerCalculationKernel<256, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 128:

						LayerCalculationKernel<128, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 64:

						LayerCalculationKernel< 64, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 32:

						LayerCalculationKernel< 32, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case 16:

						LayerCalculationKernel< 16, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  8:

						LayerCalculationKernel<  8, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  4:

						LayerCalculationKernel<  4, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  2:

						LayerCalculationKernel<  2, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

					case  1:

						LayerCalculationKernel<  1, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;

				}

			}

			break;

	}

	checkCUDAError("Layer Calculation");

}

I don’t know — maybe I’m doing something the wrong way, so any pointers are welcome.

Thanks in advance