Hi, I wrote a kernel that calculates the state/output of a neural net layer. A layer can get its inputs from several other layers, and it is fully connected with each of them. Another special thing is that the state of a neuron can be represented by a single bit (in that case, a weight is a byte instead of a float). I'd like you to read the code and, maybe, give some advice.
I read the reduction pdf, so some of the code is taken from there.
Each block corresponds to an output neuron.
Here’s the structure I use for the layer:
#define BITS_PER_UNSIGNED 32 /* bits per packed word; the kernel uses this everywhere */

typedef enum {FLOAT, BIT, SIGN} VectorType;
typedef enum {BINARY_STEP, BIPOLAR_STEP, REAL, IDENTITY, SIGMOID, BIPOLAR_SIGMOID, ANOTHER_FUNCTION} FunctionType;
typedef struct {
    unsigned numberInputLayers;    /* how many layers feed this one */
    unsigned* inputLayerSize;      /* size of each input layer */
    unsigned totalWeighsPerOutput; /* weights per output neuron, all input layers together */
    void** inputNeurons;           /* one state array per input layer (float* or packed unsigned*) */
    unsigned outputSize;           /* number of output neurons */
    void* outputNeurons;           /* float* or bit-packed unsigned*, depending on the output type */
    float* thresholds;             /* one threshold per output neuron */
    void* weighs;                  /* float* or unsigned char*, depending on the input type */
    FunctionType functionType;     /* activation function to apply */
} struct_Layer;
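The kernel's offset arithmetic only works if totalWeighsPerOutput counts the padding of bit-packed layers, so on the host I fill it in essentially like this (a sketch; the function name is just for illustration):

/* Sketch: totalWeighsPerOutput must match the kernel's weighsOffset
   arithmetic: plain sizes for float inputs, sizes rounded up to whole
   32-bit words (one byte weight per bit, padding included) otherwise. */
unsigned TotalWeighsPerOutput(const struct_Layer* layer, VectorType inputType) {
    unsigned total = 0;
    for (unsigned i = 0; i < layer->numberInputLayers; i++) {
        if (inputType == FLOAT) {
            total += layer->inputLayerSize[i];
        } else {
            total += (((layer->inputLayerSize[i] - 1) / BITS_PER_UNSIGNED) + 1)
                     * BITS_PER_UNSIGNED;
        }
    }
    return total;
}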
Here’s a function for implementing different activation functions (although I’ve not implemented many of them):
__device__ float Func(float number, FunctionType functionType) {
    switch (functionType) {
    case BINARY_STEP:
        return (number > 0) ? 1 : 0;
    case BIPOLAR_STEP:
        return (number > 0) ? 1 : -1;
    case IDENTITY:
    default:
        return number;
    }
}
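When I get to the sigmoid variants, I expect the extra cases to look roughly like this (an untested sketch to slot into the switch above; __expf is the fast device exponential):

    case SIGMOID:         /* 1 / (1 + e^-x), output in (0, 1) */
        return 1.0f / (1.0f + __expf(-number));
    case BIPOLAR_SIGMOID: /* 2 / (1 + e^-x) - 1, output in (-1, 1) */
        return 2.0f / (1.0f + __expf(-number)) - 1.0f;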
And here’s the kernel:
template <unsigned int blockSize, VectorType inputType, VectorType outputType>
__global__ void LayerCalculationKernel(struct_Layer* layer)
{
    extern __shared__ float sdata[];
    unsigned tid = threadIdx.x;
    unsigned outputNeuron = blockIdx.x; // one block per output neuron
    unsigned weighsOffset = outputNeuron * layer->totalWeighsPerOutput;
    float result = 0;
    for (unsigned input = 0; input < layer->numberInputLayers; input++) {
        unsigned i = tid;
        unsigned elementsToRead;
        if (inputType == FLOAT) {
            elementsToRead = layer->inputLayerSize[input];
        } else {
            // Bit-packed input: round up to whole 32-bit words.
            elementsToRead = ((layer->inputLayerSize[input] - 1) / BITS_PER_UNSIGNED) + 1;
        }
        // Declared at this level (it used to live inside the else above) so
        // that the BIT and SIGN branches below can actually see it.
        unsigned mask = 0x80000000;
        while (i < elementsToRead) {
            if (inputType == FLOAT) {
                result += ((float**)layer->inputNeurons)[input][i]
                        * ((float*)layer->weighs)[weighsOffset + i];
            } else if (inputType == BIT) {
                // One byte weight per bit, stored biased by +128;
                // inactive bits contribute nothing.
                for (unsigned j = 0; j < BITS_PER_UNSIGNED; j++) {
                    if (((unsigned**)layer->inputNeurons)[input][i] & mask) {
                        result += ((unsigned char*)layer->weighs)[weighsOffset + (i * BITS_PER_UNSIGNED) + j] - 128;
                    }
                    mask >>= 1;
                }
                mask = 0x80000000;
            } else if (inputType == SIGN) {
                // Bipolar input: a set bit adds the weight, a clear bit subtracts it.
                for (unsigned j = 0; j < BITS_PER_UNSIGNED; j++) {
                    if (((unsigned**)layer->inputNeurons)[input][i] & mask) {
                        result += ((unsigned char*)layer->weighs)[weighsOffset + (i * BITS_PER_UNSIGNED) + j] - 128;
                    } else {
                        result += 128 - ((unsigned char*)layer->weighs)[weighsOffset + (i * BITS_PER_UNSIGNED) + j];
                    }
                    mask >>= 1;
                }
                mask = 0x80000000;
            }
            i += blockSize;
        }
        // Step over this input layer's weights before moving to the next one.
        if (inputType == FLOAT) {
            weighsOffset += elementsToRead;
        } else {
            weighsOffset += elementsToRead * BITS_PER_UNSIGNED;
        }
    }
    // Tree reduction in shared memory, taken from the reduction pdf.
    sdata[tid] = result;
    __syncthreads();
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } __syncthreads(); }
    if (tid < 32) {
        // The last warp runs in lockstep, but the pointer must be volatile so
        // the compiler doesn't cache sdata values in registers between steps.
        volatile float* smem = sdata;
        if (blockSize >= 64) smem[tid] += smem[tid + 32];
        if (blockSize >= 32) smem[tid] += smem[tid + 16];
        if (blockSize >= 16) smem[tid] += smem[tid +  8];
        if (blockSize >=  8) smem[tid] += smem[tid +  4];
        if (blockSize >=  4) smem[tid] += smem[tid +  2];
        if (blockSize >=  2) smem[tid] += smem[tid +  1];
    }
    if (tid == 0) {
        float activation = sdata[0] - layer->thresholds[outputNeuron];
        if (outputType == FLOAT) {
            ((float*)layer->outputNeurons)[outputNeuron] = Func(activation, layer->functionType);
        }
        if (outputType == BIT) {
            // Several blocks share each packed output word, so the bit has to
            // be set or cleared atomically.
            unsigned mask = 0x80000000 >> (outputNeuron % BITS_PER_UNSIGNED);
            if (activation > 0) {
                atomicOr(&((unsigned*)layer->outputNeurons)[outputNeuron / BITS_PER_UNSIGNED], mask);
            } else {
                atomicAnd(&((unsigned*)layer->outputNeurons)[outputNeuron / BITS_PER_UNSIGNED], ~mask);
            }
        }
    }
}
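As a sanity check I compare the BIT path against this little CPU version (my own reference code, same MSB-first packing and +128 weight bias; it assumes the unused padding bits of the last word are zero):

/* Host-side reference for one output neuron's BIT-input sum; weighs points
   at that neuron's slice of the weight array. */
float BitSumCPU(const unsigned* inputBits, const unsigned char* weighs, unsigned numInputs) {
    float result = 0;
    for (unsigned n = 0; n < numInputs; n++) {
        unsigned mask = 0x80000000u >> (n % BITS_PER_UNSIGNED);
        if (inputBits[n / BITS_PER_UNSIGNED] & mask) {
            result += (float)weighs[n] - 128; /* weights are stored biased by +128 */
        }
    }
    return result;
}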
Just in case, here’s the host function that launches the kernel from C/C++ (d_layer points to device memory, so outputSize is copied back to the host before it is used to size the grid):
extern "C" void LayerCalculation(struct_Layer* d_layer, unsigned threads, VectorType inputType, VectorType outputType){
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(d_layer->outputSize, 1, 1);
int smemSize = threads * sizeof(float);
switch (inputType) {
case FLOAT:
if (outputType == FLOAT){
switch (threads)
{
case 512:
LayerCalculationKernel<512, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 256:
LayerCalculationKernel<256, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 128:
LayerCalculationKernel<128, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 64:
LayerCalculationKernel< 64, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 32:
LayerCalculationKernel< 32, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 16:
LayerCalculationKernel< 16, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 8:
LayerCalculationKernel< 8, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 4:
LayerCalculationKernel< 4, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 2:
LayerCalculationKernel< 2, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 1:
LayerCalculationKernel< 1, FLOAT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
}
} else {
switch (threads)
{
case 512:
LayerCalculationKernel<512, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 256:
LayerCalculationKernel<256, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 128:
LayerCalculationKernel<128, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 64:
LayerCalculationKernel< 64, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 32:
LayerCalculationKernel< 32, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 16:
LayerCalculationKernel< 16, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 8:
LayerCalculationKernel< 8, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 4:
LayerCalculationKernel< 4, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 2:
LayerCalculationKernel< 2, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 1:
LayerCalculationKernel< 1, FLOAT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
}
}
break;
case BIT:
if (outputType == FLOAT){
switch (threads)
{
case 512:
LayerCalculationKernel<512, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 256:
LayerCalculationKernel<256, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 128:
LayerCalculationKernel<128, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 64:
LayerCalculationKernel< 64, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 32:
LayerCalculationKernel< 32, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 16:
LayerCalculationKernel< 16, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 8:
LayerCalculationKernel< 8, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 4:
LayerCalculationKernel< 4, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 2:
LayerCalculationKernel< 2, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 1:
LayerCalculationKernel< 1, BIT, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
}
} else {
switch (threads)
{
case 512:
LayerCalculationKernel<512, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 256:
LayerCalculationKernel<256, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 128:
LayerCalculationKernel<128, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 64:
LayerCalculationKernel< 64, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 32:
LayerCalculationKernel< 32, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 16:
LayerCalculationKernel< 16, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 8:
LayerCalculationKernel< 8, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 4:
LayerCalculationKernel< 4, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 2:
LayerCalculationKernel< 2, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 1:
LayerCalculationKernel< 1, BIT, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
}
}
break;
case SIGN:
if (outputType == FLOAT){
switch (threads)
{
case 512:
LayerCalculationKernel<512, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 256:
LayerCalculationKernel<256, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 128:
LayerCalculationKernel<128, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 64:
LayerCalculationKernel< 64, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 32:
LayerCalculationKernel< 32, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 16:
LayerCalculationKernel< 16, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 8:
LayerCalculationKernel< 8, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 4:
LayerCalculationKernel< 4, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 2:
LayerCalculationKernel< 2, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 1:
LayerCalculationKernel< 1, SIGN, FLOAT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
}
} else {
switch (threads)
{
case 512:
LayerCalculationKernel<512, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 256:
LayerCalculationKernel<256, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 128:
LayerCalculationKernel<128, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 64:
LayerCalculationKernel< 64, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 32:
LayerCalculationKernel< 32, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 16:
LayerCalculationKernel< 16, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 8:
LayerCalculationKernel< 8, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 4:
LayerCalculationKernel< 4, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 2:
LayerCalculationKernel< 2, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
case 1:
LayerCalculationKernel< 1, SIGN, BIT><<< dimGrid, dimBlock, smemSize >>>(d_layer); break;
}
}
break;
}
checkCUDAError("Layer Calculation");
}
I don’t know, maybe I’m doing something the wrong way, so any advice is welcome.
Thanks in advance