andAccumulate kernel wrong output

I’m currently trying to create a spiking neural network in CUDA C/C++. To do this I have to AND the weights of each dendrite with the input at that dendrite. The results of these operations will be accumulated and it will be added to the charge of the neuron. After the charge of the neuron has reached a certain threshold, the neuron will spike. There are various ways these neurons can learn, but I’m not there yet.

I’m currently just trying to accumulate the result of the AND operation into charge. For the most part the outputs of the kernel seem to give the right result, however, some results are wrong. To see why this is so, I loaded the output data of the AND result into weights, and checked each value. However, when I did this, the program seemed to work fine every time I ran it. I don’t know what I’m doing wrong, and I’ve been at this for a couple of iterations, so I’m finally asking you guys for help. I’ve used this document to create the accumulate kernel. The other code I’ve come up with myself. Here’s my code:

#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#include <iostream>

using namespace std;

typedef struct
{
    uint8_t* inputs;
    int8_t* weights;
    int8_t* charge;
}Neuron;

__device__ void InAndWeights(int8_t* accRegS, const uint8_t* in, const int8_t* weights, const uint32_t* bits)
{
    uint32_t bytes = *bits >> 3;

    uint8_t* inS = reinterpret_cast<uint8_t*>(&accRegS[*bits]);

    for (uint32_t i = threadIdx.x; i < bytes; i += blockDim.x)
    {
        inS[i] = in[i];
    }
    __syncthreads();

for (uint32_t i = threadIdx.x; i < *bits; i += blockDim.x)
    {
        uint32_t k = i >> 3;
        uint8_t temp = 1 << (i & 7);
        if(temp & inS[k])
        {
            accRegS[i] = weights[i];
        }
        else
        {
            accRegS[i] = 0;
        }
    }
    __syncthreads();
}

__device__ void warpReduce(volatile int8_t* data)
{
    uint32_t tid = threadIdx.x;
    data[tid] += data[tid + 32];
    data[tid] += data[tid + 16];
    data[tid] += data[tid + 8];
    data[tid] += data[tid + 4];
    data[tid] += data[tid + 2];
    data[tid] += data[tid + 1];
}

__device__ void accumulate(int8_t* data, const uint32_t* length)
{
    for(uint32_t i = *length >> 1; i > 32; i >>= 1)
    {
        for(uint32_t j = threadIdx.x; j < i; j += blockDim.x)
        {
            data[j] += data[j + i];
        }
        __syncthreads();
    }
    if(threadIdx.x < 32)
    {
        warpReduce(data);
    }
}

__global__ void andAccumulate(Neuron* a, const uint8_t* log_DpN, const uint32_t* NoNeurons)
{
    uint32_t DpN = 1 << *log_DpN;

    extern __shared__ int8_t S[];
    int8_t* accRegS = S;

    for(uint32_t i = blockIdx.x; i < *NoNeurons; i += gridDim.x)
    {
        //and bits of inputs with weights and load into shared memory
        InAndWeights(accRegS, a[i].inputs, a[i].weights, &DpN);

/*
        //test test test    -------------------------------------------uncomment this piece of code to make the program seem to work.
        for (uint32_t j = threadIdx.x; j < DpN; j += blockDim.x)
        {
            a[i].weights[j] = accRegS[j];
        }
        //end test end test end test
        */

//accumulate result of and
        accumulate(accRegS, &DpN);
        __syncthreads();
        //integrate into charge
        *a[i].charge += accRegS[0];
    }
}

int main()
{
    //constants
    const uint32_t aLength = 65536;
    const size_t aSize = aLength * sizeof(Neuron);

    const uint8_t log_inputsLength = 10;
    const uint32_t inputsLength = 1 << log_inputsLength;
    const size_t inputsSize = inputsLength * sizeof(uint8_t);

    const uint8_t log_weightsLength = log_inputsLength + 3;
    const uint32_t weightsLength = 1 << log_weightsLength;
    const size_t weightsSize = weightsLength * sizeof(int8_t);

    const size_t chargeSize = sizeof(int8_t);

    //host side allocation
    Neuron* h_h_a = new Neuron[aLength]; //host side array of host side neurons
    for(uint32_t i = 0; i < aLength; ++i)
    {
        h_h_a[i].inputs = new uint8_t[inputsLength];
        h_h_a[i].weights = new int8_t[weightsLength];
        h_h_a[i].charge = new int8_t;
    }

    for(uint32_t i = 0; i < aLength; ++i)
    {
        for (uint32_t j = 0; j < inputsLength; ++j)
        {

            if((j >= inputsLength - 15) & (j < inputsLength))
            {
                h_h_a[i].inputs[j] = uint8_t(i);
            }
            else if(j < 15)
            {
                h_h_a[i].inputs[j] = uint8_t(i);
            }
            else
            {
                h_h_a[i].inputs[j] = uint8_t(0);
            }
        }

        for(uint32_t j = 0; j < weightsLength; ++j)
        {
            h_h_a[i].weights[j] = 1;
        }

        *h_h_a[i].charge = -10;
    }

    //device side allocation
    uint32_t* d_aLength;
    cudaMalloc(&d_aLength, sizeof(uint32_t));
    cudaMemcpy(d_aLength, &aLength, sizeof(uint32_t), cudaMemcpyHostToDevice);

    uint8_t* d_log_weightsLength;
    cudaMalloc(&d_log_weightsLength, sizeof(uint8_t));
    cudaMemcpy(d_log_weightsLength, &log_weightsLength, sizeof(uint8_t), cudaMemcpyHostToDevice);

    Neuron* h_d_a = new Neuron[aLength]; //host side array of device side neurons
    for(uint32_t i = 0; i < aLength; ++i)
    {
        cudaMalloc(&h_d_a[i].inputs, inputsSize);
        cudaMalloc(&h_d_a[i].weights, weightsSize);
        cudaMalloc(&h_d_a[i].charge, chargeSize);

        cudaMemcpy(h_d_a[i].inputs, h_h_a[i].inputs, inputsSize, cudaMemcpyHostToDevice);
        cudaMemcpy(h_d_a[i].weights, h_h_a[i].weights, weightsSize, cudaMemcpyHostToDevice);
        cudaMemcpy(h_d_a[i].charge, h_h_a[i].charge, chargeSize, cudaMemcpyHostToDevice);
    }

    Neuron* d_d_a; //device side array of device side neurons
    cudaMalloc(&d_d_a, aSize);
    cudaMemcpy(d_d_a, h_d_a, aSize, cudaMemcpyHostToDevice); //copy pointers to neuron data from h_d_a to d_d_a

    //start kernel timing //////////////////////////////////////////
    cudaEvent_t start;
    cudaEvent_t stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    //execute kernel
    andAccumulate<< <16384, 1024, 9 << log_inputsLength >> > (d_d_a, d_log_weightsLength, d_aLength);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    //end kernel timing ////////////////////////////////////////////

    //copy output data to host
    for (uint32_t i = 0; i < aLength; ++i)
    {
        cudaMemcpy(h_h_a[i].charge, h_d_a[i].charge, chargeSize, cudaMemcpyDeviceToHost);
        cudaMemcpy(h_h_a[i].weights, h_d_a[i].weights, weightsSize, cudaMemcpyDeviceToHost);
    }

    //free device memory
    for (uint32_t i = 0; i < aLength; ++i)
    {
        cudaFree(h_d_a[i].inputs);
        cudaFree(h_d_a[i].weights);
        cudaFree(h_d_a[i].charge);
    }
    cudaFree(d_aLength);
    cudaFree(d_log_weightsLength);
    delete[] h_d_a;
    cudaFree(d_d_a);

    //print input data
    for (uint32_t j = 0; j < 32; ++j)
    {
        for(uint32_t i = 0; i < 15; ++i)
        {
            cout << int(h_h_a[j].inputs[i]) << " ";
        }
        cout << "... ";
        for (uint32_t i = inputsLength - 15; i < inputsLength; ++i)
        {
            cout << int(h_h_a[j].inputs[i]) << " ";
        }
        cout << endl;
    }
    cout << "..." << endl;
    for (uint32_t j = aLength - 32; j < aLength; ++j)
    {
        for (uint32_t i = 0; i < 15; ++i)
        {
            cout << int(h_h_a[j].inputs[i]) << " ";
        }
        cout << "... ";
        for (uint32_t i = inputsLength - 15; i < inputsLength; ++i)
        {
            cout << int(h_h_a[j].inputs[i]) << " ";
        }
        cout << endl;
    }
    cout << endl << endl;

    //print weights output
    for (uint32_t i = 0; i < 32; ++i)
    {
        for (uint32_t j = 0; j < 11; ++j)
        {
            for (uint32_t k = 7; k < uint32_t(-1); --k)
            {
                cout << int(h_h_a[i].weights[8*j + k]);
            }
            cout << " ";
        }
        cout << "... ";
        for (uint32_t j = inputsLength - 11; j < inputsLength; ++j)
        {
            for(uint32_t k = 7; k < uint32_t(-1); --k)
            {
                cout << int(h_h_a[i].weights[8*j + k]);
            }
            cout << " ";
        }
        cout << endl;
    }
    cout << "..." << endl;
    for (uint32_t i = aLength - 32; i < aLength; ++i)
    {
        for (uint32_t j = 0; j < 11; ++j)
        {
            for (uint32_t k = 7; k < uint32_t(-1); --k)
            {
                cout << int(h_h_a[i].weights[8 * j + k]);
            }
            cout << " ";
        }
        cout << "... ";
        for (uint32_t j = inputsLength - 11; j < inputsLength; ++j)
        {
            for (uint32_t k = 7; k < uint32_t(-1); --k)
            {
                cout << int(h_h_a[i].weights[8 * j + k]);
            }
            cout << " ";
        }
        cout << endl;
    }
    cout << endl << endl;

    //print output data
    for (uint32_t j = 0; j < 32; ++j)
    {
        cout << int(*h_h_a[j].charge) << endl;
    }
    cout << "..." << endl;
    for (uint32_t j = aLength - 32; j < aLength; ++j)
    {
        cout << int(*h_h_a[j].charge) << endl;
    }

    //calculate and print bandwidth
    cout << endl << milliseconds << "ms" << endl;

    //free host memory
    for(uint32_t i = 0; i < aLength; ++i)
    {
        delete[] h_h_a[i].inputs;
        delete[] h_h_a[i].weights;
        delete h_h_a[i].charge;
    }
    delete[] h_h_a;

    cin.get();
}

I’m currently just using 8 bit integers. This might be a little too small, but this is just a proof of concept. Once it works, I’ll start using 16 bit integers and see how many bits I really need.

I’m using visual studio 2017 with CUDA toolkit 9.2 and a laptop gtx1060 3GB.

I’m looking forward to your reply.

-Dehim