CUDA error: unspecified launch failure

Hello,

I am a bit new to CUDA, so apologies. In my code (attached) I get following error:

“CUDA error: unspecified launch failure”, so I want to ask you by what it could be caused. I have noticed when I comment out 4 lines in kernel CalcNetMSE, marked with comment “// error”, there is no fault. So it could be some unallocated memory or bad indexing in arrays net_outputs or prev_outputs, but I checked allocations and size fits for me.

Thanks for your help, if you have further questions, feel free to ask.

File “input.txt” is attached as well, it is training data for the intended neural network, the code requires it in its directory to correct run.

Code:

#include <cstdio>

#include <cstdlib>

#include <cstdarg>

#include <cmath>

#include <vector>

#include <iostream>

#include <sstream>

#include <fstream>

#include <string>

#include <iterator>

#include <algorithm>

using namespace std;

#define INPUTS      2       // pocet vstupnych neuronov

#define OUTPUTS     3       // pocet vystupnych neuronov

#define LAYERS      2

#define L1          6

#define L2          12

#define L3          30

#define RANGE       1       // rozmery hladanych parametrov

#define SLOPE       0.6F     // sklon sigmoidu

#define LOW_B       -RANGE

#define UP_B        RANGE

//---------------premenne sieti-------------------------

vector<int> neuronsCount;

int inputsCount = 0;

vector<float> trainingInputs;

vector<float> trainingOutputs;

vector<float> weights;

float *netErrors;

int DIMENSION = 0;

//================== CUDA NN =============================

__device__ float *device_train_inputs = 0;

__device__ float *device_train_outputs = 0;

__device__ float *device_weights = 0;

__device__ int   *device_neurons_count = 0;

__device__ float *device_prev_outputs = 0;

__device__ float *device_net_outputs = 0;

__device__ float *device_errors = 0;

float **devTrainIn;

float **devTrainOut;

int   **devNeuCount;

float **devPrevOut;

float **devNetOut;

float **devErrors;

float **devWeights;

//------------------------------------------------------

//------------------------------------------------------

float randRanged(float a, float b)

{

    return ((b - a) * ((float)rand() / RAND_MAX)) + a;

}

float sigmoid(float net)

{

    return (float) ((2 / (1 + exp(-1 * SLOPE * net))) - 1);

}

void loadTrainFile (string filename) {

string line;

    ifstream file (filename.c_str());

if (file.is_open()){

	while (!file.eof()) {

	    getline(file, line);

	    if (line.empty()) {

		continue;

	    }

	    vector<string> tokens;

	    istringstream iss(line);

	    copy(istream_iterator<string>(iss),

		 istream_iterator<string>(),

		 back_inserter< vector<string> >(tokens));

	    for (int i = 0; i < INPUTS; i++) {

		trainingInputs.push_back(atof(tokens[i].c_str()));

	    }

	    for (int i = 0; i < OUTPUTS; i++) {

		trainingOutputs.push_back(atof(tokens[i + INPUTS].c_str()));

	    }

	    inputsCount++;

	}

    }

/* vypis vstupov na trenovanie

    for (unsigned i = 0; i < trainingInputs.size(); i++) {

 for (unsigned j = 0; j < INPUTS; j++) {

     cout << trainingInputs[i][j] << " ";

 }

 cout << endl;

    } */

}

void initWeights (int hiddenLayersCount, ...) {

neuronsCount.push_back(INPUTS);

va_list neuron_num;

    va_start(neuron_num, hiddenLayersCount);

    for (int i = 0; i < hiddenLayersCount; i++)

	neuronsCount.push_back(va_arg(neuron_num, unsigned int));

    va_end(neuron_num);

neuronsCount.push_back(OUTPUTS);

for (unsigned i = 0; i < neuronsCount.size() - 1; i++) {

	for (int j = 0; j < neuronsCount[i] * neuronsCount[i+1]; j++) {

	    weights.push_back(randRanged(-RANGE, RANGE));

	}

    }

DIMENSION = weights.size();

}

__device__ float dev_sigmoid(float net)

{

    return (float) ((2 / (1 + exp(-1 * SLOPE * net))) - 1);

}

__global__ void calcNetMSE (int	  maxLayerSize,

			    int	  dim,

			    float *weights,

			    float *train_inputs,

			    float *train_outputs,

			    int	  *neurons_count,

			    float *prev_outputs,

			    float *net_outputs,

			    float *errors)

{

    //unsigned int threadIndex = blockDim.x * blockIdx.x + threadIdx.x;

int idx = blockIdx.x * blockDim.x + threadIdx.x;

    int threadIndex = threadIdx.x;

    //shared[threadIndex] = input[threadIndex];

if (threadIndex < maxLayerSize) {

	//============== calc net output ===================================================

	for (int i = 0; i < INPUTS; i++) {

	    prev_outputs[threadIndex * INPUTS + i] = train_inputs[threadIndex * INPUTS + i];

	}

	int offset = 0;

	for (unsigned i = 1; i < LAYERS + 2; i++) {

	    for (int j = 0; j < neurons_count[i]; j++) {

		//net_outputs[threadIndex * maxLayerSize + j] = 0.0F; // error

		for (unsigned k = 0; k < neurons_count[i-1]; k++) {

		    //net_outputs[threadIndex * maxLayerSize + j] += weights[offset++] * prev_outputs[threadIndex * maxLayerSize + k]; //error

		}

		//net_outputs[threadIndex * maxLayerSize + j] = dev_sigmoid(net_outputs[threadIndex * maxLayerSize + j]); // error

	    }

	    for (int j = 0; j < neurons_count[i]; j++) {

		//prev_outputs[threadIndex * maxLayerSize + j] = net_outputs[threadIndex * maxLayerSize + j]; // error

	    }

	}

	//============== end ===============================================================

	float result = 0.0F;

	for (int i = 0; i < OUTPUTS; i++) {

	    result += abs(pow(train_outputs[threadIndex * OUTPUTS + i] - net_outputs[threadIndex * OUTPUTS + i], 2));

	}

	//errors[threadIndex] = (result / 2.0);

	errors[threadIndex] = train_inputs[threadIndex];

    }

}

float getNetOutput(vector<float> weights_vector,

     float *train_inputs, float *train_outputs, int *neurons_count,

     float *prev_outputs, float *net_outputs, float *errors)

//float getNetOutput(vector<float> weights_vector)

{

float *weights_array = new float[weights_vector.size()];

    cudaMemcpy(device_weights, weights_array, weights_vector.size() * sizeof(float), cudaMemcpyHostToDevice);

//======================= GPU ======================

    const size_t block_size = 512;

    size_t grid_size = inputsCount / block_size;

// deal with a possible partial final block

    if(inputsCount % block_size) {

	grid_size++;

    }

cout << grid_size << endl << block_size << endl;

calcNetMSE<<<grid_size, block_size>>>(inputsCount, DIMENSION, device_weights,

       train_inputs, train_outputs, neurons_count,

       prev_outputs, net_outputs, errors);

    /*calcNetMSE<<<grid_size, block_size>>>(inputsCount, DIMENSION, *devWeights,

					  *devTrainIn, *devTrainOut, *devNeuCount,

					  *devPrevOut, *devNetOut, *devErrors); */

    cudaThreadSynchronize();

cudaError_t cudaError = cudaGetLastError();

if(cudaError != cudaSuccess)

    {

	// print the CUDA error message and exit

	printf("CUDA error: %s\n", cudaGetErrorString(cudaError));

	exit(-1);

    }

cudaMemcpy(netErrors, errors, inputsCount * sizeof(float), cudaMemcpyDeviceToHost);

    cout << "segf" << endl;

    float sum = 0.0F;

for (int i = 0; i < inputsCount; i++) {

	sum += netErrors[i];

	cout << i << " " << netErrors[i] << endl;

    }

cout << sum << endl;

return sum;

}

//int main(int argc, char *argv[])

int main(void)

{

srand(time(NULL));

loadTrainFile("input.txt"); // nacitanie trenovacieho suboru

    initWeights(LAYERS, L1, L2, L3); // inicializacia skrytych vrstiev a vah

int maxCount = 0;

    for (int i = 0; i < neuronsCount.size(); i++) {

	maxCount = (maxCount < neuronsCount[i]) ? (neuronsCount[i]) : (maxCount);

    }

cout << maxCount << endl << neuronsCount.size() << endl;

//===================================

    cudaMalloc((void**)&device_train_inputs,    inputsCount * INPUTS    * sizeof(float));

    cudaMalloc((void**)&device_train_outputs,   inputsCount * OUTPUTS   * sizeof(float));

    cudaMalloc((void**)&device_weights,         DIMENSION               * sizeof(float));

    cudaMalloc((void**)&device_prev_outputs,    inputsCount * maxCount  * sizeof(float));

    cudaMalloc((void**)&device_net_outputs,     inputsCount * maxCount  * sizeof(float));

    cudaMalloc((void**)&device_neurons_count,   neuronsCount.size()     * sizeof(int));

    cudaMalloc((void**)&device_errors,          inputsCount		* sizeof(float));

devTrainIn	= &device_train_inputs;

    devTrainOut	= &device_train_outputs;

    devWeights	= &device_weights;

    devNeuCount	= &device_neurons_count;

    devPrevOut	= &device_prev_outputs;

    devNetOut	= &device_net_outputs;

    devErrors	= &device_errors;

// if any memory allocation failed, report an error message

    if((device_train_inputs == 0) || (device_train_outputs == 0) || (device_weights == 0) ||

	    (device_prev_outputs == 0) || (device_net_outputs == 0)

	    || (device_neurons_count == 0) || (device_errors == 0)) {

	printf("couldn't allocate memory\n");

	return 1;

    }

netErrors = new float[inputsCount];

float *trainingInputsArray = new float[inputsCount * INPUTS];

    float *trainingOutputsArray = new float[inputsCount * OUTPUTS];

    float *neuronsCountArray = new float[neuronsCount.size()];

for (int i = 0; i < trainingInputs.size(); i++) {

	trainingInputsArray[i] = trainingInputs[i];

    }

    for (int i = 0; i < trainingOutputs.size(); i++) {

	trainingOutputsArray[i] = trainingOutputs[i];

    }

    for (int i = 0; i < neuronsCount.size(); i++) {

	neuronsCountArray[i] = neuronsCount[i];

    }

cudaMemcpy(device_train_inputs, trainingInputsArray, inputsCount * INPUTS * sizeof(float), cudaMemcpyHostToDevice);

    cudaMemcpy(device_train_outputs, trainingOutputsArray, inputsCount * OUTPUTS * sizeof(float), cudaMemcpyHostToDevice);

    cudaMemcpy(device_neurons_count, neuronsCountArray, neuronsCount.size() * sizeof(float), cudaMemcpyHostToDevice);

cout << "init completed" << endl;

getNetOutput(weights,

		 device_train_inputs, device_train_outputs, device_neurons_count,

		 device_prev_outputs, device_net_outputs, device_errors);

return 0;

}

sorry for duplicate post, please delete this