General questions about CUDA

I wrote a neural net some time ago. The net works like a linked list and was optimized with OpenMP. Its usage was simple.

Now I want to use CUDA to test performance differences. For this I wrote something like a wrapper which converts the asymmetric net structure into an array.

At first I wasn't sure whether to use a 3D array (for the layers consisting of neurons containing the edges), or a 1D array with special indexing.

(I used the latter and overloaded the index operator to handle it like a 3D array.) The last thing I did was to write some 'example' host functions which work with these (essentially normal 1D) arrays.

The last step, the port to CUDA kernel functions, has not been done yet, because I am not totally sure what the best way is to use CUDA here.

What you see below are my current example host functions which handle these 1D array objects with an overloaded index operator (NeuronArray, DeltaArray, ...).

At the moment I am not sure whether it is possible to allocate the complete memory for the network (edges, neurons, and error deltas) at once, or whether I have to split each layer into separate matrices/arrays.

For the latter I would have to allocate and read VRAM very often, so it would be slower. But on the other hand I am not sure how to handle a 3D array in a kernel.

Also I am not sure what will happen if I try to allocate more VRAM than is physically available on the device (only an error code, or a crash?).

Sorry for my bad English.

// pre declarations of HOST code

// Copies an external input vector into the net's input layer (row 0).
inline void SetInput(NeuronArray &Neurons, float *pInput, int iInpSize);

// Computes the output-layer error deltas (target - actual) for backprop.
inline void SetOutput(NeuronArray &Neurons, DeltaArray &dErrors, float *pOutput, int iOutpSize);

// Forward pass: propagates activations layer by layer through the net.
inline void RunFW(NeuronArray &Neurons, WeightsArray &Weights);

// Backward pass: computes error deltas and applies the weight updates.
inline void RunBW(NeuronArray &Neurons, DeltaArray &dErrors, WeightsArray &Weights, float fLearningRate);

// already called in RunBW function

// Back-propagates the error deltas from the output layer toward the input.
inline void CalcErrorDelta(NeuronArray &pNeurons, DeltaArray &pdErrors, WeightsArray &pWeights);

// Copies the external input vector into the first (input) layer of the net.
//
// Parameters:
//   Neurons  - network activation array; row 0 is the input layer
//   pInput   - source buffer holding iInpSize input values
//   iInpSize - number of values to copy; must not exceed the layer width
void SetInput(NeuronArray &Neurons, float *pInput, int iInpSize) {

	assert( iInpSize <= Neurons.GetW() );

	// Bug fix: the original ran the same memcpy iInpSize times inside a
	// loop AND copied in the wrong direction (network -> input buffer).
	// For "SetInput" a single copy from pInput into the input layer is
	// both correct and sufficient.
	memcpy(Neurons.GetSubArray(0), pInput, iInpSize*sizeof(float));

}

// Computes the output-layer error (target minus actual activation) and
// stores it in the last row of the delta array.
//
// Parameters:
//   Neurons   - network activations; last row is the output layer
//   dErrors   - delta array receiving the per-neuron error
//   pOutput   - buffer with the iOutpSize target values
//   iOutpSize - number of output neurons; must not exceed the layer width
void SetOutput(NeuronArray &Neurons, DeltaArray &dErrors, float *pOutput, int iOutpSize) {

	assert( iOutpSize <= Neurons.GetW() );

	const int iLastRow = Neurons.GetH()-1;

	for(int n = 0; n < iOutpSize; ++n)
		dErrors[iLastRow][n] = pOutput[n] - Neurons[iLastRow][n];

}

// Forward pass: for every layer transition, computes the weighted sum of
// the source layer's activations and writes the squashed result into the
// next layer.
//
// Parameters:
//   Neurons - network activations, one row per layer
//   Weights - weight matrices connecting consecutive layers
void RunFW(NeuronArray &Neurons, WeightsArray &Weights) {

	// begin with first hidden layer
	for(int y = 0; y < Neurons.GetH()-1; y++) {

		for(int x = 0; x < Neurons.GetW(); x++) {

			// Bug fix: the accumulator must be reset for every target
			// neuron. Previously it was declared once before the loops
			// and kept growing across all neurons and layers.
			float fVal = 0.f;

			for(int z = 0; z < Weights.GetD(); z++) {

				fVal += Neurons[y][z]*Weights[y][z][x];

			}

			Neurons[y+1][x] = SigTransferFkt(fVal, 0.f);

		}

	}

}

// Back-propagates the error deltas from the output layer toward the input:
// each hidden neuron's delta is the weighted sum of the next layer's deltas,
// passed through the transfer-function derivative.
//
// Parameters:
//   Neurons - network activations (read for the derivative term)
//   dErrors - delta array; the output row must already be filled (SetOutput)
//   Weights - weight matrices connecting consecutive layers
void CalcErrorDelta(NeuronArray &Neurons, DeltaArray &dErrors, WeightsArray &Weights) {

	// begin with last hidden layer
	for(int y = Weights.GetH()-1; y >= 0; y--) {		// layers

		for(int x = 0; x < Neurons.GetW(); x++) {		// neurons

			// Bug fix: accumulate into a fresh local instead of "+=" on
			// dErrors[y][x], which was never cleared and therefore
			// carried stale deltas over from previous iterations.
			float fSum = 0.f;

			for(int z = 0; z < Weights.GetD(); z++) {	// edges

				fSum += Weights[y][x][z]*dErrors[y+1][z];

			}

			dErrors[y][x] = DerSigTransFkt(fSum, Neurons[y][x]);

		}

	}

}

// Backward pass: first computes the error deltas for every layer, then
// applies the gradient-descent weight update
//   w += learningRate * activation * delta(next layer).
//
// Parameters:
//   Neurons       - network activations from the preceding forward pass
//   dErrors       - delta array; output row filled by SetOutput beforehand
//   Weights       - weight matrices to be updated in place
//   fLearningRate - step size of the weight update
void RunBW(NeuronArray &Neurons, DeltaArray &dErrors, WeightsArray &Weights, float fLearningRate) {

	// Propagate the output error back through the net first.
	CalcErrorDelta(Neurons, dErrors, Weights);

	// Walk the layers backwards and update every edge.
	for(int y = Weights.GetH()-1; y >= 0; --y) {			// layers

		for(int x = 0; x < Neurons.GetW(); ++x) {		// neurons

			// Hoist the per-neuron factor; multiplication is
			// left-associative, so the result is unchanged.
			const float fStep = fLearningRate*Neurons[y][x];

			for(int z = 0; z < Weights.GetD(); ++z)		// edges
				Weights[y][x][z] += fStep*dErrors[y+1][z];

		}

	}

}

/*

 * TRAINING

 */

	// Train for 1000 epochs on a single sample pair (fInp1 -> fOut1).
	for(int i = 0; i < 1000; i++) {

		// Load the training sample into the input layer (6 values).
		SetInput(nArray, fInp1, 6);

		// Forward pass through all layers.
		RunFW(nArray, wArray);

		// Compute the output-layer error against the target (6 values).
		SetOutput(nArray, dArray, fOut1, 6);

		// Backpropagate and update weights with learning rate 0.05.
		RunBW(nArray, dArray, wArray, 0.05f);

	}