How constant memory cache works?

I can’t find enough information about constant memory cache. What is the nature of constant cache. I’ve read somewhere that each SP (Streaming multiprocessor) has 8Kb of cache. Is it a separate on-chip memory or is it registers that store these constants? How fast is cache, is it slower than shared memory or registers? I’m using around 300 - 500 bytes of constant memory. As far as I understand it should fit inside those 8 kilobytes that each SP has. What I would like is a procedure that caches all my constants in the beginning.

Here is “Kernel.h”

[codebox]

#include <stdio.h>

#include <cutil.h>

#include <cuda_runtime.h>

#include <cutil_inline.h>

#include “DeviceTypes.h”

constant Single* inputs;

constant UInt32* genotypes;

constant Single* phenotypes;

constant GaConsts gaConfig;

constant MmConsts mmConfig;

constant NnConsts nnConfig;

constant NnXLayer nnLayers[Layers];

#define MatrixARows(layer) nnLayers[layer].OutputRows

#define MatrixACols(layer) nnLayers[layer].InputRows

#define MatrixBRows(layer) nnLayers[layer].InputCols

#define MatrixBCols(layer) nnLayers[layer].OutputCols

#define MatrixSRows(layer) nnLayers[layer].OutputRows

#define MatrixSCols(layer) nnLayers[layer].OutputCols

global void Kernel()

{

#ifdef DEVICE_EMULATION

if ((blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x == 0)

{

	printf("Genetic algorithm configuration:\r\n");

	printf("Steps.......................... %u\r\n", gaConfig.Steps);

	printf("PopulationSize................. %u\r\n", gaConfig.PopulationSize);

	printf("CrossoverProbability........... %u\r\n", gaConfig.CrossoverProbability);

	printf("MutationProbability............ %u\r\n", gaConfig.MutationProbability);

	printf("InversionProbability........... %u\r\n", gaConfig.InversionProbability);

	printf("RangeA......................... %f\r\n", gaConfig.RangeA);

	printf("RangeDelta..................... %f\r\n", gaConfig.RangeDelta);

	printf("Matrix multiplication configuration:\r\n");

	printf("WindowSize..................... %f\r\n", mmConfig.WindowSize);

	printf("WindowCapacity................. %f\r\n", mmConfig.WindowCapacity);

	printf("--------------\r\n");

}

#endif

UInt32 index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;

UInt32 spicieIndex = index / mmConfig.WindowCapacity;



UInt32 windowIndex = index - spicieIndex * mmConfig.WindowCapacity;



UInt32 windowRow = windowIndex / mmConfig.WindowSize;

UInt32 windowCol = windowIndex - windowRow * mmConfig.WindowSize;

for (UInt32 layer = 0; layer < nnConfig.Layers; layer++)

{

	// Here I evaluate neural network 

}

}

template T* Allocate(UInt32 size)

{

Byte* ptr;

CUDA_SAFE_CALL(cudaMalloc((void**)&ptr, size));

return ptr;

}

template void Free(T* ptr)

{

CUDA_SAFE_CALL(cudaFree(ptr));

}

extern “C” void LaunchKernel

(

Byte* gaConfig,

Byte* mmConfig,

Byte* nnConfig,

Byte* layerConsts,

UInt32 layerConstsSize,

Byte* inputGlobals,

UInt32 inputGlobalsSize,

Byte* genotypeGlobals,

UInt32 genotypeGlobalsSize,

Byte* phenotypeGlobals,

UInt32 phenotypeGlobalsSize

)

{

#ifndef DEVICE_EMULATION

cudaSetDevice(cutGetMaxGflopsDeviceId());

#endif

unsigned int timer = 0;

Byte* inputGlobalsDevice = Allocate<Byte>(inputGlobalsSize);

Byte* genotypeGlobalsDevice = Allocate<Byte>(genotypeGlobalsSize);

Byte* phenotypeGlobalsDevice = Allocate<Byte>(phenotypeGlobalsSize);

CUDA_SAFE_CALL(cudaMemcpy(inputGlobalsDevice, inputGlobals, inputGlobalsSize, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(genotypeGlobalsDevice, genotypeGlobals, genotypeGlobalsSize, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(phenotypeGlobalsDevice, phenotypeGlobals, phenotypeGlobalsSize, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(inputs, (void*)&inputGlobalsDevice, sizeof(Single*), 0, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(genotypes, (void*)&genotypeGlobalsDevice, sizeof(UInt32*), 0, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(phenotypes, (void*)&phenotypeGlobalsDevice, sizeof(Single*), 0, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(::gaConfig, (void*)gaConfig, sizeof(GaConsts), 0, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(::mmConfig, (void*)mmConfig, sizeof(MmConsts), 0, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(::nnConfig, (void*)nnConfig, sizeof(NnConsts), 0, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpyToSymbol(::nnLayers, (void*)layerConsts, layerConstsSize, 0, cudaMemcpyHostToDevice));

cutilCheckError(cutCreateTimer(&timer));

cutilCheckError(cutStartTimer(timer));

Kernel<<<1, 128>>>();

cudaThreadSynchronize();

cutilCheckError(cutStopTimer(timer));

Single timerValue = cutGetTimerValue(timer);

cutilCheckError(cutDeleteTimer(timer));

printf("Processing time: %f (ms)\n", timerValue);

Free(inputGlobalsDevice);

Free(genotypeGlobalsDevice);

Free(phenotypeGlobalsDevice);

}

[/codebox]

Here is “Program.cpp”

[codebox]

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil_inline.h>

#include “Common.h”

#include “HostTypes.h”

extern “C” void LaunchKernel

(

Byte* gaConsts,

Byte* mmConsts,

Byte* nnConsts,

Byte* layerConsts,

UInt32 layerConstsSize,

Byte* inputGlobals,

UInt32 inputGlobalsSize,

Byte* genotypeGlobals,

UInt32 genotypeGlobalsSize,

Byte* phenotypeGlobals,

UInt32 phenotypeGlobalsSize

);

int main(int argc, char** argv)

{

srand(Date);

RandomSingleMatrix input = RandomSingleMatrix(InputRows, InputCols, GrayRangeA, GrayRangeB);

ZeroSingleExLayer phenotypeLayer = ZeroSingleExLayer(InputRows, InputCols, OutputRows, OutputCols);

RandomUInt32ExLayer genotypeLayer = RandomUInt32ExLayer(InputRows, InputCols, OutputRows, OutputCols, None);

Object** genotypeNetwork = new Object*[Layers];

Object** phenotypeNetwork = new Object*[Layers];

for (UInt32 i = 0; i < Layers; i++)

{

	genotypeNetwork[i] = &genotypeLayer;

	phenotypeNetwork[i] = &phenotypeLayer;

}

GeneralCollection genotype = GeneralCollection(Layers, genotypeNetwork, None);

GeneralCollection phenotype = GeneralCollection(Layers, phenotypeNetwork, None);

GeneralVector inputs = GeneralVector(PopulationSize, &input, None);

GeneralVector genotypes = GeneralVector(PopulationSize, &genotype, None);

GeneralVector phenotypes = GeneralVector(PopulationSize, &phenotype, None);

GeneticAlgorithmConfiguration gac = GeneticAlgorithmConfiguration

(

	Steps,

	PopulationSize,

	CrossoverProbabililty,

	MutationProbabililty,

	InversionProbabililty,

	GrayRangeA,

	GrayRangeB

);

NeuralNetworkConfiguration nnc = NeuralNetworkConfiguration

(

	Layers,

	genotype.GetGlobalsSize(),

	phenotype.GetGlobalsSize(),

	genotypeLayer.GetGlobalsSize(),

	phenotypeLayer.GetGlobalsSize()

);

MatrixMultiplicationConfiguration mmc = MatrixMultiplicationConfiguration(WindowSize);

ConstantsBuffer *gacConstants = gac.GetConstants();

ConstantsBuffer *mmcConstants = mmc.GetConstants();

ConstantsBuffer *nncConstants = nnc.GetConstants();

GlobalsBuffer *inputsGlobals = inputs.GetGlobals();

GlobalsBuffer *genotypesGlobals = genotypes.GetGlobals();

GlobalsBuffer *phenotypesGlobals = phenotypes.GetGlobals();

ConstantsBuffer *layerConstants = genotypes.GetConstants();

LaunchKernel

(

	gacConstants->Ptr,

	mmcConstants->Ptr,

	nncConstants->Ptr,

	layerConstants->Ptr,

	layerConstants->Size,

	inputsGlobals->Ptr,

	inputsGlobals->Size,

	genotypesGlobals->Ptr,

	genotypesGlobals->Size,

	phenotypesGlobals->Ptr,

	phenotypesGlobals->Size

);

delete gacConstants;

delete mmcConstants;

delete nncConstants;

delete inputsGlobals;

delete genotypesGlobals;

delete phenotypesGlobals;

delete layerConstants;

}

[/codebox]

Here is “Common.h”

[codebox]

#ifndef COMMON

#define COMMON

// Types

typedef int Int32;

typedef short Int16;

typedef float Single;

typedef double Double;

typedef signed char SByte;

typedef unsigned char Byte;

typedef unsigned int UInt32;

typedef unsigned short UInt16;

// Constants

static const UInt32 Date = 20090409;

static const Single GrayRangeA = -1;

static const Single GrayRangeB = 1;

static const UInt32 WindowSize = 2;

static const UInt32 Steps = 4;

static const UInt32 PopulationSize = 1024;

static const UInt32 Layers = 8;

static const UInt32 InputRows = 8;

static const UInt32 InputCols = 8;

static const UInt32 OutputRows = 8;

static const UInt32 OutputCols = 8;

static const UInt32 CrossoverProbabililty = (UInt32)(0.95 * UINT_MAX);

static const UInt32 MutationProbabililty = (UInt32)(0.03 * UINT_MAX);

static const UInt32 InversionProbabililty = (UInt32)(0.02 * UINT_MAX);

#endif

[/codebox]

Here are “HostTypes.h”

[codebox]

// TODO : Replace ‘new’ with ‘cudaMallocHost’

// TODO : Make objects singleton or mutable

#ifndef HOSTTYPES

#define HOSTTYPES

#include “Common.h”

enum Space

{

None = 0,

Globals = 1,

Constants = 2,

Mixed = 3

};

class Buffer

{

private:

static Byte* Allocate(UInt32 size)

{

	Byte* result;

	CUDA_SAFE_CALL(cudaMallocHost((void**)&result, size));

	return result;

}

protected:

Buffer() : Size(0), Ptr(NULL)

{

	Offset = 0;

}

Buffer(UInt32 size) : Size(size), Ptr(new Byte)

{

	Offset = 0;

}

public:

UInt32 Offset;

Byte *Ptr;

UInt32 Size;

bool IsEmpty()

{

	return Size == 0 || Ptr == NULL;

}



template <typename T> void Add(T value)

{

	// When buffer not specified, we only count elements

	if (Ptr != NULL)

	{

		// Before writing we should check for buffer overflow

		// TODO: add #ifndef NO_BUFFER_OVERFLOW_CHECK

		if (Offset + sizeof(T) <= Size)

		{

			*((T*)(Ptr + Offset)) = value;

		}

		else

		{

			throw "Buffer overflow in Buffer::Add(T value)";

		}

	}

	Offset += sizeof(T);

}

template <typename T> T* Allocate(UInt32 count)

{

	T* result = (T*)(Ptr + Offset);

	Offset += count * sizeof(T);

	if (Offset > Size)

	{

		Offset -= count * sizeof(T);

		throw "Buffer overflow in T* Buffer::Allocate<T>(UInt32 count)";

	}

	return result;

}

template <typename T> void Account(UInt32 count)

{

	Offset += count * sizeof(T);

}

void AccountOffset(UInt32 offset)

{

	Offset += offset;

}

virtual ~Buffer()

{

	if (Ptr != NULL) CUDA_SAFE_CALL(cudaFreeHost(Ptr));

}

};

class GlobalsBuffer : public Buffer

{

public:

GlobalsBuffer() : Buffer()

{

}

GlobalsBuffer(UInt32 size) : Buffer(size)

{

}

};

class ConstantsBuffer : public Buffer

{

public:

ConstantsBuffer() : Buffer()

{

}

ConstantsBuffer(UInt32 size) : Buffer(size)

{

}

};

class Object

{

public:

virtual UInt32 GetGlobalsSize() const

{

	GlobalsBuffer buffer = GlobalsBuffer();

	RenderGlobals(&buffer);

	return buffer.Offset;

}

virtual UInt32 GetConstantsSize() const

{

	ConstantsBuffer buffer = ConstantsBuffer();

	RenderConstants(&buffer);

	return buffer.Offset;

}

virtual void RenderGlobals(GlobalsBuffer *buffer) const = 0;

virtual void RenderConstants(ConstantsBuffer *buffer) const = 0;

virtual ConstantsBuffer* GetConstants() const

{

	return GetConstants(GetConstantsSize());

}

virtual ConstantsBuffer* GetConstants(UInt32 size) const

{

	ConstantsBuffer *result = new ConstantsBuffer(size);

	RenderConstants(result);

	return result;

}

virtual ConstantsBuffer* GetConstants(ConstantsBuffer *buffer) const

{

	RenderConstants(buffer);

	

	return buffer;

}

virtual GlobalsBuffer* GetGlobals() const

{

	return GetGlobals(GetGlobalsSize());

}

virtual GlobalsBuffer* GetGlobals(UInt32 size) const

{

	GlobalsBuffer *result = new GlobalsBuffer(size);

	RenderGlobals(result);

	return result;

}

virtual GlobalsBuffer* GetGlobals(GlobalsBuffer *buffer)

{

	RenderGlobals(buffer);

	return buffer;

}

};

// MixedObject objects can affect both constant and global memory spaces depending on the value of field ‘ObjectSpace’.

class MixedObject : public Object

{

protected:

virtual void RenderObjectGlobals(GlobalsBuffer *buffer) const = 0;

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const = 0;

public:

Space ObjectSpace;

MixedObject(Space objectSpace = Mixed) : ObjectSpace(objectSpace)

{

}

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	if (ObjectSpace & Globals) RenderObjectGlobals(buffer);

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	if (ObjectSpace & Constants) RenderObjectConstants(buffer);

}

};

// GlobalObject lies in global memory space and therefore not affects constant memory space

class GlobalObject : public Object

{

private:

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	// This is empty method, no changes are made to the buffer

}

virtual UInt32 GetConstantsSize() const override

{

	return 0;

}

};

// ConstantObject lies in constant memory space and therefore not affects global memory space

class ConstantObject : public Object

{

private:

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	// This is empty method, no changes are made to the buffer

}

virtual UInt32 GetGlobalsSize() const override

{

	return 0;

}

};

class UInt32Object : public GlobalObject

{

public:

virtual UInt32 GetGlobalsSize() const override

{

	return sizeof(UInt32);

}

};

class SingleObject : public GlobalObject

{

public:

virtual UInt32 GetGlobalsSize() const override

{

	return sizeof(Single);

}

};

class ZeroUInt32 : public UInt32Object

{

public:

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	buffer->Add<UInt32>(0);

}

};

class ZeroSingle : public SingleObject

{

public:

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	buffer->Add<Single>(0);

}

};

class RandomUInt32 : public UInt32Object

{

public:

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	buffer->Add(rand() + (rand() << 16));

}

};

class RandomSingle : public SingleObject

{

public:

Single GrayRangeA;

Single RangeDeltaNormalized;

RandomSingle() : GrayRangeA(0), RangeDeltaNormalized(1.0F / UINT_MAX)

{

}

RandomSingle

(

	Single rangeA,

	Single rangeDelta

) : GrayRangeA(rangeA),

	RangeDeltaNormalized(rangeDelta / UINT_MAX)

{

}

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	buffer->Add(GrayRangeA + (rand() + (rand() << 16)) * RangeDeltaNormalized);

}

};

class Container : public MixedObject

{

private:

virtual void RenderObjectGlobals(GlobalsBuffer *buffer) const override

{

	// Containers can't have any globals, only their children can.

}

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	buffer->Add(Length);

	buffer->Add(GetGlobalsSize());

}

public:

UInt32 Length;

Space ChildSpace;

Container(UInt32 length, Space objectSpace = Constants, Space childSpace = Mixed) : Length(length), ChildSpace(childSpace), MixedObject(objectSpace)

{

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	if (ObjectSpace & Constants) RenderObjectConstants(buffer);

}

};

// MixedObjectContainer is the object that contains mixed child elements.

class MixedObjectContainer : public Container

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const = 0;

virtual void RenderChildConstants(ConstantsBuffer *buffer) const = 0;

public:

MixedObjectContainer(UInt32 length, Space objectSpace = Constants, Space childSpace = Mixed) : Container(length, objectSpace, childSpace)

{

}

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	if (ChildSpace & Globals) RenderChildGlobals(buffer);

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	Container::RenderConstants(buffer);

	if (ChildSpace & Constants) RenderChildConstants(buffer);

}

};

class GlobalObjectContainer : public Container

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const = 0;

public:

GlobalObjectContainer(UInt32 length, Space objectSpace = Constants, Space childSpace = Globals) : Container(length, objectSpace, childSpace)

{

}

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	if (ChildSpace & Globals) RenderChildGlobals(buffer);

}

};

class ConstantObjectContainer : public Container

{

protected:

virtual void RenderChildConstants(ConstantsBuffer *buffer) const = 0;

public:

ConstantObjectContainer(UInt32 length, Space objectSpace = Constants, Space childSpace = Constants) : Container(length, objectSpace, childSpace)

{

}

virtual void RenderGlobals(GlobalsBuffer *buffer) const override

{

	// No globals 

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	Container::RenderConstants(buffer);

	if (ChildSpace & Constants) RenderChildConstants(buffer);

}

};

class UInt32Vector : public GlobalObjectContainer

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	GlobalObjectContainer::RenderObjectConstants(buffer);

	buffer->Add(Length * sizeof(UInt32));

}

public:

UInt32Vector(UInt32 length, Space objectSpace = Constants, Space childSpace = Globals) : GlobalObjectContainer(length, objectSpace, childSpace)

{

}

virtual UInt32 GetGlobalsSize() const override

{

	return Length * sizeof(UInt32);

}

};

class SingleVector : public GlobalObjectContainer

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	GlobalObjectContainer::RenderObjectConstants(buffer);

	buffer->Add(Length * sizeof(Single));

}

public:

SingleVector(UInt32 length, Space objectSpace = Constants, Space childSpace = Globals) : GlobalObjectContainer(length, objectSpace, childSpace)

{

}

virtual UInt32 GetGlobalsSize() const override

{

	return Length * sizeof(Single);

}

};

class ZeroUInt32Vector : public UInt32Vector

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->Account<UInt32>(Length);

	}

	else

	{

		UInt32 length = Length;

		UInt32 *pointer = buffer->Allocate<UInt32>(length);

		for (UInt32 i = 0; i < length; i++) pointer[i] = 0;

	}

}

public:

ZeroUInt32Vector(UInt32 length, Space objectSpace = Constants, Space childSpace = Globals) : UInt32Vector(length, objectSpace, childSpace)

{

}

};

class ZeroSingleVector : public SingleVector

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->Account<Single>(Length);

	}

	else

	{

		UInt32 length = Length;

		Single *pointer = buffer->Allocate<Single>(length);

		for (UInt32 i = 0; i < length; i++) pointer[i] = 0.0F;

	}

}

public:

ZeroSingleVector(UInt32 length, Space objectSpace = Constants, Space childSpace = Globals) : SingleVector(length, objectSpace, childSpace)

{

}

};

class RandomUInt32Vector : public UInt32Vector

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->Account<UInt32>(Length);

	}

	else

	{

		UInt32 length = Length;

		UInt32 *pointer = buffer->Allocate<UInt32>(length);

		for (UInt32 i = 0; i < length; i++) pointer[i] = rand() + (rand() << 16);

	}

}

public:

RandomUInt32Vector(UInt32 length, Space objectSpace = Constants, Space childSpace = Globals) : UInt32Vector(length, objectSpace, childSpace)

{

}

};

class RandomSingleVector : public SingleVector

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->Account<Single>(Length);

	}

	else

	{

		UInt32 length = Length;

		Single *pointer = buffer->Allocate<Single>(length);

		Single rangeA = GrayRangeA;

		Single rangeDelta = RangeDeltaNormalized;

		for (UInt32 i = 0; i < length; i++) pointer[i] = rangeA + (rand() + (rand() << 16)) * rangeDelta;

	}

}

public:

Single GrayRangeA;

Single RangeDeltaNormalized;

RandomSingleVector(UInt32 length, Single rangeA, Single rangeDelta, Space objectSpace = Constants, Space childSpace = Globals) : GrayRangeA(rangeA), RangeDeltaNormalized(rangeDelta / UINT_MAX), SingleVector(length, objectSpace, childSpace)

{

}

};

class ZeroUInt32Matrix : public ZeroUInt32Vector

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	ZeroUInt32Vector::RenderObjectConstants(buffer);

	buffer->Add(Rows);

	buffer->Add(Cols);

}

public:

UInt32 Rows;

UInt32 Cols;

ZeroUInt32Matrix(UInt32 rows, UInt32 cols, Space objectSpace = Constants, Space childSpace = Globals) : Rows(rows), Cols(cols), ZeroUInt32Vector(rows * cols, objectSpace, childSpace)

{

}

};

class ZeroSingleMatrix : public ZeroSingleVector

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	ZeroSingleVector::RenderObjectConstants(buffer);

	buffer->Add(Rows);

	buffer->Add(Cols);

}

public:

UInt32 Rows;

UInt32 Cols;

ZeroSingleMatrix(UInt32 rows, UInt32 cols, Space objectSpace = Constants, Space childSpace = Globals) : Rows(rows), Cols(cols), ZeroSingleVector(rows * cols, objectSpace, childSpace)

{

}

};

class RandomUInt32Matrix : public RandomUInt32Vector

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	UInt32Vector::RenderObjectConstants(buffer);

	buffer->Add(Rows);

	buffer->Add(Cols);

}

public:

UInt32 Rows;

UInt32 Cols;

RandomUInt32Matrix(UInt32 rows, UInt32 cols, Space objectSpace = Constants, Space childSpace = Globals) : Rows(rows), Cols(cols), RandomUInt32Vector(rows * cols, objectSpace, childSpace)

{

}

};

class RandomSingleMatrix : public RandomSingleVector

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	RandomSingleVector::RenderObjectConstants(buffer);

	buffer->Add(Rows);

	buffer->Add(Cols);

}

public:

UInt32 Rows;

UInt32 Cols;

RandomSingleMatrix(UInt32 rows, UInt32 cols, Single rangeA, Single rangeDelta, Space objectSpace = Constants, Space childSpace = Globals) : Rows(rows), Cols(cols), RandomSingleVector(rows * cols, rangeA, rangeDelta, objectSpace, childSpace)

{

}

};

class GeneralCollection : public MixedObjectContainer

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->AccountOffset(GetGlobalsSize());

	}

	else

	{

		for (UInt32 i = 0; i < Length; i++) Elements[i]->RenderGlobals(buffer);

	}

}

virtual void RenderChildConstants(ConstantsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->AccountOffset(GetConstantsSize());

	}

	else

	{

		UInt32 length = Length;

		for (UInt32 i = 0; i < length; i++) Elements[i]->RenderConstants(buffer);

	}

}

public:

Object **Elements;

GeneralCollection(UInt32 length, Object **elements, Space objectSpace = Constants, Space childSpace = Mixed) : Elements(elements), MixedObjectContainer(length, objectSpace, childSpace)

{

}

virtual UInt32 GetGlobalsSize() const override

{

	UInt32 size = 0;

	UInt32 length = Length;

	for (UInt32 i = 0; i < length; i++) size += Elements[i]->GetGlobalsSize();

	return size;

}

virtual UInt32 GetConstantsSize() const override

{

	UInt32 size = 0;

	UInt32 length = Length;

	for (UInt32 i = 0; i < length; i++) size += Elements[i]->GetConstantsSize();

	return size;

}

};

class GeneralVector : public MixedObjectContainer

{

protected:

virtual void RenderChildGlobals(GlobalsBuffer *buffer) const override

{

	if (buffer->IsEmpty())

	{

		buffer->AccountOffset(Length * Element->GetGlobalsSize());

	}

	else

	{

		for (UInt32 i = 0; i < Length; i++) Element->RenderGlobals(buffer);

	}

}

virtual void RenderChildConstants(ConstantsBuffer *buffer) const override

{

	Element->RenderConstants(buffer);

}

public:

Object *Element;

GeneralVector(UInt32 length, Object *element, Space objectSpace = Constants, Space childSpace = Mixed) : Element(element), MixedObjectContainer(length, objectSpace, childSpace)

{

}

virtual UInt32 GetGlobalsSize() const override

{

	return Length * Element->GetGlobalsSize();

}

};

class GeneralMatrix : public GeneralVector

{

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const override

{

	GeneralVector::RenderObjectConstants(buffer);

	buffer->Add(Rows);

	buffer->Add(Cols);

}

public:

UInt32 Rows;

UInt32 Cols;

GeneralMatrix(UInt32 rows, UInt32 cols, Object *element, Space objectSpace = Constants, Space childSpace = Mixed) : Rows(rows), Cols(cols), GeneralVector(rows * cols, element, objectSpace, childSpace)

{

}

};

// Standard neural network layer consists of a vector “Connections” that represents connections

// and a vector “Biases” that represents biases. Having some input vector “Input” to calculate

// current layer pass we need to do the following:

//

// “Connections” * “Input” + “Biases” (corresponding vector dimensions must agree)

//

// Though not in every case it is easy to work with vectors, you need non mathematical procedures

// to create a 2-dimensional matrix out of a vector, and in my opinion it is not good.

//

// In this case I’m using 2 matrices “Left” and “Right” as the connections and matrix “Biases”

// as the biases. To complete current layer pass I will have to do the following:

//

// “Left” * “Input” * “Right” + “Biases”

//

// By having the input matrix dimensions equal to “InputRows x InputCols” and by willing to receive

// “OutputRows x OutputCols” matrix from current layer pass the above operation should look like:

//

// “OutputRows x InputRows” * “InputRows x InputCols” * “InputCols x OutputCols” + “OutputRows x OutputCols”

//

class ExLayer : public MixedObject

{

private:

virtual void RenderObjectGlobals(GlobalsBuffer *buffer) const

{

	// No globals

}

protected:

virtual void RenderObjectConstants(ConstantsBuffer *buffer) const

{

	UInt32 offset = 0;

	buffer->Add(InputRows);

	buffer->Add(InputCols);

	buffer->Add(OutputRows);

	buffer->Add(OutputCols);

	buffer->Add(offset += Left->GetGlobalsSize());

	buffer->Add(offset += Right->GetGlobalsSize());

	buffer->Add(offset += Biases->GetGlobalsSize());

	buffer->Add(offset += LeftOutput->GetGlobalsSize());

	buffer->Add(offset += RightOutput->GetGlobalsSize());

}

ExLayer

(

	UInt32 inputRows,

	UInt32 inputCols,

	UInt32 outputRows,

	UInt32 outputCols,

	Object *left,

	Object *leftOutput,

	Object *right,

	Object *rightOutput,

	Object *biases,

	Space outSpace = Globals,

	Space lrbSpace = Globals,

	Space objectSpace = Constants

) : Left(left),

	Right(right),

	Biases(biases),

	LeftOutput(leftOutput),

	RightOutput(rightOutput),

	InputRows(inputRows),

	InputCols(inputCols),

	OutputRows(outputRows),

	OutputCols(outputCols),

	LrbSpace(lrbSpace),

	OutSpace(outSpace),

	MixedObject(objectSpace)

{

}

public:

Space OutSpace;

Space LrbSpace;

Object *Left;

Object *Right;

Object *Biases;

Object *LeftOutput;

Object *RightOutput;

UInt32 InputRows;

UInt32 InputCols;

UInt32 OutputRows;

UInt32 OutputCols;

virtual void RenderGlobals(GlobalsBuffer *buffer) const

{

	if (LrbSpace & Globals)

	{

		Left->RenderGlobals(buffer);

		Right->RenderGlobals(buffer);

		Biases->RenderGlobals(buffer);

	}

	if (OutSpace & Globals)

	{

		LeftOutput->RenderGlobals(buffer);

		RightOutput->RenderGlobals(buffer);

	}

}

virtual void RenderConstants(ConstantsBuffer *buffer) const

{

	MixedObject::RenderConstants(buffer);

	if (LrbSpace & Constants)

	{

		Left->RenderConstants(buffer);

		Right->RenderConstants(buffer);

		Biases->RenderConstants(buffer);

	}

	if (OutSpace & Constants)

	{

		LeftOutput->RenderConstants(buffer);

		RightOutput->RenderConstants(buffer);

	}

}

~ExLayer()

{

	delete Left;

	delete Right;

	delete Biases;

	delete LeftOutput;

	delete RightOutput;

}

};

class ZeroUInt32ExLayer : public ExLayer

{

public:

ZeroUInt32ExLayer(UInt32 inputRows, UInt32 inputCols, UInt32 outputRows, UInt32 outputCols, Space outSpace = Globals, Space lrbSpace = Globals, Space objectSpace = Constants) : ExLayer

(

	inputRows,

	inputCols,

	outputRows,

	outputCols,

	new ZeroUInt32Matrix(outputRows, inputRows, Constants, Mixed),

	new ZeroUInt32Matrix(inputCols, outputCols, Constants, Mixed),

	new ZeroUInt32Matrix(outputRows, outputCols, Constants, Mixed),

	new ZeroUInt32Matrix(outputRows, inputCols, Constants, Mixed),

	new ZeroUInt32Matrix(outputRows, outputCols, Constants, Mixed),

	outSpace, lrbSpace, objectSpace

)

{

}

};

class RandomUInt32ExLayer : public ExLayer

{

public:

RandomUInt32ExLayer(UInt32 inputRows, UInt32 inputCols, UInt32 outputRows, UInt32 outputCols, Space outSpace = Globals, Space lrbSpace = Globals, Space objectSpace = Constants) : ExLayer

(

	inputRows,

	inputCols,

	outputRows,

	outputCols,

	new RandomUInt32Matrix(outputRows, inputRows, Constants, Mixed),

	new RandomUInt32Matrix(inputCols, outputCols, Constants, Mixed),

	new RandomUInt32Matrix(outputRows, outputCols, Constants, Mixed),

	new RandomUInt32Matrix(outputRows, inputCols, Constants, Mixed),

	new RandomUInt32Matrix(outputRows, outputCols, Constants, Mixed),

	outSpace, lrbSpace, objectSpace

)

{

}

};

class ZeroSingleExLayer : public ExLayer

{

public:

ZeroSingleExLayer

(

	UInt32 inputRows,

	UInt32 inputCols,

	UInt32 outputRows,

	UInt32 outputCols,

	Space outSpace = Globals,

	Space lrbSpace = Globals,

	Space objectSpace = Constants

) : ExLayer

(

	inputRows,

	inputCols,

	outputRows,

	outputCols,

	new ZeroSingleMatrix(outputRows, inputRows, Constants, Globals),

	new ZeroSingleMatrix(inputCols, outputCols, Constants, Globals),

	new ZeroSingleMatrix(outputRows, outputCols, Constants, Globals),

	new ZeroSingleMatrix(outputRows, inputCols, Constants, Globals),

	new ZeroSingleMatrix(outputRows, outputCols, Constants, Globals),

	outSpace, lrbSpace, objectSpace

)

{

}

};

class RandomSingleExLayer : public ExLayer

{

public:

RandomSingleExLayer(UInt32 inputRows, UInt32 inputCols, UInt32 outputRows, UInt32 outputCols, Single rangeA, Single rangeDelta, Space outSpace = Globals, Space lrbSpace = Globals, Space objectSpace = Constants) : ExLayer

(

	inputRows,

	inputCols,

	outputRows,

	outputCols,

	new RandomSingleMatrix(outputRows, inputRows, rangeA, rangeDelta, Constants, Globals),

	new RandomSingleMatrix(inputCols, outputCols, rangeA, rangeDelta, Constants, Globals),

	new RandomSingleMatrix(outputRows, outputCols, rangeA, rangeDelta, Constants, Globals),

	new RandomSingleMatrix(outputRows, inputCols, rangeA, rangeDelta, Constants, Globals),

	new RandomSingleMatrix(outputRows, outputCols, rangeA, rangeDelta, Constants, Globals),

	outSpace, lrbSpace, objectSpace

)

{

}

};

class GeneralExLayer : public ExLayer

{

public:

GeneralExLayer(UInt32 inputRows, UInt32 inputCols, UInt32 outputRows, UInt32 outputCols, Object *element, Space outSpace = Globals, Space lrbSpace = Globals, Space objectSpace = Constants) :

ExLayer

(

	inputRows,

	inputCols,

	outputRows,

	outputCols,

	new GeneralMatrix(outputRows, inputRows, element, Constants, Mixed),

	new GeneralMatrix(inputCols, outputCols, element, Constants, Mixed),

	new GeneralMatrix(outputRows, outputCols, element, Constants, Mixed),

	new GeneralMatrix(outputRows, InputCols, element, Constants, Mixed),

	new GeneralMatrix(outputRows, outputCols, element, Constants, Mixed),

	outSpace, lrbSpace, objectSpace

)

{

}

};

class GeneticAlgorithmConfiguration : public ConstantObject

{

public:

UInt32 Steps;

UInt32 PopulationSize;

UInt32 CrossoverProbability;

UInt32 MutationProbability;

UInt32 InversionProbability;

Single GrayRangeA;

Single GrayRangeB;

GeneticAlgorithmConfiguration

(

	UInt32 steps,

	UInt32 populationSize,

	UInt32 crossoverProbability,

	UInt32 mutationProbability,

	UInt32 inversionProbability,

	Single grayRangeA,

	Single grayRangeB

)

:

	Steps(steps),

	PopulationSize(populationSize),

	CrossoverProbability(crossoverProbability),

	MutationProbability(mutationProbability),

	InversionProbability(inversionProbability),

	GrayRangeA(grayRangeA),

	GrayRangeB(grayRangeB)

{

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	buffer->Add(Steps);

	buffer->Add(PopulationSize);

	buffer->Add(CrossoverProbability);

	buffer->Add(MutationProbability);

	buffer->Add(InversionProbability);

	buffer->Add(GrayRangeA);

	buffer->Add((GrayRangeB - GrayRangeA) / UINT_MAX);

}

};

class MatrixMultiplicationConfiguration : public ConstantObject

{

public:

UInt32 WindowSize;

UInt32 WindowCapacity;

MatrixMultiplicationConfiguration(UInt32 windowSize) : WindowSize(windowSize), WindowCapacity(windowSize * windowSize)

{

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	buffer->Add(WindowSize);

	buffer->Add(WindowCapacity);

}

};

class NeuralNetworkConfiguration : public ConstantObject

{

public:

UInt32 Layers;

UInt32 GenotypeSize;

UInt32 PhenotypeSize;

UInt32 GenotypeLayerSize;

UInt32 PhenotypeLayerSize;

NeuralNetworkConfiguration

(

	UInt32 layers,

	UInt32 genotypeSize,

	UInt32 phenotypeSize,

	UInt32 genotypeLayerSize,

	UInt32 phenotypeLayerSize

) : Layers(layers),

	GenotypeSize(genotypeSize),

	PhenotypeSize(phenotypeSize),

	GenotypeLayerSize(genotypeLayerSize),

	PhenotypeLayerSize(phenotypeLayerSize)

{

}

virtual void RenderConstants(ConstantsBuffer *buffer) const override

{

	buffer->Add(Layers);

	buffer->Add(GenotypeSize);

	buffer->Add(PhenotypeSize);

	buffer->Add(GenotypeLayerSize);

	buffer->Add(PhenotypeLayerSize);

}

};

#endif

[/codebox]

Here are “DeviceTypes.h”

[codebox]

#ifndef DEVICETYPES

#define DEVICETYPES

#include “Common.h”

typedef struct align(8)

{

UInt32 Length;

UInt32 Size;

} Array;

typedef struct align(8)

{

UInt32 Length;

UInt32 Size;

} Collection;

typedef struct align(16)

{

UInt32 Length;

UInt32 Size;

UInt32 Rows;

UInt32 Cols;

} Matrix;

typedef struct align(16)

{

UInt32 InputRows;

UInt32 InputCols;

UInt32 OutputRows;

UInt32 OutputCols;

UInt32 LeftOffset;

UInt32 RightOffset;

UInt32 BiasesOffset;

UInt32 LeftOutputOffset;

UInt32 RightOutputOffset;

} NnXLayer;

typedef struct align(4)

{

UInt32 Layers;

UInt32 GenotypeSize;

UInt32 PhenotypeSize;

UInt32 GenotypeLayerSize;

UInt32 PhenotypeLayerSize;

} NnConsts;

typedef struct align(16)

{

UInt32 Steps;

UInt32 PopulationSize;

UInt32 CrossoverProbability;

UInt32 MutationProbability;

UInt32 InversionProbability;

Single RangeA;

Single RangeDelta;

} GaConsts;

typedef struct align(8)

{

UInt32 WindowSize;

UInt32 WindowCapacity;

} MmConsts;

#endif

[/codebox]

Each MP (not SP) has 8kb of cache. (source: Appendix A, CUDA Programming guide)

It is a separate memory.

It is as fast as a register. (source: CUDA programming guide section 5.1.2.2)

I didn’t look at the pages and pages of code you pasted, but there is one thing you have to be aware of with constant memory. The cache is broadcast only. The programming guide says: (section 5.1.2.2) “The cost scales linearly with the number of different addresses read by all threads [in a warp].” In other words, the constant cache only performs at its best when you have all threads in a warp accessing the same value at once. If your threads are reading different values, the performance becomes much worse. In this case, a shared memory cache preloaded at the beginning of each block is often faster than constant memory.

Thank you for your detailed answer, it really helped. Another maybe silly question. After data migrated to cache how long will it stay there. I mean what actions can incidentally kick it out of there? As far as I understand when I exceed 8 kilobytes limit the oldest data will eventually be popped out of the cache. Is that correct?

One would assume so, yes. However, the details of the cache replacement policy are not documented anywhere that I am aware of.

Well at least i’m not the only one who thinks so. Thank you for your answer MisterAnderson42