There I was, so proud that I had a case, and then it turned out to be a stupid programming mistake on my side. I apologize for that. I might as well include the whole program then, it’s about 1 kloc consisting of mostly fluff. It’s debeeked but hopefully, it still flies:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <time.h>
#include <iostream>
#include <fstream>
#include <iterator>
#include <ctime>
#include <locale>
#include <cstdlib>
#include <Windows.h>
#define QS_MAX_LEVELS 300
typedef long long int64;
typedef unsigned long long uint64;
#define MAX_ZERO_SOLUTIONS 1000000
/* CUDA grid configuration */
#define GDIMX 4//8
#define GDIMY 4//8
#define GDIMZ 1
#define BDIMX 4//8
#define BDIMY 4//8
#define BDIMZ 1
#define NUM_THREADS 256 //4096
#ifndef __CUDACC__
#define __CUDACC__
#endif
struct scEnzConfig
{
float *vEnzA; // Vector of elements of enzyme A
float *vEnzB; // and B respectively
float *vDigest; // Vector of resulting double digest of A and B
int iSizeA; // Vector sizes ...
int iSizeB;
int iSizeDigest;
};
struct simConfig
{
float fAlpha;
float fBeta;
int64 i64Seed;
int64 i64Interval;
};
/* To the KernelStatus struct describes the status for each kernel
that is intended to be visible to other kernels during execution.
The cache veriables are there to assist with index specific operations
on the struct when reporting status. They are disposable variables used
to compute global status parameters by reduction. If a thread has found
a solution, it dies and the isAlive is there to indicate whether that
has happened to a particular thread.
*/
struct scKernelStatus
{
float currentOptH;
int currentOptCfgA;
int currentOptCfgB;
//int currentCfgA;
//int currentCfgB;
int64 numIterations;
int threadIdx;
int intCache; // a copy of threadIdx to find currentIdx (which thread has lowest energy)
int64 int64Cache; // a copy numIterations for globalNumIterations
float floatCache; // a copy of currentOptH for currentEnergy
bool boolCache; // a copy of isAlive for evaluation of allThreadsDead
bool isAlive;
};
/*
This struct keeps track of found solutions and current progress. It's also
there to monitor some essential functionality. The 'current' prefixed
elements contain information about current status. Some operations are
conducted by only one thread. Very little CPU time will be spent on single
thread operations but they cannot be avoided completely. Since threads die
after a zero solution is found (we may change that later), it would be
unfortunate if single-thread operations were appointed to that thread.
So before such a thread dies, the 'activeThread' can be changed to some
other thread that is alive, which can be found in the boolean 'isAlive'
in the scKernelStatus struct.
*/
struct scGlobalStatus
{
/* A struct of arrays SOA */
int zeroConfA[MAX_ZERO_SOLUTIONS];
int zeroConfB[MAX_ZERO_SOLUTIONS];
int numZeroes;
int64 globalNumIterations;
int64 currentIdx;
int64 activeThreadIdx;
bool stopSignal;
bool allThreadsDead;
float currentEnergy;
int currentGlobalOptCfgA;
int currentGlobalOptCfgB;
};
/*
********** BEGIN: Windows System Console ANSI color enabler module
******************************************************************************
*/
struct colors_t
{
HANDLE hStdOut;
int initial_colors;
colors_t()
{
hStdOut = GetStdHandle(STD_OUTPUT_HANDLE);
initial_colors = getColors();
}
~colors_t()
{
setColors(initial_colors);
}
int getColors() const
{
CONSOLE_SCREEN_BUFFER_INFO csbi;
GetConsoleScreenBufferInfo(hStdOut, &csbi);
return csbi.wAttributes;
}
void setColors(int color)
{
SetConsoleTextAttribute(hStdOut, color);
}
void setFg(int color)
{
int current_colors = getColors();
setColors((color & 0x0F) | (current_colors & 0xF0));
}
void setBg(int color)
{
int current_colors = getColors();
setColors(((color & 0x0F) << 4) | (current_colors & 0x0F));
}
int getFg() const { return getColors() & 0x0F; }
int getBg() const { return (getColors() >> 4) & 0x0F; }
COORD getCurPos()
{
CONSOLE_SCREEN_BUFFER_INFO csbi;
GetConsoleScreenBufferInfo(hStdOut, &csbi);
return csbi.dwCursorPosition;
}
void setCurPos(int x, int y) {
COORD curPos;
curPos.X = x;
curPos.Y = y;
SetConsoleCursorPosition(hStdOut, curPos);
}
};
enum
{
Black, dBlue, dGreen, dCyan, dRed, dMagenta, dYellow, hGray,
dGray, hBlue, hGreen, hCyan, hRed, hMagenta, hYellow, White
};
/*
********** END: Windows System Console ANSI color enabler module
******************************************************************************
*/
/*
********** BEGIN: System time measurement module
******************************************************************************
*/
const __int64 DELTA_EPOCH_IN_MICROSECS = 11644473600000000;
struct timezone2
{
__int32 tz_minuteswest; /* minutes W of Greenwich */
bool tz_dsttime; /* type of dst correction */
};
struct timeval2
{
__int32 tv_sec; /* seconds */
__int32 tv_usec; /* microseconds */
};
int gettimeofday(struct timeval2 *tv/*in*/, struct timezone2 *tz/*in*/)
{
FILETIME ft;
__int64 tmpres = 0;
TIME_ZONE_INFORMATION tz_winapi;
int rez = 0;
ZeroMemory(&ft, sizeof(ft));
ZeroMemory(&tz_winapi, sizeof(tz_winapi));
GetSystemTimeAsFileTime(&ft);
tmpres = ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (__int32)(tmpres*0.000001);
tv->tv_usec = (tmpres % 1000000);
//_tzset(),don't work properly, so we use GetTimeZoneInformation
rez = GetTimeZoneInformation(&tz_winapi);
if (tz)
{
tz->tz_dsttime = (rez == 2) ? true : false;
tz->tz_minuteswest = tz_winapi.Bias + ((rez == 2) ? tz_winapi.DaylightBias : 0);
}
return 0;
}
double cpuSecond()
{
struct timeval2 tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);
}
/*
********** END: System time measurement module
******************************************************************************
*/
/*
********** BEGIN: System time and date module
******************************************************************************
*/
class currentDate
{
public:
currentDate(std::string argFormat) : dateFormat(argFormat) {}
friend std::ostream& operator << (std::ostream &, currentDate const &);
private:
std::string dateFormat;
};
std::ostream& operator << (std::ostream &outputStream,
currentDate const &dateObject)
{
std::ostream::sentry initSuccess(outputStream);
if (initSuccess) {
std::time_t t = std::time(0);
std::tm const *timeStruct = std::localtime(&t);
std::ostreambuf_iterator<char> output(outputStream);
std::use_facet<std::time_put<char>>(outputStream.getloc())
.put(output, outputStream, outputStream.fill(),
timeStruct, &dateObject.dateFormat[0],
&dateObject.dateFormat[0] + dateObject.dateFormat.size());
}
outputStream.width(0);
return outputStream;
}
/*
********** END: System time and date module
******************************************************************************
*/
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code: %d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
} \
template<typename iox1>
__device__ void evalDoubleDigest(iox1 *resultingDigest,
const iox1 *enzA, const iox1 *enzB,
const int *cfgA, const int *cfgB,
const int sizeA, const int sizeB)
{
int sizeDigest = sizeA + sizeB - 1;
for (int i = 0; i < sizeA; ++i) {
resultingDigest[i] = 0;
for (int j = 0; j <= i; ++j)
resultingDigest[i] += enzA[cfgA[j]];
}
for (int i = sizeA; i < sizeDigest; ++i) {
resultingDigest[i] = 0;
for (int j = 0; j <= i - sizeA; ++j)
resultingDigest[i] += enzB[cfgB[j]];
}
for (int i = 0; i < sizeDigest; ++i)
{
int j = i;
while ((j > 0) && (resultingDigest[j] < resultingDigest[j - 1]))
{
iox1 swapper = resultingDigest[j];
resultingDigest[j] = resultingDigest[j - 1];
resultingDigest[j - 1] = swapper;
--j;
}
}
for (int i = sizeDigest - 1; i > 0; --i)
resultingDigest[i] -= resultingDigest[i - 1];
for (int i = 0; i < sizeDigest; ++i)
{
int j = i;
while ((j > 0) && (resultingDigest[j] > resultingDigest[j - 1]))
{
iox1 swapper = resultingDigest[j];
resultingDigest[j] = resultingDigest[j - 1];
resultingDigest[j - 1] = swapper;
--j;
}
}
}
__device__ int getGlobalThreadIdx()
{
int blockId = blockIdx.x
+ gridDim.x * blockIdx.y
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
__device__ int generateLehmerCode(const int *vCfg, int iSize)
{
int *iMask = new int[iSize];
for (int i = 0; i < iSize; ++i)
iMask[i] = 1;
int *lehmerVector = new int[iSize];
int lehmerValue(0), iFactorial(1);
for (int i = 0; i < iSize; ++i)
{
iMask[vCfg[i]] = 0;
lehmerVector[i] = 0;
for (int j = 0; j<=vCfg[i]; ++j) lehmerVector[i] += iMask[j];
}
for (int i = 1; i<=iSize; ++i) {
iFactorial *= i + 1;
lehmerValue += lehmerVector[iSize - i] * iFactorial;
}
delete[] iMask;
delete[] lehmerVector;
return lehmerValue;
}
__device__ void permuteFromLehmerCode(int *vPermutation,
const int iSize, const int iLehmerValue)
{
int iFactorial(1), iRestValue(iLehmerValue);
for (int i = 1; i <= iSize; ++i)
iFactorial *= i;
int *lehmerVector = new int[iSize];
for (int i = 0; i < iSize; ++i)
{
lehmerVector[i] = iRestValue / iFactorial;
iRestValue -= lehmerVector[i] * iFactorial;
iFactorial /= iSize - i;
}
int *iMask = new int[iSize];
for (int i = 0; i < iSize; ++i)
iMask[i] = 1;
for (int i = 0; i < iSize; ++i)
{
int cumIElements = 0;
int cumSumIMask = 0;
while ((cumSumIMask < lehmerVector[i]+1) &&
(cumIElements<iSize))
cumSumIMask += iMask[cumIElements++];
--cumIElements;
vPermutation[i] = cumIElements;
iMask[cumIElements] = 0;
}
delete[] iMask;
delete[] lehmerVector;
}
void permuteFromLehmerCodeOnHost(int *vPermutation,
const int iSize, const int iLehmerValue)
{
int iFactorial(1), iRestValue(iLehmerValue);
for (int i = 1; i <= iSize; ++i)
iFactorial *= i;
int *lehmerVector = new int[iSize];
for (int i = 0; i < iSize; ++i)
{
lehmerVector[i] = iRestValue / iFactorial;
iRestValue -= lehmerVector[i] * iFactorial;
iFactorial /= iSize - i;
}
int *iMask = new int[iSize];
for (int i = 0; i < iSize; ++i)
iMask[i] = 1;
for (int i = 0; i < iSize; ++i)
{
int cumIElements = 0;
int cumSumIMask = 0;
while ((cumSumIMask < lehmerVector[i] + 1) &&
(cumIElements<iSize))
cumSumIMask += iMask[cumIElements++];
--cumIElements;
vPermutation[i] = cumIElements;
iMask[cumIElements] = 0;
}
delete[] iMask;
delete[] lehmerVector;
}
/*
Evaluates the global status from the statuses of the individual kernels
by method of reduction.
*/
__global__ void initializeKernel(scKernelStatus *currentKernelStatus,
scEnzConfig *inpCfg, simConfig *inpSimCfg)
{
int swapIdx(0), swapper(0);
curandStateMRG32k3a swpElemState;
int threadId = getGlobalThreadIdx();
curand_init(inpSimCfg->i64Seed, threadId, 0, &swpElemState);
int sizeA(inpCfg->iSizeA), sizeB(inpCfg->iSizeB);
int *cfgA = new int;
int *cfgB = new int;
for (int i = 0; i < sizeA; ++i) cfgA[i] = i;
for (int i = 0; i < sizeB; ++i) cfgB[i] = i;
for (int i = 0; i < sizeA-1; ++i)
{
swapIdx = curand(&swpElemState) % (sizeA - 1 - i);
swapper = cfgA[i];
cfgA[i] = cfgA[i + swapIdx];
cfgA[i + swapIdx] = swapper;
}
for (int i = 0; i < sizeB-1; ++i)
{
swapIdx = curand(&swpElemState) % (sizeB - 1 - i);
swapper = cfgB[i];
cfgB[i] = cfgB[i + swapIdx];
cfgB[i + swapIdx] = swapper;
}
currentKernelStatus[threadId].currentOptCfgA
= generateLehmerCode(cfgA, sizeA);
currentKernelStatus[threadId].currentOptCfgB
= generateLehmerCode(cfgB, sizeB);
currentKernelStatus[threadId].numIterations = 0;
currentKernelStatus[threadId].intCache = 0;
currentKernelStatus[threadId].int64Cache = 0;
currentKernelStatus[threadId].floatCache = 0;
currentKernelStatus[threadId].currentOptH = 1.0E6;
currentKernelStatus[threadId].threadIdx = threadId;
currentKernelStatus[threadId].isAlive = true;
currentKernelStatus[threadId].boolCache = true;
delete[] cfgA, cfgB;
}
__global__ void annealProcess(scEnzConfig *inpCfg, simConfig *inpSimCfg,
scKernelStatus *currentKernelStatus,
scGlobalStatus *currentGlobalStatus)
{
int threadId = getGlobalThreadIdx();
if (currentKernelStatus[threadId].isAlive) {
int64 compuTimer(currentKernelStatus[threadId].numIterations);
int64 maxCalcs(inpSimCfg->i64Interval);
float coefAlpha(inpSimCfg->fAlpha), coefBeta(inpSimCfg->fBeta);
float machineEpsilon(1.0E-5), currentEnergy(0);
float optimalEnergy(currentKernelStatus[threadId].currentOptH);
curandStateMRG32k3a swpElemState;
curandState perturbState;
curand_init(inpSimCfg->i64Seed, threadId, 0, &swpElemState);
curand_init(inpSimCfg->i64Seed, threadId + 1, 0, &perturbState);
int swapIdx1(0), swapIdx2(0), swapper;
int sizeA(inpCfg->iSizeA), sizeB(inpCfg->iSizeB);
int sizeDigest(inpCfg->iSizeDigest);
float *enzA = new float;
float *enzB = new float;
int *cfgA = new int;
int *cfgB = new int;
int *optCfgA = new int;
int *optCfgB = new int;
float *realDigest = new float;
float *resultingDigest = new float;
if (threadId == 1)
printf("annealProcess InitData: Tid %d, CfgA %d, Iter %d.\n", threadId,
currentKernelStatus[threadId].currentOptCfgA,
currentKernelStatus[threadId].numIterations);
/* We transfer the data in the input arguments into local variable */
for (int i = 0; i < sizeA; ++i) {
enzA[i] = inpCfg->vEnzA[i];
}
permuteFromLehmerCode(cfgA, sizeA,
currentKernelStatus[threadId].currentOptCfgA);
for (int i = 0; i < sizeB; ++i)
enzB[i] = inpCfg->vEnzB[i];
permuteFromLehmerCode(cfgB, sizeB,
currentKernelStatus[threadId].currentOptCfgB);
for (int i = 0; i < sizeDigest; ++i)
realDigest[i] = inpCfg->vDigest[i];
/*
Begin: Main computational loop
***************************************************************************
*/
//while (/*(optimalEnergy > machineEpsilon) && */(((compuTimer+1) % (maxCalcs+1)) != 0 ))
//{
compuTimer += 1; //This may need to be changed in case int2float isn't working
__syncthreads();
//printf("Iter:(%d)%d(%d). ", threadId, compuTimer, cfgA);
coefBeta = coefAlpha*exp(coefAlpha*(float)compuTimer);
if (compuTimer & 1)
{
swapIdx1 = curand(&swpElemState) % sizeA;
swapIdx2 = curand(&swpElemState) % sizeA;
swapper = cfgA[swapIdx1];
cfgA[swapIdx1] = cfgA[swapIdx2];
cfgA[swapIdx2] = swapper;
}
else
{
if (threadId == 1)
swapIdx1 = curand(&swpElemState) % sizeB;
swapIdx2 = curand(&swpElemState) % sizeB;
swapper = cfgB[swapIdx1];
cfgB[swapIdx1] = cfgB[swapIdx2];
cfgB[swapIdx2] = swapper;
}
evalDoubleDigest(resultingDigest, enzA, enzB,
cfgA, cfgB, sizeA, sizeB);
currentEnergy = 0;
for (int i = 0; i < sizeDigest; ++i)
currentEnergy += (realDigest[i] - resultingDigest[i])*
(realDigest[i] - resultingDigest[i])
/ realDigest[i];
if ((currentEnergy < optimalEnergy) || curand_uniform(&perturbState) <
exp(-coefBeta*(currentEnergy - optimalEnergy)))
{
optimalEnergy = currentEnergy;
for (int i = 0; i < sizeA; ++i)
optCfgA[i] = cfgA[i];
for (int i = 0; i < sizeB; ++i)
optCfgB[i] = cfgB[i];
}
//}
__syncthreads();
/*
End: Main computational loop
***************************************************************************
*/
/* Update the kernel status */
currentKernelStatus[threadId].currentOptH = optimalEnergy;
currentKernelStatus[threadId].currentOptCfgA
= generateLehmerCode(optCfgA, sizeA);
currentKernelStatus[threadId].currentOptCfgB
= generateLehmerCode(optCfgB, sizeB);
currentKernelStatus[threadId].numIterations += compuTimer;
currentKernelStatus[threadId].int64Cache += compuTimer;
currentKernelStatus[threadId].floatCache = optimalEnergy;
__syncthreads();
if (threadId == 1)
printf("annealProcess newData: Tid %d, CfgA %d , Iter %d.\n", threadId,
currentKernelStatus[threadId].currentOptCfgA,
currentKernelStatus[threadId].numIterations);
/* If the while loop terminated due to found zero solution */
if (!(optimalEnergy > machineEpsilon))
{
bool uniqueSolution = true;
for (int i = 0; i < currentGlobalStatus->numZeroes; ++i)
if (currentGlobalStatus->zeroConfA[i]
== currentKernelStatus[threadId].currentOptCfgA)
uniqueSolution = false;
if (uniqueSolution)
{
++currentGlobalStatus->numZeroes;
currentGlobalStatus->zeroConfA[currentGlobalStatus->numZeroes]
= currentKernelStatus[threadId].currentOptCfgA;
currentGlobalStatus->zeroConfB[currentGlobalStatus->numZeroes]
= currentKernelStatus[threadId].currentOptCfgB;
}
currentKernelStatus[threadId].isAlive = false;
/* This thread no longer has an optimal energy */
currentKernelStatus[threadId].currentOptH = 1.0E6;
}
delete[] enzA, enzB;
delete[] cfgA, cfgB;
delete[] optCfgA, optCfgB;
delete[] realDigest;
delete[] resultingDigest;
}
}
/*
Evaluates the global status from the statuses of the individual kernels
by method of reduction.
*/
__global__ void updateGlobalStatus(scKernelStatus *currentKernelStatus,
scGlobalStatus *currentGlobalStatus)
{
uint64 tid = blockDim.x * blockDim.y *threadIdx.z +
blockDim.x * threadIdx.y +
threadIdx.x;
uint64 idx = getGlobalThreadIdx();
uint64 bidx = blockIdx.z * gridDim.x * gridDim.y +
blockIdx.y * gridDim.x +
blockIdx.x;
uint64 bdim = blockDim.x*blockDim.y*blockDim.z;
uint64 n = bdim*gridDim.x*gridDim.y*gridDim.z;
currentKernelStatus[idx].boolCache = !currentKernelStatus[idx].isAlive;
__syncthreads();
/* Convert global data pointer to a local pointer for each block,
so each pointer roams between 0 and blockDim.x-1 for each block.
*/
scKernelStatus *idata = currentKernelStatus + bidx*bdim;
/* Some troubleshooting code */
if (idx == 1)
printf("updateGlobalStatus - Data from annealK: Tid %d, CfgA %d , Iter %d.\n", idx,
currentKernelStatus[idx].currentOptCfgA,
currentKernelStatus[idx].numIterations);
for (int64 stride = bdim / 2; stride > 0; stride >>= 1)
{
if (tid < stride)
{
idata[tid].int64Cache += idata[tid + stride].int64Cache;
idata[tid].boolCache &= !idata[tid + stride].isAlive;
if (idata[tid].floatCache < idata[tid + stride].floatCache)
idata[tid].intCache = idata[tid].threadIdx;
else
{
idata[tid].floatCache = idata[tid + stride].floatCache;
idata[tid].intCache = idata[tid + stride].threadIdx;
}
}
__syncthreads();
if (idx == currentGlobalStatus->activeThreadIdx)
{
/* Final evaluation of optimal energy */
currentGlobalStatus->currentEnergy = idata[tid].floatCache;
currentGlobalStatus->currentIdx = idata[tid].intCache;
for (uint64 i = 0; i < n; i += bdim)
if (currentGlobalStatus->currentEnergy
< currentKernelStatus[i].floatCache)
{
currentGlobalStatus->currentEnergy =
currentKernelStatus[i].floatCache;
currentGlobalStatus->currentIdx =
currentKernelStatus[i].intCache;
}
currentGlobalStatus->currentGlobalOptCfgA =
currentKernelStatus[currentGlobalStatus->currentIdx].currentOptCfgA;
currentGlobalStatus->currentGlobalOptCfgB =
currentKernelStatus[currentGlobalStatus->currentIdx].currentOptCfgB;
/* Final evaluation of total number of iterations */
float currentEnergy(0);
currentGlobalStatus->allThreadsDead = true;
for (int i = 0; i < n; i += bdim)
{
currentEnergy += currentKernelStatus[i].floatCache;
currentGlobalStatus->allThreadsDead &= currentKernelStatus[i].boolCache;
}
currentGlobalStatus->currentEnergy = currentEnergy;
}
}
}
int main(int argc, char** argv)
{
colors_t colors;
LARGE_INTEGER startTime, lapTime, countFreq;
QueryPerformanceFrequency((LARGE_INTEGER *)&countFreq);
std::string outputFile("foundSolutions.log");
/* Parsing input arguments from the command line */
if (argc > 1) {
for (int i = 0; i < argc; ++i) {
if (!strcmp(argv[i], "-o") ||
!strcmp(argv[i], "--output-file"))
outputFile = argv[++i];
/* Should verify that the argument is a valid file name */
/* Support for more input arguments to come ... */
}
}
/* Open and lock into the output file */
std::ofstream outputFS(outputFile, std::ios::app);
if (!outputFS)
{
colors.setFg(hYellow); std::cerr << outputFile;
colors.setFg(hRed); std::cerr << " could not be opened!";
colors.setFg(hGray); std::cerr << std::endl;
exit(1);
}
outputFS << currentDate("%c") << "\n\n";
outputFS << "Starting iterations ... \n" << std::endl;
/* Set up device */
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
CHECK(cudaSetDevice(dev));
std::cout << "Device "; colors.setFg(dCyan); std::cout << dev;
colors.setFg(hGray); std::cout << ": "; colors.setFg(dYellow);
std::cout << deviceProp.name << std::endl; colors.setFg(hGray);
colors.setFg(hGray); std::cout << "Number of threads ";
colors.setFg(White); std::cout << ": ";
colors.setFg(hYellow); std::cout << NUM_THREADS;
colors.setFg(hGray); std::cout << std::endl;
colors.setFg(dGray); std::cout << "Grid configuration ";
colors.setFg(White); std::cout << ": (";
colors.setFg(dMagenta); std::cout << GDIMX;
colors.setFg(White); std::cout << ", ";
colors.setFg(hMagenta); std::cout << GDIMY;
colors.setFg(White); std::cout << ", ";
colors.setFg(dMagenta); std::cout << GDIMZ;
colors.setFg(White); std::cout << ")";
colors.setFg(hGray); std::cout << std::endl;
colors.setFg(hGray); std::cout << "Block configuration ";
colors.setFg(White); std::cout << ": (";
colors.setFg(dMagenta); std::cout << BDIMX;
colors.setFg(White); std::cout << ", ";
colors.setFg(hMagenta); std::cout << BDIMY;
colors.setFg(White); std::cout << ", ";
colors.setFg(dMagenta); std::cout << BDIMZ;
colors.setFg(White); std::cout << ")";
colors.setFg(hGray); std::cout << std::endl;
/* Input parameters */
/* Setup of computational configuration */
simConfig *simConfig1 = new simConfig;
simConfig1->fAlpha = static_cast<float>(1E-4);
simConfig1->fBeta = static_cast<float>(1E-6);
simConfig1->i64Interval = 10 ^ 6; //Change this
simConfig1->i64Seed = static_cast<int64>(cpuSecond());
/* Transfer the config to managed memory */
simConfig *dSimConfig1;
CHECK(cudaMallocManaged((simConfig **)&dSimConfig1, sizeof(simConfig)));
CHECK(cudaMemcpy(dSimConfig1, simConfig1,
sizeof(simConfig), cudaMemcpyHostToDevice));
/* Setup of enzyme parameters */
const int enzymeA1Size(6), enzymeB1Size(6), digestedProtein1Size(11);
const int enzymeA2Size(10), enzymeB2Size(11), digestedProtein2Size(20);
const int A1[enzymeA1Size] = { 8479, 4868, 3696, 2646, 169, 142 };
const int B1[enzymeB1Size] = { 11968, 5026, 1081, 1050, 691, 184 };
const int C1[digestedProtein1Size] = { 8479, 4167, 2646, 1081, 881,
859, 701, 691, 184, 169, 142 };
const int A2[enzymeA2Size] = { 9979, 9348, 8022, 4020, 2693,
1892, 1714, 1371, 510, 451 };
const int B2[enzymeB2Size] = { 9492, 8453, 7749, 7365, 2292,
2180, 1023, 959, 278, 124, 85 };
const int C2[digestedProtein2Size] = { 7042, 5608, 5464, 4371,
3884, 3121, 1901, 1768, 1590, 959, 899, 707, 702, 510, 451,
412, 278, 124, 124, 85 };
scEnzConfig *enzConfig1 = new scEnzConfig;
scEnzConfig *enzConfig2 = new scEnzConfig;
enzConfig1->iSizeA = enzymeA1Size;
enzConfig1->iSizeB = enzymeB1Size;
enzConfig1->iSizeDigest = digestedProtein1Size;
enzConfig1->vEnzA = new float[enzymeA1Size];
enzConfig1->vEnzB = new float[enzymeB1Size];
enzConfig1->vDigest = new float[digestedProtein1Size];
for (int i = 0; i < enzymeA1Size; ++i)
enzConfig1->vEnzA[i] = static_cast<float>(A1[i]);
for (int i = 0; i < enzymeB1Size; ++i)
enzConfig1->vEnzB[i] = static_cast<float>(B1[i]);
for (int i = 0; i < digestedProtein1Size; ++i)
enzConfig1->vDigest[i] = static_cast<float>(C1[i]);
enzConfig2->iSizeA = enzymeA2Size;
enzConfig2->iSizeB = enzymeB2Size;
enzConfig2->iSizeDigest = digestedProtein2Size;
enzConfig2->vEnzA = new float[enzymeA2Size];
enzConfig2->vEnzB = new float[enzymeB2Size];
enzConfig2->vDigest = new float[digestedProtein2Size];
for (int i = 0; i < enzymeA2Size; ++i)
enzConfig2->vEnzA[i] = static_cast<float>(A2[i]);
for (int i = 0; i < enzymeB2Size; ++i)
enzConfig2->vEnzB[i] = static_cast<float>(B2[i]);
for (int i = 0; i < digestedProtein2Size; ++i)
enzConfig2->vDigest[i] = static_cast<float>(C2[i]);
scEnzConfig *dEnzConfig1, *dEnzConfig2;
float *vEnzA1, *vEnzB1, *vDigest1;
float *vEnzA2, *vEnzB2, *vDigest2;
/* Allocate main struct of enzyme pair 1*/
CHECK(cudaMalloc((scEnzConfig **)&dEnzConfig1, sizeof(scEnzConfig)));
CHECK(cudaMemcpy(dEnzConfig1, enzConfig1, sizeof(scEnzConfig), cudaMemcpyHostToDevice));
/* Allocate and transfer data for array with enzyme A1 */
CHECK(cudaMalloc(&vEnzA1, enzymeA1Size*sizeof(float)));
CHECK(cudaMemcpy(&dEnzConfig1->vEnzA, &vEnzA1, sizeof(float *), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(vEnzA1, enzConfig1->vEnzA, enzymeA1Size*sizeof(float), cudaMemcpyHostToDevice));
/* Allocate and transfer data for array with enzyme B1 */
CHECK(cudaMalloc(&vEnzB1, enzymeB1Size*sizeof(float)));
CHECK(cudaMemcpy(&dEnzConfig1->vEnzB, &vEnzB1, sizeof(float *), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(vEnzB1, enzConfig1->vEnzB, enzymeB1Size*sizeof(float), cudaMemcpyHostToDevice));
/* Allocate and transfer data for array with double digest 1 */
CHECK(cudaMalloc(&vDigest1, digestedProtein1Size*sizeof(float)));
CHECK(cudaMemcpy(&dEnzConfig1->vDigest, &vDigest1, sizeof(float *), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(vDigest1, enzConfig1->vDigest, digestedProtein1Size*sizeof(float), cudaMemcpyHostToDevice));
/* Allocate main struct of enzyme pair 2 */
CHECK(cudaMalloc((scEnzConfig **)&dEnzConfig2, sizeof(scEnzConfig)));
CHECK(cudaMemcpy(dEnzConfig2, enzConfig2, sizeof(scEnzConfig), cudaMemcpyHostToDevice));
/* Allocate and transfer data for array with enzyme A2 */
CHECK(cudaMalloc(&vEnzA2, enzymeA2Size*sizeof(float)));
CHECK(cudaMemcpy(&dEnzConfig2->vEnzA, &vEnzA2, sizeof(float *), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(vEnzA2, enzConfig2->vEnzA, enzymeA2Size*sizeof(float), cudaMemcpyHostToDevice));
/* Allocate and transfer data for array with enzyme B2 */
CHECK(cudaMalloc(&vEnzB2, enzymeB2Size*sizeof(float)));
CHECK(cudaMemcpy(&dEnzConfig2->vEnzB, &vEnzB2, sizeof(float *), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(vEnzB2, enzConfig2->vEnzB, enzymeB2Size*sizeof(float), cudaMemcpyHostToDevice));
/* Allocate and transfer data for array with double digest 2 */
CHECK(cudaMalloc(&vDigest2, digestedProtein2Size*sizeof(float)));
CHECK(cudaMemcpy(&dEnzConfig2->vDigest, &vDigest2, sizeof(float *), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(vDigest2, enzConfig2->vDigest, digestedProtein2Size*sizeof(float), cudaMemcpyHostToDevice));
/* Setting up global status parameters in managed memory */
scGlobalStatus *globalStatus1;
CHECK(cudaMallocManaged((scGlobalStatus **)&globalStatus1,
sizeof(scGlobalStatus)));
globalStatus1->numZeroes = 0;
globalStatus1->activeThreadIdx = 0;
globalStatus1->stopSignal = false;
globalStatus1->allThreadsDead = false;
globalStatus1->globalNumIterations = 0;
/* Set up grid and block for execution */
dim3 grid(GDIMX, GDIMY, GDIMZ);
dim3 block(BDIMX, BDIMY, BDIMZ);
/* Setup of kernel status */
scKernelStatus *kernelStatus1;
int kernelStatusSpace = NUM_THREADS*sizeof(scKernelStatus);
/* No need for it to be globally accessible */
CHECK(cudaMalloc((scKernelStatus **)&kernelStatus1,
kernelStatusSpace));
colors.setFg(dRed); std::cout << std::string(15, '-');
colors.setFg(hRed); std::cout << " the trouble is below this line ";
colors.setFg(dRed); std::cout << std::string(15, '-');
colors.setFg(hGray); std::cout << "\n" << std::endl;
/* Initialize kernel space */
initializeKernel <<<grid, block>>> (kernelStatus1, dEnzConfig1, dSimConfig1);
CHECK(cudaDeviceSynchronize());
bool noIntervention(true);
int numSol(0);
int loopLimiter(1);
COORD cursorPositioner(colors.getCurPos());
while (noIntervention)
{
QueryPerformanceCounter((LARGE_INTEGER *)&startTime);
/* Perform a number of iterations */
annealProcess <<<grid, block>>> (dEnzConfig1, dSimConfig1,
kernelStatus1, globalStatus1);
CHECK(cudaDeviceSynchronize());
QueryPerformanceCounter((LARGE_INTEGER *)&lapTime);
/* Update global status */
updateGlobalStatus <<<grid, block>>> (kernelStatus1, globalStatus1);
CHECK(cudaDeviceSynchronize());
colors.setFg(dRed); std::cout << "\n" << std::string(15, '-');
colors.setFg(hRed); std::cout << " the trouble is above this line ";
colors.setFg(dRed); std::cout << std::string(15, '-');
colors.setFg(hGray); std::cout << std::endl;
std::cout << "After globalstatusupdate, threads are now ";
std::cout << ((globalStatus1->allThreadsDead) ? "dead" : "online");
std::cout << "." << std::endl;
int64 numIter1 = globalStatus1->globalNumIterations;
if (globalStatus1->numZeroes > numSol)
{
/* Write newfound zero solutions to file */
for (int i = numSol; i < globalStatus1->numZeroes; ++i) {
int currSolA = globalStatus1->zeroConfA[i];
int currSolB = globalStatus1->zeroConfB[i];
int *zConfIdxA = new int[enzConfig1->iSizeA];
int *zConfIdxB = new int[enzConfig1->iSizeB];
permuteFromLehmerCodeOnHost(zConfIdxA,
enzConfig1->iSizeA, currSolA);
permuteFromLehmerCodeOnHost(zConfIdxB,
enzConfig1->iSizeB, currSolB);
outputFS << "Solution " << i << "\n";
outputFS << "Lehmer: " << "A = " << currSolA;
outputFS << "B = " << currSolB << ".\n";
outputFS << "Enzyme A: [";
for (int j = 0; j < enzConfig1->iSizeA; ++j)
outputFS << " " << enzConfig1->vEnzA[zConfIdxA[j]];
outputFS << "].\n";
outputFS << "Enzyme B: [";
for (int j = 0; j < enzConfig1->iSizeB; ++j)
outputFS << " " << enzConfig1->vEnzB[zConfIdxB[j]];
outputFS << "].\n" << std::endl;
delete[] zConfIdxA;
delete[] zConfIdxB;
}
}
/* Report current progress to stdout */
//colors.setCurPos(cursorPositioner.X, cursorPositioner.Y);
/*Total number of iterations */
colors.setFg(dGray); std::cout << "Number of iterations ";
colors.setFg(White); std::cout << ": ";
colors.setFg(hCyan);
std::cout << globalStatus1->globalNumIterations;
colors.setFg(hGray); std::cout << std::endl;
/* Number of iterations per second */
colors.setFg(hGray); std::cout << "Computational speed ";
colors.setFg(White); std::cout << ": ";
colors.setFg(hCyan);
std::cout << (globalStatus1->globalNumIterations - numIter1) /
((lapTime.QuadPart - startTime.QuadPart) /
static_cast<double>(countFreq.QuadPart));
colors.setFg(White); std::cout << " iterations/second ";
colors.setFg(hGray); std::cout << std::endl;
/* Found true zero solutions */
colors.setFg(hGray); std::cout << "Found zeroes ";
colors.setFg(White); std::cout << ": ";
colors.setFg(hCyan); std::cout << globalStatus1->numZeroes;
colors.setFg(hGray); std::cout << std::endl;
/* Current lowest energy */
colors.setFg(dGray); std::cout << "Current optimal energy";
colors.setFg(White); std::cout << ": ";
colors.setFg(hCyan); std::cout << globalStatus1->currentEnergy;
colors.setFg(hGray); std::cout << std::endl;
/* Current configuration with lowest energy */
int currentEnzA(globalStatus1->currentGlobalOptCfgA);
int currentEnzB(globalStatus1->currentGlobalOptCfgB);
int *currIdxA = new int[enzConfig1->iSizeA];
int *currIdxB = new int[enzConfig1->iSizeB];
permuteFromLehmerCodeOnHost(currIdxA,
enzConfig1->iSizeA, currentEnzA);
permuteFromLehmerCodeOnHost(currIdxB,
enzConfig1->iSizeB, currentEnzB);
/* : Enzyme A */
colors.setFg(hGray); std::cout << "Enzyme A ";
colors.setFg(White); std::cout << ": ";
for (int j = 0; j < enzConfig1->iSizeA; ++j) {
if (colors.getFg() == dCyan) colors.setFg(hCyan);
else colors.setFg(dCyan);
std::cout << enzConfig1->vEnzA[currIdxA[j]] << " ";
}
colors.setFg(hGray); std::cout << "." << std::endl;
/* : Enzyme B */
colors.setFg(dGray); std::cout << "Enzyme B ";
colors.setFg(White); std::cout << ": ";
for (int j = 0; j < enzConfig1->iSizeB; ++j) {
if (colors.getFg() == dCyan) colors.setFg(hCyan);
else colors.setFg(dCyan);
std::cout << enzConfig1->vEnzB[currIdxB[j]] << " ";
}
colors.setFg(dGray); std::cout << "." << std::endl;
colors.setFg(hGray);
delete[] currIdxA;
delete[] currIdxB;
/* If all threads are dead, or user intervention detected;
terminate while loop. */
if (globalStatus1->allThreadsDead) {
noIntervention = false;
std::cout << "All threads are completed" << std::endl;
}
/* This line only works within VS*/
/*if (GetAsyncKeyState(0x51)) {
noIntervention = false;
std::cout << "User interruption detected." << std::endl;
}*/
if (loopLimiter > 0) {
noIntervention = false;
std::cout << "Break after " << loopLimiter << " loop cycles." << std::endl;
}
++loopLimiter;
}
outputFS.close();
CHECK(cudaFree(dEnzConfig1));
CHECK(cudaFree(vEnzA1));
CHECK(cudaFree(vEnzB1));
CHECK(cudaFree(vDigest1));
CHECK(cudaFree(dEnzConfig2));
CHECK(cudaFree(vEnzA2));
CHECK(cudaFree(vEnzB2));
CHECK(cudaFree(vDigest2));
CHECK(cudaFree(globalStatus1));
CHECK(cudaFree(dSimConfig1));
CHECK(cudaFree(kernelStatus1));
delete simConfig1;
delete enzConfig1;
delete enzConfig2;
}
Don’t worry about the compiler warnings, they are there because I have broken the while loop inside the main kernel. The execution sequence in main is the following:
(859) initializeKernel <<<grid, block>>> (kernelStatus1, dEnzConfig1, dSimConfig1);
(870) annealProcess <<<grid, block>>> (dEnzConfig1, dSimConfig1, kernelStatus1, globalStatus1);
(875) updateGlobalStatus <<<grid, block>>> (kernelStatus1, globalStatus1);
The struct here to watch out for is the kernelStatus1 (forwarded as currentKernelStatus inside the kernels) which is an AoS which is declared and allocated on lines 849-853 before it is initialized through the CUDA call on line 859. For clarity, only thread 1 outputs the printf statements. Inside the annealProcess kernel a section of the freshly initialized data is outputted on line 497. Then some modifications to data occur and only one iteration step is progressed. The aftermath of this is outputted on line 575. So far everything is fine.
But then when we move on to the updateGlobalStatus kernel, some weird things start to happen. The data, forwarded from annealKernel is outputted on 631. Thew three printf statements looks as follows:
printf("annealProcess InitData: Tid %d, CfgA %d, Iter %d.\n", threadId,
currentKernelStatus[threadId].currentOptCfgA,
currentKernelStatus[threadId].numIterations);
printf("annealProcess newData: Tid %d, CfgA %d , Iter %d.\n", threadId,
currentKernelStatus[threadId].currentOptCfgA,
currentKernelStatus[threadId].numIterations);
printf("updateGlobalStatus - Data from annealK: Tid %d, CfgA %d , Iter %d.\n", idx,
currentKernelStatus[idx].currentOptCfgA,
currentKernelStatus[idx].numIterations);
An example output is as follows:
annealProcess InitData: Tid 1, CfgA 1040, Iter 0.
annealProcess newData: Tid 1, CfgA 10470, Iter 1.
updateGlobalStatus - Data from annealK: Tid 1, CfgA 0, Iter 10470.
The top output is the output from the initialization. The middle output is after one iteration and a change to cfgA. The bottom output should be exactly the same as the middle but it’s not. Instead “Iter” somehow is CfgA and all CfgAs are 0.
I can’t see how that happened as no changes are made to these variables in the process…