While creating a test case for the problem described above, I stumbled upon another strange behaviour that is reproducible on Linux and Windows with CUDA Toolkit 2.3 and 3.0. Every block uses atomicAdd to increment a counter of the buckets that could not be hashed correctly, so given X input sequences that definitely cannot be hashed correctly, the counter should end up with the value X. But depending on the size of the cuckooTables, the counter varies drastically. How is this possible?
Here is the code of the minimal example (it is also included in the attachment):
[codebox]
// Headers used by this excerpt (the complete project is in the attached deviceProblem.zip).
#include <iostream>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cutil_inline.h>

using namespace std;

typedef unsigned int uint; // provided by the system/project headers in the original build

// Dynamically sized shared memory; its size is passed as the third kernel launch parameter.
// MaxCycleCount is a compile-time constant defined in the attached project.
extern __shared__ int sharedMemory[];

__global__ void reproduceError(int *input, size_t inputSize, int *failedFlag, int tableSize)
{
    /* __shared__ int cycleCounterAr[1];
    __shared__ int conflictsExistAr[1];
    int &cycleCounter = cycleCounterAr[0];
    int &conflictsExist = conflictsExistAr[0];*/
    // __shared__ int cuckooTables[3][TableSize];
    int &cycleCounter = sharedMemory[0];
    int &conflictsExist = sharedMemory[1];
    int *cuckooTables = &sharedMemory[2];

    // check for abortion
    //if (*failedFlag)
    //    return;

    // only the first thread is allowed to write the shared information.
    if (threadIdx.x == 0)
    {
        cycleCounter = 0;
        conflictsExist = 0;
    }
    // clear the table, otherwise stale information might still be in the memory.
    if (threadIdx.x < tableSize)
    {
        for (int i = 0; i < 3; i++)
            (&cuckooTables[i])[threadIdx.x] = 0;
    }
    // all threads wait until the first one has set the necessary data.
    __syncthreads();

    int maxNumThreads = blockDim.x;
    if (blockIdx.x == gridDim.x - 1) // last block isn't completely full
    {
        int tmp = inputSize % blockDim.x;
        if (tmp != 0)
            maxNumThreads = tmp;
    }
    if (threadIdx.x < maxNumThreads)
    {
        int element = input[blockIdx.x * blockDim.x + threadIdx.x];
        int lastTableIndex = 1;
        int currentTableIndex = 0;
        int hashedValue;
        while (cycleCounter <= MaxCycleCount)
        {
            // reset abortion flag.
            if (threadIdx.x == 0)
            {
                conflictsExist = 0;
                cycleCounter++;
            }
            // __syncthreads();

            // if we need to check a new table -> write into the hashed position
            // (this position may be written by many others as well).
            if (lastTableIndex != currentTableIndex)
            {
                hashedValue = element % tableSize;
                (&cuckooTables[currentTableIndex])[hashedValue] = element;
                lastTableIndex = currentTableIndex;
            }
            // sync threads so that we can check which thread won the write competition.
            __syncthreads();
            int successfullElement = (&cuckooTables[currentTableIndex])[hashedValue];
            if (successfullElement != element)
            {
                // I lost, hence update the block flag and set the next table to use.
                conflictsExist++;
                currentTableIndex++;
                currentTableIndex %= 3;
            }
            __syncthreads();
            if (conflictsExist == 0)
            {
                break;
            }
        } // while (cycleCounter <= MaxCycleCount)
    } // if (threadIdx.x < maxNumThreads)
    __syncthreads();

    if (conflictsExist)
    {
        // set the abortion flag for the rest of the buckets (blocks)
        if (threadIdx.x == 0)
            atomicAdd(failedFlag, 1);
        return;
    }
}
void generateTestData(vector<int>& testData, uint bucketCount, int cuckooTableSize)
{
    for (uint i = 0; i < bucketCount; i++)
    {
        testData.push_back(0);
        for (uint j = 1; j < cuckooTableSize * 3 + 3; j++)
        {
            if (j % cuckooTableSize == 0)
                continue;
            testData.push_back(j);
        }
    }
}
int main(int argc, char** args)
{
    int *d_input = NULL;
    int *d_failedFlag = NULL;
    int device = -1;
    cudaDeviceProp prop;
    prop.major = 1;
    prop.minor = 2;
    cutilSafeCall( cudaChooseDevice(&device, &prop) );
    if (device == -1)
    {
        cerr << "No device with compute capability of at least 1.2 found!";
        exit(1);
    }
    cutilSafeCall( cudaGetDeviceProperties(&prop, device) );

    const int maxKernelRuns = 100;
    int bucketCount = 1;
    vector<int> testData;
    int *h_failedFlag = new int(0);

    cout << "Every kernel setup is tested " << maxKernelRuns << " times." << endl;
    cout << "Testing on card (" << device << "): " << prop.name << " with " << prop.multiProcessorCount << " multiprocessors." << endl;

    // test with 1 bucket, then 10*procCount, 20*procCount, etc.
    for (uint numBlocksPerProc = 0; numBlocksPerProc <= 10; numBlocksPerProc++)
    {
        if (numBlocksPerProc > 0)
        {
            bucketCount = 10 * numBlocksPerProc * prop.multiProcessorCount;
        }
        cout << " - Running with bucketCount = " << bucketCount << endl;

        // the number of threads per block should be divisible by the warp size = 32
        for (int cuckooTableSize = prop.warpSize;
             cuckooTableSize <= prop.maxThreadsPerBlock;
             cuckooTableSize += prop.warpSize)
        {
            testData.clear();
            generateTestData(testData, bucketCount, cuckooTableSize);
            const size_t inputSize = sizeof(int) * testData.size();
            cutilSafeCall( cudaMalloc(&d_input, inputSize) );
            cutilSafeCall( cudaMemcpy(d_input, &testData[0], inputSize, cudaMemcpyHostToDevice) );
            cutilSafeCall( cudaMalloc(&d_failedFlag, sizeof(int)) );

            uint maxElementsPerBucket = 3 * cuckooTableSize;
            size_t sharedMemorySize = sizeof(int) * maxElementsPerBucket + 2; // 2 = conflictsExist + cycleCounter
            long failedBucketCount = 0;
            for (int testCount = 0; testCount < maxKernelRuns; testCount++)
            {
                *h_failedFlag = 0;
                cutilSafeCall( cudaMemset(d_failedFlag, 0, sizeof(int)) );
                // cout << "[I] Starting kernel..." << endl;
                reproduceError<<<bucketCount, min(maxElementsPerBucket, (uint)prop.maxThreadsPerBlock), sharedMemorySize>>>
                    (d_input, testData.size(), d_failedFlag, cuckooTableSize);
                // cout << "[I] Kernel ended..." << endl;
                cutilSafeCall( cudaMemcpy(h_failedFlag, d_failedFlag, sizeof(int), cudaMemcpyDeviceToHost) );
                failedBucketCount += *h_failedFlag;
            }
            long divisionCheck = failedBucketCount / maxKernelRuns;
            failedBucketCount = ceil(failedBucketCount / (double)maxKernelRuns);
            cout << "\t[*] cuckooTableSize = " << cuckooTableSize
                 << ", ceiledAverageOfFailedBucketCount = " << failedBucketCount
                 << ", integerDivisionAverage = " << divisionCheck << endl;

            cutilSafeCall( cudaFree(d_input) );
            cutilSafeCall( cudaFree(d_failedFlag) );
        }
    }
    delete h_failedFlag;

    cout << "\nPress <Enter> to Quit..." << endl;
    getchar();
    return 0;
}
[/codebox]
This test case also reproduces the infinite-looping problem with the CUDA NEXUS Beta 1 toolkit (compile it in debug mode).
As you can see in the code, only one thread per block ever performs the atomicAdd, so the final counter should equal the number of buckets, since every bucket receives the same input and that input cannot possibly be distributed among the three cuckoo tables.
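As a point of reference, a stripped-down sanity check that keeps only the per-block atomicAdd (just a sketch, not part of the attached project) should always return exactly the number of launched blocks; that is the invariant the full kernel is expected to satisfy here as well:
[codebox]
// Minimal sketch (hypothetical helper, not from the attached project): every block
// increments the counter exactly once, so afterwards *failedFlag == gridDim.x.
__global__ void countBlocks(int *failedFlag)
{
    if (threadIdx.x == 0)
        atomicAdd(failedFlag, 1);
}

// Usage sketch:
//   countBlocks<<<bucketCount, threadsPerBlock>>>(d_failedFlag);
//   cudaMemcpy(h_failedFlag, d_failedFlag, sizeof(int), cudaMemcpyDeviceToHost);
//   // *h_failedFlag is expected to equal bucketCount on every run.
[/codebox]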
Here is a small example of the input. Given a cuckooTableSize of 5, the following input is created for every bucket:
0,1,2,3,4,6,7,8,9,11,12,13,14,16,17
The “hash” function simply computes input key % tableSize, so four elements compete for slot 1 (namely 1, 6, 11, 16) and another four for slot 2 (namely 2, 7, 12, 17), while there are only three tables. Hence the kernel can never finish successfully, and the failed counter should be incremented for every block.
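To make the collision structure explicit, here is a small host-side check (only an illustrative sketch, not part of the test case) that groups the bucket input for cuckooTableSize = 5 by hash slot:
[codebox]
#include <cstdio>
#include <vector>

int main()
{
    const int tableSize = 5;
    const int input[] = {0,1,2,3,4,6,7,8,9,11,12,13,14,16,17};

    // Group the bucket's input by hash slot (key % tableSize).
    std::vector<std::vector<int> > slots(tableSize);
    for (size_t i = 0; i < sizeof(input) / sizeof(input[0]); i++)
        slots[input[i] % tableSize].push_back(input[i]);

    // A slot with more than 3 keys cannot be resolved with only 3 cuckoo tables.
    for (int s = 0; s < tableSize; s++)
    {
        printf("slot %d: %u keys ->", s, (unsigned)slots[s].size());
        for (size_t k = 0; k < slots[s].size(); k++)
            printf(" %d", slots[s][k]);
        printf("%s\n", slots[s].size() > 3 ? " (unresolvable)" : "");
    }
    return 0;
}
[/codebox]
Slots 1 and 2 each end up with four keys, which is exactly the situation described above.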
Still, the counters returned are often wrong, especially for larger cuckoo table sizes (480, 512, and sometimes smaller ones as well). Note that the number of threads per block is at most 512, so there may be up to three times as many elements per bucket as there are threads. But this shouldn't change the behaviour of the kernel, right?
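To quantify that last remark (a host-side sketch with assumed device limits of warpSize = 32 and maxThreadsPerBlock = 512, as on compute capability 1.2/1.3 cards; not part of the test case): the launch clamps the block size to maxThreadsPerBlock, so from cuckooTableSize = 192 upwards a block has fewer threads than elements per bucket, which covers the 480 and 512 cases mentioned above.
[codebox]
#include <algorithm>
#include <cstdio>

int main()
{
    const int warpSize = 32;            // assumed (compute capability 1.2/1.3)
    const int maxThreadsPerBlock = 512; // assumed (compute capability 1.2/1.3)

    for (int tableSize = warpSize; tableSize <= maxThreadsPerBlock; tableSize += warpSize)
    {
        int elementsPerBucket = 3 * tableSize;                                  // as produced by generateTestData
        int threadsPerBlock = std::min(elementsPerBucket, maxThreadsPerBlock);  // as used in the kernel launch
        if (threadsPerBlock < elementsPerBucket)
            printf("tableSize %3d: %4d elements per bucket, but only %3d threads per block\n",
                   tableSize, elementsPerBucket, threadsPerBlock);
    }
    return 0;
}
[/codebox]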
This has been tested with:
Windows 7 (64-bit),
cl: Microsoft (R) 32-bit C/C++ Optimizing Compiler Version 15.00.30729.01 for 80x86
Ubuntu 9.10 (64-bit),
gcc 4.3 (exact version currently not available - I am under Windows right now)
Windows nvcc: Built on Fri_Oct_30_12:39:37_PDT_2009, Cuda compilation tools, release 3.0, V0.2.1221, as well as the one from the 2.3 toolkit
I guess the nvcc versions should be the same for Linux (I'll check that tomorrow).
If anyone has any idea how this can happen, I’ll be delighted to hear your hints ;)
P.S. In case you are testing the Linux version with the 2.3 toolkit, you won't be able to compile it with the makefiles generated by CMake (there is an nvcc bug that prevents the creation of dependency files when the “-G0” flag is also given). Just copy the compile command from the console, remove “-M”, and change the output file name to your liking. deviceProblem.zip (20.3 KB)