I've started writing a kernel compatibility checker and want to share it so the community can enhance it. Please add any checks that you know about or that have caused you problems, correct any mistakes in my implementation, and publish your improved code. Thanks.
/**
 * Checks whether a kernel launch configuration fits within the resource
 * limits reported for a device and for the kernel itself.
 *
 * The checks performed, in order, are: sane launch arguments, threads per
 * block (device limit and kernel limit), number of blocks (x dimension of
 * the grid), registers per block, registers per thread, constant memory,
 * and shared memory per block (static + dynamic).
 *
 * A diagnostic line is printed to stdout for the first check that fails.
 *
 * @param nblocks                   Number of blocks in the (1D) grid.
 * @param nthreadsPerBlock          Number of threads per block.
 * @param nsharedGlobalMemPerBlock  Dynamic shared memory requested per block, in bytes.
 * @param dev                       Device ordinal whose properties are queried.
 * @param pkernel                   Pointer to the __global__ function to be launched.
 * @return true if every check passes, false otherwise (including when a
 *         CUDA runtime query fails).
 *
 * NOTE(review): cudaFuncGetAttributes is not given a device ordinal; it
 * presumably reports attributes for the currently selected device, so the
 * caller should make `dev` current before calling this - confirm.
 * NOTE(review): only a 1D grid is checked; multi-dimensional launches would
 * need maxGridSize[1] / maxGridSize[2] and maxThreadsDim checks as well.
 */
bool
getKernelCompatibility( int nblocks,
                        int nthreadsPerBlock,
                        int nsharedGlobalMemPerBlock,
                        int dev,
                        void *pkernel)
{
    // Reject nonsensical arguments up front; a negative shared-memory size
    // would otherwise wrap around when added to a size_t below.
    if (nblocks <= 0 || nthreadsPerBlock <= 0 || nsharedGlobalMemPerBlock < 0)
    {
        printf("invalid launch configuration: nblocks=%d nthreadsPerBlock=%d sharedBytes=%d\n",
               nblocks, nthreadsPerBlock, nsharedGlobalMemPerBlock);
        return false;
    }

    cudaDeviceProp deviceProp;
    cudaError_t err = cudaGetDeviceProperties(&deviceProp, dev);
    if (err != cudaSuccess)
    {
        printf("cudaGetDeviceProperties(dev=%d) failed: %s\n",
               dev, cudaGetErrorString(err));
        return false;
    }

    struct cudaFuncAttributes funcAttrib;
    err = cudaFuncGetAttributes(&funcAttrib, pkernel);
    if (err != cudaSuccess)
    {
        printf("cudaFuncGetAttributes failed: %s\n", cudaGetErrorString(err));
        return false;
    }

    // Architectural per-thread register limit by compute capability.
    // Every capability gets a value: 1.x -> 128, 2.x and 3.0 -> 63,
    // everything newer (3.2, 3.5, 3.7, 5.x, ...) -> 255.
    int nregistersPerThread;
    if (deviceProp.major <= 1)
    {
        nregistersPerThread = 128;
    }
    else if (deviceProp.major == 2 ||
             (deviceProp.major == 3 && deviceProp.minor == 0))
    {
        nregistersPerThread = 63;
    }
    else
    {
        nregistersPerThread = 255;
    }

    // Threads per block: device-wide limit.
    if (nthreadsPerBlock > deviceProp.maxThreadsPerBlock)
    {
        printf("numThreads=%d > deviceThreads=%d\n",
               nthreadsPerBlock, deviceProp.maxThreadsPerBlock);
        return false;
    }

    // Threads per block: limit for this particular kernel (can be lower than
    // the device limit, e.g. because of its register usage).
    if (nthreadsPerBlock > funcAttrib.maxThreadsPerBlock)
    {
        printf("numThreads=%d > kernelThreads=%d\n",
               nthreadsPerBlock, funcAttrib.maxThreadsPerBlock);
        return false;
    }

    // Number of blocks along x.
    if (nblocks > deviceProp.maxGridSize[0])
    {
        printf("numBlocks=%d > deviceGridSizeX=%d\n",
               nblocks, deviceProp.maxGridSize[0]);
        return false;
    }

    // Registers needed by one block. No overflow: nthreadsPerBlock has
    // already been bounded by maxThreadsPerBlock above.
    const int regsPerBlockNeeded = funcAttrib.numRegs * nthreadsPerBlock;
    if (regsPerBlockNeeded > deviceProp.regsPerBlock)
    {
        printf("numRegs=%d > deviceRegs=%d\n",
               regsPerBlockNeeded, deviceProp.regsPerBlock);
        return false;
    }

    // Registers per thread against the architectural limit.
    if (funcAttrib.numRegs > nregistersPerThread)
    {
        printf("numRegs=%d > deviceRegs=%d\n",
               funcAttrib.numRegs, nregistersPerThread);
        return false;
    }

    // Constant memory is a per-kernel figure, not a per-thread one, so it is
    // compared directly with the device total (no scaling by grid size).
    if (funcAttrib.constSizeBytes > deviceProp.totalConstMem)
    {
        printf("numConst=%llu > deviceTotConst=%llu\n",
               (unsigned long long)funcAttrib.constSizeBytes,
               (unsigned long long)deviceProp.totalConstMem);
        return false;
    }

    // Shared memory per block: static usage of the kernel plus the dynamic
    // amount requested at launch.
    // NOTE(review): devices that allow opting in to more dynamic shared
    // memory than sharedMemPerBlock are not accounted for here.
    const size_t sharedNeeded =
        funcAttrib.sharedSizeBytes + (size_t)nsharedGlobalMemPerBlock;
    if (sharedNeeded > deviceProp.sharedMemPerBlock)
    {
        printf("memShared=%llu > deviceShared=%llu\n",
               (unsigned long long)sharedNeeded,
               (unsigned long long)deviceProp.sharedMemPerBlock);
        return false;
    }

    return true;
}