Ok I found something out… When connecting through ssh to the machine I get a launch time out at a specific kernel. Could you have a look at it?
void mrfutils_calcLabelFlow(const mrfutilsDevPtr flow, const mrfutilsDevPtr flowDims, const mrfutilsDevPtr seg, const mrfutilsDevPtr segDims, mrfutilsDevPtr labelFlow, const int& searchWindowSize, const int& labelsNum, const int& dif)
{
// int blockSize = 1;
int blockSize = ceil(sqrt(10000.0 / labelsNum / sizeof(float)));
blockSize = blockSize < 16 ? blockSize : 16;
dim3 dimGrid(searchWindowSize, searchWindowSize);
dim3 dimBlock(blockSize, blockSize);
printf("Executing 'mrfutils_calcLabelFlow' - gridSize: %d * %d, blockSize: %d * %d, sharedmem: %d\n", searchWindowSize, searchWindowSize, blockSize, blockSize, sizeof(float) * labelsNum * blockSize * blockSize);
// Execute Kernel
calcLabelFlow_kernel<<<dimGrid, dimBlock, sizeof(float) * labelsNum * blockSize * blockSize>>>((float*) flow, (int*) flowDims, (int*)seg, (int*) segDims, (float*)labelFlow, labelsNum, dif);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
}
The above code executes the kernel below
__global__ void calcLabelFlow_kernel(float* flow, int* flowDims, int *seg, int* segDims, float *labelFlow, const int labelsNum, const int dif)
{
int dx = blockIdx.x;
int dy = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int blockSize = blockDim.x;
// Thread index ~ position within SAD surfaces
// int tx = threadIdx.x;
// int ty = threadIdx.y;
extern __shared__ float sumPerLabel[];
for(int i = 0; i < labelsNum; i++)
{
sumPerLabel[i * blockSize * blockSize + tx * blockSize + ty] = 0;
}
// put parameters into registers
int regFlowDims[4] = {flowDims[0], flowDims[1], flowDims[2], flowDims[3]};
int regSegDims[2] = {segDims[0], segDims[1]};
int flowBlockSizeY = ceil((regFlowDims[2] - 2 * dif) / (float)blockSize);
int flowBlockSizeX = ceil((regFlowDims[3] - 2 * dif) / (float)blockSize);
int startY = dif + ty * flowBlockSizeY;
int endY = min(dif + (ty + 1) * flowBlockSizeY, regFlowDims[2] - dif);
int startX = dif + tx * flowBlockSizeX;
int endX = min(dif + (tx + 1) * flowBlockSizeX, regFlowDims[3] - dif);
// Calc flow depending on label
for(int y = startY; y < endY; y++)
{
for(int x = startX; x < endX; x++)
{
int label = MAT_ELEM2(seg, regSegDims, y, x);
sumPerLabel[label * blockSize * blockSize + tx * blockSize + ty] += MAT_ELEM4(flow, regFlowDims, dy, dx, y, x);
}
}
__syncthreads();
if(ty == 0)
{
for(int y = 1; y < blockSize; y++)
{
for(int i = 0; i < labelsNum; i++)
{
sumPerLabel[i * blockSize * blockSize + tx * blockSize] += sumPerLabel[i * blockSize * blockSize + tx * blockSize + y];
}
}
}
__syncthreads();
if(ty == 0 && tx ==0)
{
for(int x = 1; x < blockSize; x++)
{
for(int i = 0; i < labelsNum; i++)
{
sumPerLabel[i * blockSize * blockSize] += sumPerLabel[i * blockSize * blockSize + x * blockSize];
}
}
// write results
for(int i = 0; i < labelsNum; i++)
{
labelFlow[i * regFlowDims[0] * regFlowDims[1] + dx * regFlowDims[0] + dy] = sumPerLabel[i * blockSize * blockSize];
}
}
}
When the kernel time out appears I get this from my printf-statement:
Executing 'mrfutils_calcLabelFlow' - gridSize: 15 * 15, blockSize: 15 * 15, sharedmem: 10800
At compile time it tells me I use 17 registers. What could the reason for the timeout be?