Hi all,
I am trying to use the LU decomposition algorithm to solve a tridiagonal system on CUDA. I am using the serial LU decomposition method in the kernel, and one thread in each block solves its matrix serially in a loop.
I am hoping that running several blocks in parallel will speed up the performance.
But when I have a large number of blocks solving matrices of size 256, I get the following error after the kernel is called:
too many resources requested for launch
// Launch configuration: one thread per block, eight blocks in the grid.
int blockSize = 1, numBlocks = 8;
// Dynamic shared memory requested per block, in bytes.
// NOTE(review): cu_solve declares its dynamic shared array as float, so sizing
// this in float2 units requests twice the bytes per element -- confirm intended.
int sharedmem = sizeof(float2) * 256;
cu_solve<<<numBlocks, blockSize, sharedmem>>>(
    device_upperdiag, device_diag, device_lowerdiag,
    device_inputVector, device_outputVector);
And here is the kernel:
/*
 * Solves one tridiagonal system per block with a serial LU-style sweep
 * (forward elimination followed by back substitution).
 *
 * Block b works on elements [b*SIZE, b*SIZE + SIZE) of every array. Only the
 * .x component of each float2 is read or written.
 *
 * Launch requirements:
 *   - one block per system; a single thread per block does all the work
 *     (any additional threads return immediately);
 *   - dynamic shared memory of at least SIZE * sizeof(float) bytes per block.
 *
 * device_upperdiag    super-diagonal coefficients
 * device_diag         main-diagonal coefficients
 * device_lowerdiag    sub-diagonal coefficients
 * device_inputVector  right-hand side
 * device_outputVector solution (written)
 */
__global__ void cu_solve(float2 *device_upperdiag, float2 *device_diag, float2 *device_lowerdiag, float2 *device_inputVector, float2 *device_outputVector)
{
    // Per-block scratch for the elimination factors. It is indexed by the row's
    // position inside this block's system (0 .. SIZE-1), NOT by the global
    // element offset: shared memory is private to each block and only holds
    // the amount requested at launch. Slot 0 is unused.
    extern __shared__ float psave[];

    // The sweep is inherently serial; keep extra threads from racing on it.
    if (threadIdx.x != 0)
        return;

    // Global offset of the first element of this block's system.
    const int idx = blockIdx.x * SIZE;

    float bsave = device_diag[idx].x;
    device_outputVector[idx].x = device_inputVector[idx].x / bsave;

    // Forward elimination.
    for (int row = 1; row < SIZE; row++)
    {
        const int g = idx + row;  // global element index of this row
        psave[row] = device_upperdiag[g - 1].x / bsave;
        bsave = device_diag[g].x - psave[row] * device_lowerdiag[g - 1].x;
        device_outputVector[g].x = (device_inputVector[g].x - device_lowerdiag[g - 1].x * device_outputVector[g - 1].x) / bsave;
    }

    // Back substitution.
    for (int row = SIZE - 2; row >= 0; row--)
        device_outputVector[idx + row].x -= psave[row + 1] * device_outputVector[idx + row + 1].x;
}
Can anyone help me with this, please? :)