Hi, when I set up a GPU grid as follows to solve a system of linear equations:

// Launch configuration A: each block is 1 thread wide (x) and 512 threads tall (y).
dim3 dimBlock(1,512);

// Ceil-division in both dimensions so the grid covers all `dim` indices in x and y,
// even when `dim` is not a multiple of the block size.
dim3 dimGrid((dim+dimBlock.x-1)/dimBlock.x, (dim+dimBlock.y-1)/dimBlock.y);

// 1D launch: 128 blocks of 128 threads.
solveAk<<<128,128>>>(d_A,d_B,k,dim,d_returnValue);

// 2D launch using the grid/block computed above.
LUbuildKernel<<<dimGrid,dimBlock>>>(d_A,d_B,k,dim,d_returnValue);

I only get correct answers when N < 512 (N: the order of the system of linear equations).

If the GPU grid is instead set up as follows:

// Launch configuration B: each block is a 16 x 16 tile of threads (256 threads).
dim3 dimBlock(16,16);

// Ceil-division in both dimensions so the grid covers all `dim` indices in x and y,
// even when `dim` is not a multiple of the block size.
dim3 dimGrid((dim+dimBlock.x-1)/dimBlock.x, (dim+dimBlock.y-1)/dimBlock.y);

// 1D launch: 128 blocks of 128 threads.
solveAk<<<128,128>>>(d_A,d_B,k,dim,d_returnValue);

// 2D launch using the grid/block computed above.
LUbuildKernel<<<dimGrid,dimBlock>>>(d_A,d_B,k,dim,d_returnValue);

I only get correct answers when N < 16 (N: the order of the system of linear equations).

How can I design the GPU grid, or how should I modify the kernel functions, to go beyond 512 threads?

Thanks.

My kernel functions are as follows:

// Elimination step for pivot index k on a dim x dim row-major matrix A.
//
// Launch layout: 2D. The x dimension of the grid indexes the row i, the
// y dimension indexes the column j. Threads with i >= dim or j >= dim do no
// matrix work, so the grid may overshoot dim.
//
// For every row i != k and every column j it performs, in place,
//     A[i][j] = A[i][j] - A[i][k] * A[k][j]
// Row k itself is only read.
//
// Parameters:
//   A           - matrix, indexed as A[i*dim + j]
//   B           - not used by this kernel
//   k           - pivot row/column index
//   dim         - number of rows and columns of A
//   returnValue - returnValue[0] is set to 1 by every thread
//
// NOTE(review): the thread with j == k overwrites A[i][k], which is the
// multiplier every other thread of row i reads. Nothing orders that write
// against those reads once row i is spread over several warps/blocks, so the
// result can depend on the launch configuration. This looks like the cause of
// the "only correct for small N" symptom - confirm, e.g. by copying column k
// to a scratch buffer before the launch or by not updating column k here.
__global__ void LUbuildKernel(float *A, float *B, int k, int dim, int *returnValue)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // row index
    int j = blockIdx.y * blockDim.y + threadIdx.y;  // column index

    // Skip the pivot row and any thread outside the matrix.
    if (i != k && j < dim && i < dim)
    {
        A[i * dim + j] = A[i * dim + j] - A[i * dim + k] * A[k * dim + j];
    }

    // Executed by every launched thread, including out-of-range ones.
    returnValue[0] = 1;
}

// Pivot-row step for pivot index k on a dim x dim row-major matrix A and
// right-hand side B.
//
// Launch layout: 1D. Each thread starts at its global x index and advances by
// the total number of launched threads until it reaches dim, so any 1D
// grid/block size covers all indices.
//
// For every index i in [0, dim):
//   - if i > k:  A[k][i] = A[k][i] / A[k][k]   (scale the pivot row to the
//                                               right of the diagonal)
//   - if i != k: B[i]    = B[i] - A[i][k] * B[k]
// A[k][k] and B[k] are only read, never written, by this kernel.
//
// Parameters:
//   A           - matrix, indexed as A[row*dim + col]
//   B           - right-hand-side vector of length dim
//   k           - pivot row/column index
//   dim         - number of rows and columns of A
//   returnValue - returnValue[0] is set to 100000000 when |A[k][k]| < 1e-9;
//                 otherwise it is left untouched. The division by the pivot
//                 is still carried out in that case.
__global__ void solveAk(float *A, float *B, int k, int dim, int *returnValue)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;  // total threads in the launch

    // Flag a (near-)zero pivot. Float literal keeps the comparison in float.
    if (fabsf(A[k * dim + k]) < 1e-9f) returnValue[0] = 100000000;

    while (i < dim)
    {
        if (i > k)
        {
            A[k * dim + i] = A[k * dim + i] / A[k * dim + k];
        }

        if (i != k)
        {
            B[i] = B[i] - A[i * dim + k] * B[k];
        }

        i += stride;
    }
}