Hi,
I have been working on implementing a minimization technique for the least squares fitting of a circle on the GPU. I pass in an nx3 array along with a 3x1 vector and obtain a 3x1vector as the result. My implementation is right but I have two issues

I am using just one block onto which I load all data which limits my array size

Also the implementation is very slow.
I tried using blocks but the kernel does not allow me to pass in an array size more than 256x3 even if I use about 200 threads per block and increase the number of blocks in the grid. I’m not too sure how i can get around this. Here is the code which I know is very inefficient.
host code
void runTest(float *data, float *estimates, int n )
{
CUT_CHECK_DEVICE();
unsigned int hTimer;
cutCreateTimer(&hTimer);
cutStartTimer(hTimer);
unsigned int size_A = n*3;
unsigned int mem_size_A = sizeof(float)* size_A;
unsigned int size_B = 3;
unsigned int mem_size_B = sizeof(float)*size_B;
unsigned int size_updates = 3;
unsigned int mem_size_updates = sizeof(float)*size_updates;
// allocate device memory
float* d_A;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
float* d_B;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, mem_size_B));
float* d_updates;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_updates, mem_size_updates));
//allocate device memory to store results
//unsigned int mem_size_C = sizeof(float)* 3 * 3;
//float*d_C;
//CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, mem_size_C));
// allocate mem for the result on host side
float* h_D = (float*) malloc(mem_size_updates);
// setup execution parameters
dim3 grid(1,1,1);
dim3 threads(3,n,1);
double err0;
double eps = 0.000001;
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_A, data, mem_size_A,
cudaMemcpyHostToDevice) );
do{
CUDA_SAFE_CALL(cudaMemcpy(d_B, estimates, mem_size_B,
cudaMemcpyHostToDevice) );
// execute the kernel
kernel<<< grid, threads, mem_size_A*4>>>(d_A, d_B,d_updates, n);
// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(h_D, d_updates, mem_size_updates,
cudaMemcpyDeviceToHost));
estimates[0]+=h_D[0];
estimates[1]+=h_D[1];
estimates[2]+=h_D[2];
err0=(double)(fabs(h_D[0]) + fabs(h_D[1]) + fabs(h_D[2]));
}while(err0>eps);
// cleanup memory
free(h_D);
CUDA_SAFE_CALL(cudaFree(d_A));
CUDA_SAFE_CALL(cudaFree(d_B));
CUDA_SAFE_CALL(cudaFree(d_updates));
cutStopTimer(hTimer);
printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));
printf("\nD\n");
for(unsigned int i = 0; i<3; i++)
{
printf("%f \n", estimates[i]);
}
}
KERNEL
#define AS(i) data[ASindex+(i)]
#define BS(i) data[BSindex+(i)]
#define CS(i) data[CSindex+(i)]
#define DS(i) data[DSindex+(i)]
#define PS(i) data[PSindex+(i)]
#define RS(i) data[RSindex+(i)]
#define LS(i) data[LSindex+(i)]
#define YS(i) data[YSindex+(i)]
#define XS(i) data[XSindex+(i)]
///////////////////////////////////////////////////////////////////////////////
//! Kernel for minimization of parameters for least squares fit of a circle
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(float* A, float* B, float* D, unsigned int n)
{
extern __shared__ float data[];
unsigned int ASindex=0;
unsigned int BSindex=3*n;
unsigned int CSindex=0;
unsigned int DSindex=3*n+3;
unsigned int PSindex=4*n+3;
unsigned int RSindex=4*n+12;
unsigned int LSindex=4*n+15;
unsigned int YSindex=4*n+24;
unsigned int XSindex=4*n+27;
// access thread id
const unsigned int tid = threadIdx.x; // 03
// warp id
const unsigned int wid = threadIdx.y; // 0n
const unsigned int ltid = (wid * blockDim.x) + tid;
//loading data onto shared memory
AS(ltid)=A[ltid];
__syncthreads();
BS(0) = B[0];
BS(1) = B[1];
BS(2) = B[2];
float r=0.0;
//Populating the Jacobian
if((ltid%3)==0)
{
r=sqrt((AS(ltid)BS(0))*(AS(ltid)BS(0))+(AS(ltid+1)BS(1))*(AS(ltid+1)BS(1)));
CS(ltid) = (AS(ltid)BS(0))/r;
CS(ltid+1) = (AS(ltid+1)BS(1))/r;
CS(ltid+2) = (AS(ltid+2));
DS(wid)= (rBS(2));
}
__syncthreads();
float Csub = 0.0;
//multiplying the transpose with the Jacobian
for (int k = 0; k < n; ++k)
Csub += CS(k*blockDim.x + tid)* CS( blockDim.x * k + wid );
__syncthreads();
PS(ltid) = Csub;
//C[ltid]= PS(ltid);
float Dsub = 0.0;
//multiplying the Jacobian by the right hand side vector
for (int k = 0; k < n; ++k)
Dsub += CS(k*blockDim.x + tid)* DS(k);
__syncthreads();
RS(tid)=Dsub;
//D[tid] = RS(tid);
float sum;
//Solving the matrix using Cholesky decomposition
for(int i=0; i<blockDim.x;i++)
{
LS(tid*(blockDim.x+1))=sqrt(PS(tid*(blockDim.x+1)));
for(int j=0;j<blockDim.x;j++)
{
sum=0.0;
for(int k=0; k <= i1; k++)
{
sum += LS(i*blockDim.x+k)*LS(j*blockDim.x+k);
}
LS(j*blockDim.x+i) = (PS(i*blockDim.x+j)sum)/LS((blockDim.x+1)*i);
}
}
__syncthreads();
//Forward Substitution
YS(0)= RS(0)/LS(0);
for (int i=1; i<blockDim.x; i++)
{
sum = 0.0;
for (int j=0; j<=i1; j++)
{
sum += LS(i*blockDim.x+j)*YS(j);
}
YS(i) = (RS(i)sum)/LS((blockDim.x+1)*i);
}
XS(blockDim.x1)=YS(blockDim.x1)/LS(blockDim.x*(blockDim.x1)+(blockDim.x1));
for (int i=blockDim.x2; i>=0; i)
{
sum = 0.0;
for (int j=i+1; j<blockDim.x; j++)
{
sum += LS(j*blockDim.x+i)*XS(j);
}
XS(i) = (YS(i)sum)/LS((blockDim.x+1)*(i));
}
__syncthreads();
D[tid] = XS(tid);
}
Thanks in advance for any help.
Shyam