Hi,

I am trying to implement a large chunk of computation involved in the least squares fit of a circle on the GPU. I had posted my code earlier as well and got a couple of suggestions which I implemented and got a significant performance gain. But there still seem to be some issues which cause my implementation to be much slower than the CPU. One of them I think is this

1)I am multiplying a matrix with its transpose ((3xn)*(nx3)). I have a block size of 50x3 and my threadIdx.x (tid) goes from 0-50 and threadIdx.y (wid) from 0-3. The result obviously is a 3x3 but the problem is that the index tid goes beyond 3 which would result in unnecessary computation.

Here is my code…

Host code

```
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
// includes, kernels
#include <fitcircle_kernel.cu>
//
////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//! @param data data to process on the device
//! @param estimates initial estimates of parameters
////////////////////////////////////////////////////////////////////////////////
extern "C"
void runTest(float *data, float *estimates, int n )
{
CUT_CHECK_DEVICE();
unsigned int hTimer;
cutCreateTimer(&hTimer);
unsigned int size_A = n*3;
unsigned int mem_size_A = sizeof(float)* size_A;
unsigned int size_B = 3;
unsigned int mem_size_B = sizeof(float)*size_B;
unsigned int size_updates = 3;
unsigned int mem_size_updates = sizeof(float)*size_updates;
// setup execution parameters
dim3 threads(50,3,1);
dim3 grid(n/threads.x,1,1);
// allocate device memory
float* d_A;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
float* d_B;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, mem_size_B));
float* d_updates;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_updates, mem_size_updates));
// allocate mem for the result on host side
float* h_D = (float*) malloc(mem_size_updates);
float err0;
float eps = 0.00001;
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_A, data, mem_size_A,
cudaMemcpyHostToDevice) );
cutResetTimer(hTimer);
cutStartTimer(hTimer);
do{
CUDA_SAFE_CALL(cudaMemcpy(d_B, estimates, mem_size_B,
cudaMemcpyHostToDevice) );
// execute the kernel
kernel<<< grid, threads>>>( d_A, d_B, d_updates, n);
// check if kernel execution generated an error
CUT_CHECK_ERROR("Kernel execution failed");
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(h_D, d_updates, mem_size_updates,
cudaMemcpyDeviceToHost));
estimates[0]+=h_D[0];
estimates[1]+=h_D[1];
estimates[2]+=h_D[2];
err0=(fabs(h_D[0]) + fabs(h_D[1]) + fabs(h_D[2]));
}while(err0>eps);
cutStopTimer(hTimer);
printf("Total GPU time: %f msecs.\n", cutGetTimerValue(hTimer));
printf("\n D\n");
for(unsigned int i = 0; i<3; i++)
{
printf("%f \n", estimates[i]);
}
// cleanup memory
free(h_D);
CUDA_SAFE_CALL(cudaFree(d_A));
CUDA_SAFE_CALL(cudaFree(d_B));
CUDA_SAFE_CALL(cudaFree(d_updates));
}
```

Kernel

```
#ifndef _FITCIRCLE_KERNEL_H_
#define _FITCIRCLE_KERNEL_H_
#include<stdio.h>
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i,j) CUT_BANK_CHECKER(((float*)&As[0][0]))
#define BS(i) CUT_BANK_CHECKER(((float*)&Bs[0]))
#define CS(i, j) CUT_BANK_CHECKER(((float*)&Cs[0][0]))
#define DS(i) CUT_BANK_CHECKER(((float*)&Ds[0]))
#define PS(i) CUT_BANK_CHECKER(((float*)&Ps[0]))
#define RS(i) CUT_BANK_CHECKER(((float*)&Rs[0]))
#define LS(i) CUT_BANK_CHECKER(((float*)&Ls[0]))
#define YS(i) CUT_BANK_CHECKER(((float*)&Ys[0]))
#define XS(i) CUT_BANK_CHECKER(((float*)&Xs[0]))
#else
#define AS(i, j) As[i][j]
#define BS(i) Bs[i]
#define CS(i, j) Cs[i][j]
#define DS(i) Ds[i]
#define PS(i) Ps[i]
#define RS(i) Rs[i]
#define LS(i) Ls[i]
#define YS(i) Ys[i]
#define XS(i) Xs[i]
#endif
/////////////////////////////////////////////////////////////////////////////
//! Kernel for minimization of parameters for least squares fit of a circle//
/////////////////////////////////////////////////////////////////////////////
__global__ void kernel(float* A, float* B, float* D, unsigned int n)
{
// Block index
const unsigned int bx = blockIdx.x;
const unsigned int by = blockIdx.y;
// Thread index
const unsigned int tid = threadIdx.x;
const unsigned int wid = threadIdx.y;
const unsigned int htid = (tid * blockDim.y) + wid;
// Index of the first sub-matrix of A processed by the block
int aBegin = n * blockDim.x * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + n*3;
// Step size used to iterate through the sub-matrices of A
int aStep = blockDim.x*blockDim.y;
__shared__ float As[50][3];
__shared__ float Bs[3];
__shared__ float Cs[50][3];
__shared__ float Ds[50];
__shared__ float Ps[9];
__shared__ float Rs[3];
__shared__ float Ls[9];
__shared__ float Ys[3];
__shared__ float Xs[3];
float Csub = 0.0;
float Dsub = 0.0;
float sum = 0.0;
for (int a = aBegin;a < aEnd;a += aStep)
{
//loading data onto shared memory
AS(tid, wid) = A[a+htid];
__syncthreads();
BS(0) = B[0];
BS(1) = B[1];
BS(2) = B[2];
float r=0.0;
//Populating the Jacobian
r = sqrt((AS(tid, 0)-BS(0))*(AS(tid, 0)-BS(0))+(AS(tid, 1)-BS(1))*(AS(tid, 1)-BS(1)));
CS(tid, 0) = -(AS(tid, 0)-BS(0))/r;
CS(tid, 1) = -(AS(tid, 1)-BS(1))/r;
CS(tid, 2) = -(AS(tid, 2));
DS(tid) = -(r - BS(2));
__syncthreads();
//multiplying the Jacobian with its transpose
for(int k = 0; k < blockDim.x; k++)
Csub += CS(k,wid)* CS(k,tid);
__syncthreads();
PS(htid) = Csub;
//the right-hand side vector
for(int k = 0; k < blockDim.x; ++k)
Dsub += CS(k, wid) * DS(k);
__syncthreads();
RS(wid) = Dsub;
}
//Solving the matrix using Cholesky decomposition to obtain the lower triangular matrix
for(int i=0; i<blockDim.y;++i)
{
LS(i*(blockDim.y+1)) = sqrt(PS(i*(blockDim.y+1)));
for(int j=0;j<blockDim.y;++j)
{
sum=0.0;
for(int k=0; k <= i-1; ++k)
{
sum += LS(i*blockDim.y+k)*LS(j*blockDim.y+k);
}
LS(j*blockDim.y+i) = (PS(i*blockDim.y+j)-sum)/LS((blockDim.y+1)*i);
}
}
//forward substitution
YS(0)= RS(0)/LS(0);
for (int i=1; i<blockDim.y; i++)
{
sum = 0.0;
for (int j=0; j<=i-1; j++)
{
sum += LS(i*blockDim.y+j)*YS(j);
}
YS(i) = (RS(i)-sum)/LS((blockDim.y+1)*i);
}
XS(blockDim.y-1) = YS(blockDim.y-1)/LS(blockDim.y*(blockDim.y-1)+(blockDim.y-1));
for (int i=blockDim.y-2; i>=0; i--)
{
sum = 0.0;
for (int j=i+1; j<blockDim.y; j++)
{
sum += LS(j*blockDim.y+i)*XS(j);
}
XS(i) = (YS(i)-sum)/LS((blockDim.y+1)*(i));
}
D[wid] = XS(wid);
}
#endif // #ifndef _FITCIRCLE_KERNEL_H_
```

Here n is the size which is any multiple of 50 as my block size is 50.

The results are alright but I am just not able to figure out why I am not getting the performance I need. I would greatly appreciate any suggestions about how Icould make this more parallel as I really need to get this working quickly.

Thanks in advance.

Shyam