I am currently writing a small CUDA program that renders an electric field potential. The profiler reports very low global load and store efficiency, and I do not know what to do about it. The NVVP profiler says that both my reads and my writes are coalesced, and I do not use local memory — only shared memory for the computation — so I do not see any reason for the low efficiency. Can someone explain what I am doing wrong?

```
/*
 * Adds up threadsPerPixel consecutive floats of B starting at B[bIdx],
 * maps the sum to scale * sum + offset (scale 1, offset 128), clamps the
 * result to [0, 255] and stores it, converted to unsigned char, in C[cIdx].
 */
__device__ void CudaSum(float *B, int threadsPerPixel, int bIdx, int cIdx,
        unsigned char * C) {
    const float gain = 1.0f;
    const float bias = 128.0f;

    // Sequential accumulation, in ascending index order.
    float total = 0.0f;
    const int end = bIdx + threadsPerPixel;
    for (int idx = bIdx; idx < end; ++idx) {
        total += B[idx];
    }

    // Clamp from above first, then from below, before narrowing to a byte.
    float shade = gain * total + bias;
    shade = min(255.0f, shade);
    shade = max(0.0f, shade);
    C[cIdx] = static_cast<unsigned char>(shade);
}
/*
 * Computes, for every pixel of a width x height image, the sum over all
 * points of  z / distance(pixel, point)  and writes the clamped result to C.
 *
 * Parameters
 *   A                 numPoints packed (x, y, z) float triples; x/y are the
 *                     point position in pixel coordinates, z is its weight.
 *   numPoints         number of triples in A.
 *   width, height     image size in pixels.
 *   C                 output bytes; pixel `elem` is written at
 *                     elem * numElemsPerPoint (numElemsPerPoint is defined
 *                     outside this kernel).
 *   numElements       unused here; kept so existing launches still compile.
 *   B                 scratch buffer of width * height * threadsPerPixel
 *                     floats holding the per-thread partial sums.
 *   pointsPerThread   number of consecutive points handled by one thread.
 *   threadsPerPixel   number of threads cooperating on one pixel.
 *
 * Launch layout
 *   x dimension: pixel column * threadsPerPixel + slice index,
 *   y dimension: pixel row.
 *   NOTE(review): the per-pixel reduction relies on __syncthreads(), so all
 *   threadsPerPixel threads of a pixel must be in the same block, i.e.
 *   blockDim.x should be a multiple of threadsPerPixel -- confirm at the
 *   launch site.
 *
 * Changes compared to the previous revision
 *   - A stray `return;` directly after the accumulation loop made the store
 *     to B, the barrier and the CudaSum call unreachable, so the kernel had
 *     no observable effect. It has been removed.
 *   - Out-of-range threads no longer return before __syncthreads(); they
 *     stay alive (doing no global memory work) so that every thread of the
 *     block reaches both barriers.
 *   - The shared-memory staging of the points is bounded by the size of the
 *     staging array and is done cooperatively by the whole block; points
 *     that do not fit are read straight from global memory.
 */
__global__ void CudaFieldCalculate(float *A, int numPoints, int width,
        int height, unsigned char *C, int numElements, float *B,
        int pointsPerThread, int threadsPerPixel) {
    const float scaleX = 0.01f;
    const float scaleY = 0.01f;

    // Capacity of the shared staging area for the points.
    const int kMaxSharedPoints = 20;
    __shared__ float3 temp[kMaxSharedPoints];

    const int pos = blockDim.x * blockIdx.x + threadIdx.x;
    const int i = blockDim.y * blockIdx.y + threadIdx.y;  // pixel row
    const int j = pos / threadsPerPixel;                  // pixel column
    const int k = pos - j * threadsPerPixel;              // slice within the pixel
    const int elem = i * width + j;
    const int cIdx = elem * numElemsPerPoint;
    const int bIdx = elem * threadsPerPixel;
    const float3 *A1 = (const float3 *) A;

    // Threads outside the image must not touch B or C, but they still have
    // to take part in the barriers below.
    const bool active = (i < height) && (j < width);

    // Stage as many points as fit into shared memory, using every thread of
    // the block so the load does not depend on blockDim.x >= numPoints.
    const int staged = min(numPoints, kMaxSharedPoints);
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
    const int blockThreads = blockDim.x * blockDim.y;
    for (int p = tid; p < staged; p += blockThreads)
        temp[p] = A1[p];
    __syncthreads();

    if (active) {
        const int first = k * pointsPerThread;
        // The last slice may hold fewer than pointsPerThread points; the
        // limit is <= 0 (no iterations) for slices past the end.
        const int limit = min(pointsPerThread, numPoints - first);
        float force = 0.0f;
        for (int m = 0; m < limit; m++) {
            const int idx = first + m;
            // Points beyond the staging capacity come from global memory.
            const float3 pt = (idx < kMaxSharedPoints) ? temp[idx] : A1[idx];
            const float diffX = (j - pt.x) * scaleX;
            const float diffY = (i - pt.y) * scaleY;
            force += pt.z / sqrtf(diffX * diffX + diffY * diffY);
        }
        // The partial sums have to go through memory visible to the other
        // threads of the pixel (translated from the original Polish comment,
        // which mentions shared memory; the buffer used here is B).
        B[bIdx + k] = force;
    }

    // Make every partial sum of this block visible before it is reduced.
    __syncthreads();

    if (active && k == 0)
        CudaSum(B, threadsPerPixel, bIdx, cIdx, C);
}
```

Only the kernel source is shown.