sliding correlation between pixel and neighborhood too many ressources requested...

Hi everyone,

I’m trying to code a program which calculates a sliding correlation between a central pixel (from a frame at time=t) and his 5x5 pixels neighborhood from the previous frame at time=t-1 by using the Bravais-Pearson formula:

My vector X is a 5x5 pixels central region
My vector Y is a 5x5 pixels region which move around the central pixel.
rp is the correlation coefficient.

The goal of this algorithm is to detect what pixel in a 5x5 neighborhood ( from frame at time = t-1) matches the best with a central pixel (from frame at time = t) so it means to look for what is the highest value of rp.

The code crashes by saying “too many ressources requested” when I use the “for loop” for scanning the neighborhood.

I think there is too much registers used but I don’t know how to decrease it…

Anyone can help me? :tearywave: Thanks :)

My code :

__shared__ float s_A[BLOCK_WIDTH+4][BLOCK_HEIGHT+4];
 __shared__ float s_B[BLOCK_WIDTH+8][BLOCK_HEIGHT+8];

const	int xIndex = threadIdx.x + (blockDim.x) * blockIdx.x;
const	int yIndex = threadIdx.y * width + width * blockIdx.y * (blockDim.x);

int x = threadIdx.x +2;
int y = threadIdx.y +2;
int x_B = threadIdx.x +4;
int y_B = threadIdx.y +4;

float corrMax=0;

//Linear Corr of Bravais-Pearson between original and previous data

  s_B[x_B][y_B]   = previousDataFormat[xIndex + yIndex];      //load texture of previous frame in shared memory

  if((x_B-4) < 4){
  s_B[x_B-4][y_B] = previousDataFormat[xIndex + yIndex];
  }

  if((x_B-4) < 4 && (y_B-4) > BLOCK_HEIGHT -8 ){
  s_B[x_B-2][y_B+2] = previousDataFormat[xIndex + yIndex];
  }

  if((x_B-4) < 4 && (y_B-4) < 4 ){
  s_B[x_B-4][y_B-4] = previousDataFormat[xIndex + yIndex];
  }

  if((x_B-4) > BLOCK_WIDTH -8){
  s_B[x_B+4][y_B] = previousDataFormat[xIndex + yIndex];
  }

  if((x_B-4) > (BLOCK_WIDTH -8) && (y_B-4) > BLOCK_HEIGHT -8 ){
  s_B[x_B+4][y_B+4] = previousDataFormat[xIndex + yIndex];
  }

  if((x_B-4) > (BLOCK_WIDTH -8) && (y_B-4) < 4 ){
  s_B[x_B+4][y_B-4] = previousDataFormat[xIndex + yIndex];
  }

  if((y_B-4) < 4){
  s_B[x_B][y_B-4] = previousDataFormat[xIndex + yIndex];
  }

  if((y_B-4) > BLOCK_HEIGHT -8){
  s_B[x_B][y_B+4] = previousDataFormat[xIndex + yIndex];
  }

 __syncthreads();

    s_A[x][y]   = originaldata[xIndex + yIndex];                    //load texture of current frame in shared memory

  if((x-2) < 2){
  s_A[x-2][y] = originaldata[xIndex + yIndex];
  }

  if((x-2) < 2 && (y-2) > BLOCK_HEIGHT -4 ){
  s_A[x-2][y+2] = originaldata[xIndex + yIndex];
  }

  if((x-2) < 2 && (y-2) < 2 ){
  s_A[x-2][y-2] = originaldata[xIndex + yIndex];
  }

  if((x-2) > BLOCK_WIDTH -4){
  s_A[x+2][y] = originaldata[xIndex + yIndex];
  }

  if((x-2) > (BLOCK_WIDTH -4) && (y-2) > BLOCK_HEIGHT -4 ){
  s_A[x+2][y+2] = originaldata[xIndex + yIndex];
  }

  if((x-2) > (BLOCK_WIDTH -4) && (y-2) < 2 ){
  s_A[x+2][y-2] = originaldata[xIndex + yIndex];
  }

  if((y-2) < 2){
  s_A[x][y-2] = originaldata[xIndex + yIndex];
  }

  if((y-2) > BLOCK_HEIGHT -4){
  s_A[x][y+2] = originaldata[xIndex + yIndex];
  }

  __syncthreads();

  int i=0, j=0;

for(int i=-2;i<=2;i++)for (int j=-2;j<=2;j++)
{
float AverValCurrDat = (s_A[y] + s_A[x-1][y-1] + s_A[y-1] + s_A[x+1][y-1] + s_A[x-1][y] + s_A[x+1][y] + s_A[x-1][y+1] + s_A[y+1] + s_A[x+1][y+1]
+ s_A[x-2][y-2] + s_A[x-2][y-1] + s_A[x-2][y] + s_A[ x-2][y+1] + s_A[x-2][y+2]
+ s_A[x-1][y-2] + s_A[x-1][y+2]
+ s_A[x+1][y-2] + s_A[x+1][y+2]
+ s_A[x+2][y-2] + s_A[x+2][y-1] + s_A[x+2][y] + s_A[ x+2][y+1] + s_A[x+2][y+2]) / 25;

 float AverValPrevDat = (s_B[x_B+i][y_B+j] + s_B[x_B+i-1][y_B+j-1]  + s_B[x_B+i][y_B+j-1] + s_B[x_B+i+1][y_B+j-1] + s_B[x_B+i-1][y_B+j] + s_B[x_B+i+1][y_B+j] + s_B[x_B+i-1][y_B+j+1] + s_B[x_B+i][y_B+j+1] + s_B[x_B+i+1][y_B+j+1]
                       + s_B[x_B+i-2][y_B+j-2] + s_B[x_B+i-2][y_B+j-1] + s_B[x_B+i-2][y_B+j] + s_B[ x_B+i-2][y_B+j+1] + s_B[x_B+i-2][y_B+j+2] 
                       + s_B[x_B+i-1][y_B+j-2] + s_B[x_B+i-1][y_B+j+2]
                       + s_B[x_B+i+1][y_B+j-2] + s_B[x_B+i+1][y_B+j+2]
                       + s_B[x_B+i+2][y_B+j-2] + s_B[x_B+i+2][y_B+j-1] + s_B[x_B+i+2][y_B+j] + s_B[x_B+i+2][y_B+j+1] + s_B[x_B+i+2][y_B+j+2]) / 25;

 float corr = ((s_A[x][y] - AverValCurrDat)*(s_B[x_B+i][y_B+j]-AverValPrevDat)) + ((s_A[x-1][y-1]- AverValCurrDat)*(s_B[x_B+i-1][y_B+j-1]-AverValPrevDat))  + ((s_A[x][y-1]- AverValCurrDat)*(s_B[x_B+i][y_B+j-1]- AverValPrevDat)) + ((s_A[x+1][y-1]- AverValCurrDat)*(s_B[x_B+i+1][y_B+j-1]- AverValPrevDat)) + ((s_A[x-1][y]- AverValCurrDat)*(s_B[x_B+i-1][y_B+j]- AverValPrevDat)) + ((s_A[x+1][y]- AverValCurrDat)*(s_B[x_B+i+1][y_B+j]- AverValPrevDat)) + ((s_A[x-1][y+1]- AverValCurrDat)*(s_B[x_B+i-1][y_B+j+1]- AverValPrevDat)) + ((s_A[x][y+1]- AverValCurrDat)*(s_B[x_B+i][y_B+j+1]- AverValPrevDat)) + ((s_A[x+1][y+1]- AverValCurrDat)*(s_B[x_B+i+1][y_B+j+1]- AverValPrevDat))
                       + ((s_A[x-2][y-2]- AverValCurrDat)*(s_B[x_B+i-2][y_B+j-2]- AverValPrevDat)) + ((s_A[x-2][y-1]- AverValCurrDat)*(s_B[x_B+i-2][y_B+j-1]- AverValPrevDat)) + ((s_A[x-2][y]- AverValCurrDat)*(s_B[x_B+i-2][y_B+j]- AverValPrevDat)) + ((s_A[x-2][y+1]- AverValCurrDat)*(s_B[x_B+i-2][y_B+j+1]- AverValPrevDat)) + ((s_A[x-2][y+2]- AverValCurrDat)*(s_B[x_B+i-2][y_B+j+2]- AverValPrevDat)) 
                       + ((s_A[x-1][y-2]- AverValCurrDat)*(s_B[x_B+i-1][y_B+j-2]- AverValPrevDat)) + ((s_A[x-1][y+2]- AverValCurrDat)*(s_B[x_B+i-1][y_B+j+2]- AverValPrevDat))
                       + ((s_A[x+1][y-2]- AverValCurrDat)*(s_B[x_B+i+1][y_B+j-2]- AverValPrevDat)) + ((s_A[x+1][y+2]- AverValCurrDat)*(s_B[x_B+i+1][y_B+j+2]- AverValPrevDat))
                       + ((s_A[x+2][y-2]- AverValCurrDat)*(s_B[x_B+i+2][y_B+j-2]- AverValPrevDat)) + ((s_A[x+2][y-1]- AverValCurrDat)*(s_B[x_B+i+2][y_B+j-1]- AverValPrevDat)) + ((s_A[x+2][y]- AverValCurrDat)*(s_B[x_B+i+2][y_B+j]- AverValPrevDat)) + ((s_A[ x+2][y+1]- AverValCurrDat)*(s_B[x_B+i+2][y_B+j+1]- AverValPrevDat)) + ((s_A[x+2][y+2]- AverValCurrDat)*(s_B[x_B+i+2][y_B+j+2]- AverValPrevDat));

 corr = corr / (sqrtf(((s_A[x][y] - AverValCurrDat)*(s_A[x][y] - AverValCurrDat)) + ((s_A[x-1][y-1]- AverValCurrDat)*(s_A[x-1][y-1]- AverValCurrDat))  + ((s_A[x][y-1]- AverValCurrDat)*(s_A[x][y-1]- AverValCurrDat)) + ((s_A[x+1][y-1]- AverValCurrDat)*(s_A[x+1][y-1]- AverValCurrDat)) + ((s_A[x-1][y]- AverValCurrDat)*(s_A[x-1][y]- AverValCurrDat)) + ((s_A[x+1][y]- AverValCurrDat)*(s_A[x+1][y]- AverValCurrDat)) + ((s_A[x-1][y+1]- AverValCurrDat)*(s_A[x-1][y+1]- AverValCurrDat)) + ((s_A[x][y+1]- AverValCurrDat)*(s_A[x][y+1]- AverValCurrDat)) + ((s_A[x+1][y+1]- AverValCurrDat)*(s_A[x+1][y+1]- AverValCurrDat))
                       + ((s_A[x-2][y-2]- AverValCurrDat)*(s_A[x-2][y-2]- AverValCurrDat)) + ((s_A[x-2][y-1]- AverValCurrDat)*(s_A[x-2][y-1]- AverValCurrDat)) + ((s_A[x-2][y]- AverValCurrDat)*(s_A[x-2][y]- AverValCurrDat)) + ((s_A[x-2][y+1]- AverValCurrDat)*(s_A[x-2][y+1]- AverValCurrDat)) + ((s_A[x-2][y+2]- AverValCurrDat)*(s_A[x-2][y+2]- AverValCurrDat)) 
                       + ((s_A[x-1][y-2]- AverValCurrDat)*(s_A[x-1][y-2]- AverValCurrDat)) + ((s_A[x-1][y+2]- AverValCurrDat)*(s_A[x-1][y+2]- AverValCurrDat))
                       + ((s_A[x+1][y-2]- AverValCurrDat)*(s_A[x+1][y-2]- AverValCurrDat)) + ((s_A[x+1][y+2]- AverValCurrDat)*(s_A[x+1][y+2]- AverValCurrDat))
                       + ((s_A[x+2][y-2]- AverValCurrDat)*(s_A[x+2][y-2]- AverValCurrDat)) + ((s_A[x+2][y-1]- AverValCurrDat)*(s_A[x+2][y-1]- AverValCurrDat)) + ((s_A[x+2][y]- AverValCurrDat)*(s_A[x+2][y]- AverValCurrDat)) + ((s_A[ x+2][y+1]- AverValCurrDat)*(s_A[ x+2][y+1]- AverValCurrDat)) + ((s_A[x+2][y+2]- AverValCurrDat)*(s_A[x+2][y+2]- AverValCurrDat))) 
                       
             * sqrtf( ((s_B[x_B][y_B+j] - AverValPrevDat)*(s_B[x_B+i][y_B+j] - AverValPrevDat)) + ((s_B[x_B+i-1][y_B+j-1]- AverValPrevDat)*(s_B[x_B+i-1][y_B+j-1]- AverValPrevDat))  + ((s_B[x_B+i][y_B+j-1]- AverValPrevDat)*(s_B[x_B+i][y_B+j-1]- AverValPrevDat)) + ((s_B[x_B+i+1][y_B+j-1]- AverValPrevDat)*(s_B[x_B+i+1][y_B+j-1]- AverValPrevDat)) + ((s_B[x_B+i-1][y_B+j]- AverValPrevDat)*(s_B[x_B+i-1][y_B+j]- AverValPrevDat)) + ((s_B[x_B+i+1][y_B+j]- AverValPrevDat)*(s_B[x_B+i+1][y_B+j]- AverValPrevDat)) + ((s_B[x_B+i-1][y_B+j+1]- AverValPrevDat)*(s_B[x_B+i-1][y_B+j+1]- AverValPrevDat)) + ((s_B[x_B+i][y_B+j+1]- AverValPrevDat)*(s_B[x_B+i][y_B+j+1]- AverValPrevDat)) + ((s_B[x_B+i+1][y_B+j+1]- AverValPrevDat)*(s_B[x_B+i+1][y_B+j+1]- AverValPrevDat))
                       + ((s_B[x_B+i-2][y_B+j-2]- AverValPrevDat)*(s_B[x_B+i-2][y_B+j-2]- AverValPrevDat)) + ((s_B[x_B+i-2][y_B+j-1]- AverValPrevDat)*(s_B[x_B+i-2][y_B+j-1]- AverValPrevDat)) + ((s_B[x_B+i-2][y_B+j]- AverValPrevDat)*(s_B[x_B+i-2][y_B+j]- AverValPrevDat)) + ((s_B[x_B+i-2][y_B+j+1]- AverValPrevDat)*(s_B[x_B+i-2][y_B+j+1]- AverValPrevDat)) + ((s_B[x_B+i-2][y_B+j+2]- AverValPrevDat)*(s_B[x_B+i-2][y_B+j+2]- AverValPrevDat)) 
                       + ((s_B[x_B+i-1][y_B+j-2]- AverValPrevDat)*(s_B[x_B+i-1][y_B+j-2]- AverValPrevDat)) + ((s_B[x_B+i-1][y_B+j+2]- AverValPrevDat)*(s_B[x_B+i-1][y_B+j+2]- AverValPrevDat))
                       + ((s_B[x_B+i+1][y_B+j-2]- AverValPrevDat)*(s_B[x_B+i+1][y_B+j-2]- AverValPrevDat)) + ((s_B[x_B+i+1][y_B+j+2]- AverValPrevDat)*(s_B[x_B+i+1][y_B+j+2]- AverValPrevDat))
                       + ((s_B[x_B+i+2][y_B+j-2]- AverValPrevDat)*(s_B[x_B+i+2][y_B+j-2]- AverValPrevDat)) + ((s_B[x_B+i+2][y_B+j-1]- AverValPrevDat)*(s_B[x_B+i+2][y_B+j-1]- AverValPrevDat)) + ((s_B[x_B+i+2][y_B+j]- AverValPrevDat)*(s_B[x_B+i+2][y_B+j]- AverValPrevDat)) + ((s_B[x_B+i+2][y_B+j+1]- AverValPrevDat)*(s_B[x_B+i+2][y_B+j+1]- AverValPrevDat)) + ((s_B[x_B+i+2][y_B+j+2]- AverValPrevDat)*(s_B[x_B+i+2][y_B+j+2]- AverValPrevDat)) ));


 corrMax=fmaxf(corrMax,corr); // keep the highest correlation 

}

problem solved : synchronized threads after calculating float AverValPrevDat