this function is called iwth threadIdx.x == 0
it fills 3 rows into shared memory
[codebox]device_ void fillMaskData(unsigned char * origData, unsigned char sDataMask , int type,int Width){
int start = blockIdx.x * Width;
int end = start +3*Width;
for(int i = start, j = 0; i < end; i++,j++){
sDataMask[j] = origData[i];
__syncthreads();
}
}[/codebox]
this unction writes from shared memory to global in the end of th last thread in each block
[codebox]device void fillwithRowData(unsigned char *datatowrite, unsigned char sDataMask ,int offsetrows,int Width){
int start = (blockIdx.x+offsetrows)*Width;
for(int i = 1; i < Width-1; i++){
datatowrite[start+i] = sDataMask[i];
__syncthreads();
}
}[/codebox]
this is my kernel, launched in 254x254 → margin of 1x1 pixels
[codebox]global void doMedianFilter3x3(unsigned char *data_d_orig ,unsigned char *data_d, int Height, int Width){
extern __shared__ unsigned char sData[]; // Placeholder for 4 rows -> 3 mask, 1 row calculated
unsigned char * sDataMask = (unsigned char *)sData;
unsigned char * RowData =(unsigned char *)&sDataMask[3*Width];
unsigned char medianValues[9];
// Fill shared Data shared Mask at beginning of each block
if(threadIdx.x == 0)
fillMaskData(data_d_orig,sDataMask,3,Width);
//Get the Neighborhood
int idx = (blockIdx.x + 1) * Width + 1 + threadIdx.x;
int sidx = Width + 1 + threadIdx.x; // current pixel -> +1 because of a margin
int pix00 = threadIdx.x; // Upper left
int pix10 = 1 + threadIdx.x; // Upper middle
int pix20 = 2 + threadIdx.x; // Upper right
int pix01 = Width + threadIdx.x; // left
int pix21 = Width + 2 + threadIdx.x; // right
int pix02 = 2 * Width + threadIdx.x; // lower left
int pix12 = 2 * Width + 1 + threadIdx.x; // lower middle
int pix22 = 2 * Width + 2 + threadIdx.x;// lower Right
medianValues[0] = sDataMask[pix00];
medianValues[1] = sDataMask[pix10];
medianValues[2] = sDataMask[pix20];
medianValues[3] = sDataMask[pix01];
medianValues[4] = sDataMask[sidx];
medianValues[5] = sDataMask[pix21];
medianValues[6] = sDataMask[pix02];
medianValues[7] = sDataMask[pix12];
medianValues[8] = sDataMask[pix22];
unsigned char MedianValue = getMedianValue(medianValues,9);
RowData[threadIdx.x+1] = MedianValue;
//__syncthreads();
if(threadIdx.x == Width-2-1)
fillwithRowData(data_d,RowData,1,Width);
}[/codebox]
do you thgink there is something wrong with my code?
Regards
maz