Dear all:
I use 9800GTX to run the code, but the result is not correct.
but after I modify the BLOCK_SIZEX and BLOCK_SIZEY to be 20, it will be ok. I’ve checked the SMem in cubin file, it is
lmem = 0
smem = 5216
reg = 7
bar = 1
I don’t know what happened.
by the way, if the block size is 20, but if the image size is larger than 5k*5K, the result will be not right.
ps. the Oridata is put by me through create a data array like {1.0f,2.0f,3.0f,4.0f,5.0f,1.0f,2.0f,3.0f,4.0f,5.0f,1.0f,2.0f
,3.0f,4.0f,5.0f,1.0f,2.0f,3.0f,4.0f,5.0f…}
#define SizeX 5120
#define SizeY 3584
#define BLOCK_SIZEX 36
#define BLOCK_SIZEY 36
#define OVER_LAP 4
float Oridata;
float *gpudata, *result;
const int DATA_SIZE=SizeX*SizeY;
global void Process(float* S,float* D,int width,int height){
shared float share[BLOCK_SIZEX][BLOCK_SIZEY];
unsigned int xIndex;
unsigned int yIndex;
unsigned int index_in;
xIndex = blockIdx.x * BLOCK_SIZEX + threadIdx.x-blockIdx.xOVER_LAP;//+i(width/(BLOCK_SIZEX-OVER_LAP)/DownRatio)*(BLOCK_SIZEX-OVER_LAP);
yIndex = blockIdx.y * BLOCK_SIZEY + threadIdx.y-blockIdx.yOVER_LAP;//+j(height/(BLOCK_SIZEY-OVER_LAP)/DownRatio)*(BLOCK_SIZEY-OVER_LAP);
index_in = yIndex * width + xIndex;
share[threadIdx.x][threadIdx.y] = *(S+index_in);
__syncthreads();
if (xIndex>(OVER_LAP/2-1) && xIndex<(width-OVER_LAP/2) && yIndex>(OVER_LAP/2-1) && yIndex<(height-OVER_LAP/2)){
if( threadIdx.x>(OVER_LAP/2-1) && threadIdx.x<(BLOCK_SIZEX-OVER_LAP) && threadIdx.y>(OVER_LAP/2-1) && threadIdx.y<(BLOCK_SIZEY-OVER_LAP/2)){
*(D+index_in)=(
share[threadIdx.x-1][threadIdx.y]+share[threadIdx.x-2][threadIdx.y]+share[threadIdx.x+1][threadIdx.y]+share[thre
adIdx.x+2][threadIdx.y]+share[threadIdx.x][threadIdx.y]+
share[threadIdx.x-1][threadIdx.y-1]+share[threadIdx.x-2][threadIdx.y-1]+share[threadIdx.x+1][threadIdx.y-1]+share[threadIdx.x+2][threadIdx.y-1]+share[threadIdx.x][threadIdx.y-1]+
share[threadIdx.x-1][threadIdx.y-2]+share[threadIdx.x-2][threadIdx.y-2]+share[threadIdx.x+1][threadIdx.y-2]+share[threadIdx.x+2][threadIdx.y-2]+share[threadIdx.x][threadIdx.y-2]+
share[threadIdx.x-1][threadIdx.y+1]+share[threadIdx.x-2][threadIdx.y+1]+share[threadIdx.x+1][threadIdx.y+1]+share[
threadIdx.x+2][threadIdx.y+1]+share[threadIdx.x][threadIdx.y
1]+
share[threadIdx.x-1][threadIdx.y+2]+share[threadIdx.x-2][threadIdx.y+2]+share[threadIdx.x+1][threadIdx.y+2]+share[
threadIdx.x+2][threadIdx.y+2]+share[threadIdx.x][threadIdx.y
2]
)/25;
}
}
__syncthreads();
}
int main(){
cudaGetDeviceProperties( &devInfo, 0 );
cudaMalloc((void**) &gpudata, sizeof(float) * DATA_SIZE);
cudaMalloc((void**) &result, sizeof(float) * DATA_SIZE);
cudaMemcpy(gpudata, Oridata, sizeof(float) * DATA_SIZE,cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZEX, BLOCK_SIZEY);
dim3 dimGrid((SizeX / (dimBlock.x-OVER_LAP)), (SizeY / (dimBlock.y-OVER_LAP)));
Process<<<dimGrid,dimBlock>>>(gpudata,result,SizeX,SizeY);
cudaMemcpy(Oridata, result, sizeof(float) * DATA_SIZE, cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
return 0;
}