Why Can't it run?

Dear all:

I use 9800GTX to run the code, but the result is not correct.

but after I modify the BLOCK_SIZEX and BLOCK_SIZEY to be 20, it will be ok. I’ve checked the SMem in cubin file, it is

lmem = 0

smem = 5216

reg = 7

bar = 1

I don’t know what happened.

by the way, if the block size is 20, but if the image size is larger than 5k*5K, the result will be not right.

ps. the Oridata is put by me through create a data array like {1.0f,2.0f,3.0f,4.0f,5.0f,1.0f,2.0f,3.0f,4.0f,5.0f,1.0f,2.0f

,3.0f,4.0f,5.0f,1.0f,2.0f,3.0f,4.0f,5.0f…}

#define SizeX 5120

#define SizeY 3584

#define BLOCK_SIZEX 36

#define BLOCK_SIZEY 36

#define OVER_LAP 4

float Oridata;

float *gpudata, *result;

const int DATA_SIZE=SizeX*SizeY;

global void Process(float* S,float* D,int width,int height){

shared float share[BLOCK_SIZEX][BLOCK_SIZEY];

unsigned int xIndex;

unsigned int yIndex;

unsigned int index_in;

xIndex = blockIdx.x * BLOCK_SIZEX + threadIdx.x-blockIdx.xOVER_LAP;//+i(width/(BLOCK_SIZEX-OVER_LAP)/DownRatio)*(BLOCK_SIZEX-OVER_LAP);

yIndex = blockIdx.y * BLOCK_SIZEY + threadIdx.y-blockIdx.yOVER_LAP;//+j(height/(BLOCK_SIZEY-OVER_LAP)/DownRatio)*(BLOCK_SIZEY-OVER_LAP);

index_in = yIndex * width + xIndex;

share[threadIdx.x][threadIdx.y] = *(S+index_in);

__syncthreads();

if (xIndex>(OVER_LAP/2-1) && xIndex<(width-OVER_LAP/2) && yIndex>(OVER_LAP/2-1) && yIndex<(height-OVER_LAP/2)){

if( threadIdx.x>(OVER_LAP/2-1) && threadIdx.x<(BLOCK_SIZEX-OVER_LAP) && threadIdx.y>(OVER_LAP/2-1) && threadIdx.y<(BLOCK_SIZEY-OVER_LAP/2)){

*(D+index_in)=(

share[threadIdx.x-1][threadIdx.y]+share[threadIdx.x-2][threadIdx.y]+share[threadIdx.x+1][threadIdx.y]+share[thre

adIdx.x+2][threadIdx.y]+share[threadIdx.x][threadIdx.y]+

share[threadIdx.x-1][threadIdx.y-1]+share[threadIdx.x-2][threadIdx.y-1]+share[threadIdx.x+1][threadIdx.y-1]+share[threadIdx.x+2][threadIdx.y-1]+share[threadIdx.x][threadIdx.y-1]+

share[threadIdx.x-1][threadIdx.y-2]+share[threadIdx.x-2][threadIdx.y-2]+share[threadIdx.x+1][threadIdx.y-2]+share[threadIdx.x+2][threadIdx.y-2]+share[threadIdx.x][threadIdx.y-2]+

share[threadIdx.x-1][threadIdx.y+1]+share[threadIdx.x-2][threadIdx.y+1]+share[threadIdx.x+1][threadIdx.y+1]+share[

threadIdx.x+2][threadIdx.y+1]+share[threadIdx.x][threadIdx.y

1]+

share[threadIdx.x-1][threadIdx.y+2]+share[threadIdx.x-2][threadIdx.y+2]+share[threadIdx.x+1][threadIdx.y+2]+share[

threadIdx.x+2][threadIdx.y+2]+share[threadIdx.x][threadIdx.y

2]

)/25;

}

}

__syncthreads();

}

int main(){

cudaGetDeviceProperties( &devInfo, 0 );

cudaMalloc((void**) &gpudata, sizeof(float) * DATA_SIZE);

cudaMalloc((void**) &result, sizeof(float) * DATA_SIZE);

cudaMemcpy(gpudata, Oridata, sizeof(float) * DATA_SIZE,cudaMemcpyHostToDevice);

dim3 dimBlock(BLOCK_SIZEX, BLOCK_SIZEY);

dim3 dimGrid((SizeX / (dimBlock.x-OVER_LAP)), (SizeY / (dimBlock.y-OVER_LAP)));

Process<<<dimGrid,dimBlock>>>(gpudata,result,SizeX,SizeY);

cudaMemcpy(Oridata, result, sizeof(float) * DATA_SIZE, cudaMemcpyDeviceToHost);

cudaFree(gpudata);

cudaFree(result);

return 0;

}

20*20=400, that is smaller than 512, so it works.

36*36=1296, much bigger than 512, so it doesn’t work.

ps: please post source code surrounded by code-tags, it’s much better readable that way.