#include #include #include #include #include #define BLOCK_SIZE 8 void openfile(void); __global__ void median_kernel ( float *indata, unsigned int width , unsigned int height, float *outdata); __global__ void median_kernel_old ( float *indata, unsigned int width , unsigned int height, float *outdata); int image_height = 512, image_width=512; float *input_image,*B1,*image1; int main (int argc, char** argv) { FILE *fs; openfile(); unsigned int hTimer; cutCreateTimer(&hTimer); cutStartTimer(hTimer); // to load the image to the device float *Ad; int size = image_height*image_width*3*sizeof(float); cudaMalloc ((void**)&Ad, size); cudaMemcpy (Ad, input_image, size, cudaMemcpyHostToDevice); // Allocate output_image on the device float *Cd; cudaMalloc ((void**)&Cd, size); // Compute the execution configuration dim3 dimBlock (BLOCK_SIZE, BLOCK_SIZE); dim3 dimGrid ( image_width/ dimBlock. x, image_height/dimBlock.y); // Launch the device computation median_kernel<<< dimGrid, dimBlock,(BLOCK_SIZE+2)*(BLOCK_SIZE+2)*3*sizeof(float) >>>(Ad, image_width, image_height, Cd ); // Read Cd from the device float *C; C=(float*)malloc(image_width*image_height*3*sizeof(float)); cudaMemcpy (C,Cd, size, cudaMemcpyDeviceToHost); cutStopTimer(hTimer); printf("Processing time: %f (ms)\n", cutGetTimerValue(hTimer)); cutDeleteTimer(hTimer); //write the image into file unsigned char *Bout; fs = fopen("1out.raw","w"); Bout = (unsigned char*)malloc(image_width*image_height*3); for(int i = 0 ;i< image_height;i++) for(int j = 0 ;j0&&y>0&&x<(width-1)&&y<(height-1)) { int i0 = ((y-1)*width+(x-1))*3; int i1 = ((y-1)*width+(x))*3; int i2 = ((y-1)*width+(x+1))*3; int i3 = ((y)*width+(x-1))*3; int i4 = ((y)*width+(x))*3; int i5 = ((y)*width+(x+1))*3; int i6 = ((y+1)*width+(x-1))*3; int i7 = ((y+1)*width+(x))*3; int i8 = ((y+1)*width+(x+1))*3; float as0 = (indata[i0+0] + indata[i1+0] + indata[i2+0] + indata[i3+0] + indata[i4+0] + indata[i5+0] + indata[i6+0] + indata[i7+0] + indata[i8+0] ); float as1 = 
(indata[i0+1] + indata[i1+1] + indata[i2+1] + indata[i3+1] + indata[i4+1] + indata[i5+1] + indata[i6+1] + indata[i7+1] + indata[i8+1] ); float as2 = (indata[i0+2] + indata[i1+2] + indata[i2+2] + indata[i3+2] + indata[i4+2] + indata[i5+2] + indata[i6+2] + indata[i7+2] + indata[i8+2] ); outdata[i]= abs(as0)/9; outdata[i+1]= abs(as1)/9; outdata[i+2]= abs(as2)/9; } }

/*
 * median_kernel
 *
 * Shared-memory tiled 3x3 filter over an image stored as three interleaved
 * float channels per pixel (pixel (x, y) starts at index (y*width + x)*3).
 *
 * NOTE: despite its name, this kernel does not compute a median. For every
 * pixel that has all eight neighbours inside the image it writes, per channel,
 * |sum of the 3x3 neighbourhood| / 9 (a box/mean filter). Pixels on the image
 * border are copied from indata to outdata unchanged.
 *
 * Parameters:
 *   indata  - input image, width*height*3 floats.
 *   width   - image width in pixels.
 *   height  - image height in pixels.
 *   outdata - output image, same layout and size as indata.
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_SIZE, BLOCK_SIZE); the shared tile layout is
 *     derived from BLOCK_SIZE, not from blockDim.
 *   - dynamic shared memory of
 *     (BLOCK_SIZE+2)*(BLOCK_SIZE+2)*3*sizeof(float) bytes (third launch arg).
 *   - threads that fall outside the image (grid rounded up) are tolerated:
 *     they help load the tile and then exit without writing.
 */
__global__ void median_kernel ( float *indata, unsigned int width , unsigned int height, float *outdata)
{
    extern __shared__ float sindata[];   // (BLOCK_SIZE+2) x (BLOCK_SIZE+2) x 3 floats

    const int tile_w     = BLOCK_SIZE + 2;      // tile width including the 1-pixel halo
    const int tile_cells = tile_w * tile_w;
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

    // Image coordinates of the tile's top-left halo cell (may be -1).
    const int tile_x0 = (int)(blockIdx.x*blockDim.x) - 1;
    const int tile_y0 = (int)(blockIdx.y*blockDim.y) - 1;

    // Cooperative tile load: every thread of the block fills cells of the
    // padded tile in a strided loop. Whether a cell is loaded depends only on
    // whether its *source* pixel lies inside the image, not on which thread
    // does the load. The previous per-thread loader skipped the halo whenever
    // the loading thread itself sat on the image border, which left halo
    // cells needed by neighbouring interior pixels uninitialised.
    const int threads_per_block = (int)(blockDim.x*blockDim.y);
    for (int cell = (int)(ty*blockDim.x) + tx; cell < tile_cells; cell += threads_per_block)
    {
        const int gx = tile_x0 + cell % tile_w;
        const int gy = tile_y0 + cell / tile_w;

        // Cells outside the image are zero-filled so the tile is fully
        // defined; they are never read by the interior branch below.
        float c0 = 0.0f, c1 = 0.0f, c2 = 0.0f;
        if (gx >= 0 && gy >= 0 && gx < (int)width && gy < (int)height)
        {
            const size_t src = ((size_t)gy*width + (size_t)gx)*3;
            c0 = indata[src+0];
            c1 = indata[src+1];
            c2 = indata[src+2];
        }
        sindata[cell*3+0] = c0;
        sindata[cell*3+1] = c1;
        sindata[cell*3+2] = c2;
    }

    // Barrier is reached by every thread of the block (no early return above).
    __syncthreads();

    // Threads beyond the image have no output pixel.
    if (x >= width || y >= height)
        return;

    const size_t i      = ((size_t)y*width + x)*3;          // output pixel offset
    const int    center = ((ty+1)*tile_w + (tx+1))*3;       // this pixel inside the tile
    const int    row    = tile_w*3;                         // one tile row, in floats

    if (x>0 && y>0 && x<(width-1) && y<(height-1))
    {
        // Same summation order as the original code (row by row, left to
        // right) so the floating-point result is unchanged.
        for (int ch = 0; ch < 3; ++ch)
        {
            const float sum = sindata[center-row-3+ch] + sindata[center-row+ch] + sindata[center-row+3+ch]
                            + sindata[center    -3+ch] + sindata[center    +ch] + sindata[center    +3+ch]
                            + sindata[center+row-3+ch] + sindata[center+row+ch] + sindata[center+row+3+ch];
            outdata[i+ch] = fabsf(sum)/9.0f;
        }
    }
    else
    {
        // Image border: pass the pixel through unfiltered.
        outdata[i+0] = sindata[center+0];
        outdata[i+1] = sindata[center+1];
        outdata[i+2] = sindata[center+2];
    }
}