It is a kernel for 2D convolution, that is used as a function for non-separable 4D convolution.
__global__ void Convolution_2D_Enhancement_Valid_Z(float* __restrict__ Filter_Responses_1, float* __restrict__ Filter_Responses_2, float* __restrict__ Filter_Responses_3, float* __restrict__ Filter_Responses_4, float* __restrict__ Filter_Responses_5, float* __restrict__ Filter_Responses_6, float* __restrict__ Filter_Responses_7, float* __restrict__ Filter_Responses_8, float* __restrict__ Filter_Responses_9, float* __restrict__ Filter_Responses_10, float* __restrict__ Filter_Responses_11, float* __restrict__ Volumes, int z_offset, int t, int DATA_W, int DATA_H, int DATA_D, int DATA_T, int VALID_DATA_W, int VALID_DATA_H, int VALID_DATA_D, int ENHANCEMENT_FILTER_W, int ENHANCEMENT_FILTER_H, int ENHANCEMENT_FILTER_D, int xBlockDifference, int yBlockDifference, int blocksInY, float invBlocksInY)
{
unsigned int blockIdxz = __float2uint_rd(blockIdx.y * invBlocksInY);
unsigned int blockIdxy = blockIdx.y - __umul24(blockIdxz,blocksInY);
volatile int x = __umul24(blockIdx.x,blockDim.x / 2) * 3 + threadIdx.x;
volatile int y = __umul24(blockIdxy ,blockDim.y) * 3 + threadIdx.y;
volatile int z = __umul24(blockIdxz ,blockDim.z) + threadIdx.z;
if ( (x >= (DATA_W + xBlockDifference)) || (y >= (DATA_H + yBlockDifference)) || (z < (ENHANCEMENT_FILTER_D - 1)/2) || (z >= (DATA_D - (ENHANCEMENT_FILTER_D - 1)/2)) || (((z + z_offset) < 0)) || (((z + z_offset) >= DATA_D)) )
return;
// Circular convolution in time
if ((t < 0))
{
t = DATA_T + t;
}
else if (t >= DATA_T)
{
t = t - DATA_T;
}
__shared__ float s_Image[64][64];
// Blocks
// 1 2 3 4
// 5 6 7 8
// 9 10 11 12
// 13 14 15 16
s_Image[threadIdx.y][threadIdx.x] = 0.0f;
s_Image[threadIdx.y + 16][threadIdx.x] = 0.0f;
s_Image[threadIdx.y + 32][threadIdx.x] = 0.0f;
s_Image[threadIdx.y + 48][threadIdx.x] = 0.0f;
s_Image[threadIdx.y][threadIdx.x + 32] = 0.0f;
s_Image[threadIdx.y + 16][threadIdx.x + 32] = 0.0f;
s_Image[threadIdx.y + 32][threadIdx.x + 32] = 0.0f;
s_Image[threadIdx.y + 48][threadIdx.x + 32] = 0.0f;
// Read data into shared memory
// First row, blocks 1 + 2
if ( ((x - 8) >= 0) && ((x - 8) < DATA_W) && ((y - 8) >= 0) && ((y - 8) < DATA_H) )
{
s_Image[threadIdx.y][threadIdx.x] = Volumes[Calculate_4D_Index(x - 8,y - 8,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// First row, blocks 3 + 4
if ( ((x + 24) < DATA_W) && ((y - 8) >= 0) && ((y - 8) < DATA_H) )
{
s_Image[threadIdx.y][threadIdx.x + 32] = Volumes[Calculate_4D_Index(x + 24,y - 8,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// Second row, blocks 5 + 6
if ( ((x - 8) >= 0) && ((x - 8) < DATA_W) && ((y + 8) < DATA_H) )
{
s_Image[threadIdx.y + 16][threadIdx.x] = Volumes[Calculate_4D_Index(x - 8,y + 8,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// Second row, blocks 7 + 8
if ( ((x + 24) < DATA_W) && ((y + 8) < DATA_H) )
{
s_Image[threadIdx.y + 16][threadIdx.x + 32] = Volumes[Calculate_4D_Index(x + 24,y + 8,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// Third row, blocks 9 + 10
if ( ((x - 8) >= 0) && ((x - 8) < DATA_W) && ((y + 24) < DATA_H) )
{
s_Image[threadIdx.y + 32][threadIdx.x] = Volumes[Calculate_4D_Index(x - 8,y + 24,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// Third row, blocks 11 + 12
if ( ((x + 24) < DATA_W) && ((y + 24) < DATA_H) )
{
s_Image[threadIdx.y + 32][threadIdx.x + 32] = Volumes[Calculate_4D_Index(x + 24,y + 24,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// Fourth row, blocks 13 + 14
if ( ((x - 8) >= 0) && ((x - 8) < DATA_W) && ((y + 40) < DATA_H) )
{
s_Image[threadIdx.y + 48][threadIdx.x] = Volumes[Calculate_4D_Index(x - 8,y + 40,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
// Fourth row, blocks 15 + 16
if ( ((x + 24) < DATA_W) && ((y + 40) < DATA_H) )
{
s_Image[threadIdx.y + 48][threadIdx.x + 32] = Volumes[Calculate_4D_Index(x + 24,y + 40,z + z_offset,t,DATA_W, DATA_H, DATA_D)];
}
__syncthreads();
// Only threads inside the image do the convolution, calculate filter responses for 48 x 48 pixels
if ( (x < VALID_DATA_W) && (y < VALID_DATA_H) )
{
int idx = Calculate_3D_Index(x,y,z - (ENHANCEMENT_FILTER_D - 1)/2,VALID_DATA_W,VALID_DATA_H);
float4 filter_responses = Convolve_11x11_Enhancement_4firstfilters(s_Image, threadIdx.y + 8, threadIdx.x + 8);
Filter_Responses_1[idx] += filter_responses.x;
Filter_Responses_2[idx] += filter_responses.y;
Filter_Responses_3[idx] += filter_responses.z;
Filter_Responses_4[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4middlefilters(s_Image, threadIdx.y + 8, threadIdx.x + 8);
Filter_Responses_5[idx] += filter_responses.x;
Filter_Responses_6[idx] += filter_responses.y;
Filter_Responses_7[idx] += filter_responses.z;
Filter_Responses_8[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4lastfilters(s_Image, threadIdx.y + 8, threadIdx.x + 8);
Filter_Responses_9[idx] += filter_responses.x;
Filter_Responses_10[idx] += filter_responses.y;
Filter_Responses_11[idx] += filter_responses.z;
}
if ( (x < VALID_DATA_W) && ((y + 16) < VALID_DATA_H) )
{
int idx = Calculate_3D_Index(x,y + 16,z - (ENHANCEMENT_FILTER_D - 1)/2,VALID_DATA_W,VALID_DATA_H);
float4 filter_responses = Convolve_11x11_Enhancement_4firstfilters(s_Image, threadIdx.y + 24, threadIdx.x + 8);
Filter_Responses_1[idx] += filter_responses.x;
Filter_Responses_2[idx] += filter_responses.y;
Filter_Responses_3[idx] += filter_responses.z;
Filter_Responses_4[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4middlefilters(s_Image, threadIdx.y + 24, threadIdx.x + 8);
Filter_Responses_5[idx] += filter_responses.x;
Filter_Responses_6[idx] += filter_responses.y;
Filter_Responses_7[idx] += filter_responses.z;
Filter_Responses_8[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4lastfilters(s_Image, threadIdx.y + 24, threadIdx.x + 8);
Filter_Responses_9[idx] += filter_responses.x;
Filter_Responses_10[idx] += filter_responses.y;
Filter_Responses_11[idx] += filter_responses.z;
}
if ( (x < VALID_DATA_W) && ((y + 32) < VALID_DATA_H) )
{
int idx = Calculate_3D_Index(x,y + 32,z - (ENHANCEMENT_FILTER_D - 1)/2,VALID_DATA_W,VALID_DATA_H);
float4 filter_responses = Convolve_11x11_Enhancement_4firstfilters(s_Image, threadIdx.y + 40, threadIdx.x + 8);
Filter_Responses_1[idx] += filter_responses.x;
Filter_Responses_2[idx] += filter_responses.y;
Filter_Responses_3[idx] += filter_responses.z;
Filter_Responses_4[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4middlefilters(s_Image, threadIdx.y + 40, threadIdx.x + 8);
Filter_Responses_5[idx] += filter_responses.x;
Filter_Responses_6[idx] += filter_responses.y;
Filter_Responses_7[idx] += filter_responses.z;
Filter_Responses_8[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4lastfilters(s_Image, threadIdx.y + 40, threadIdx.x + 8);
Filter_Responses_9[idx] += filter_responses.x;
Filter_Responses_10[idx] += filter_responses.y;
Filter_Responses_11[idx] += filter_responses.z;
}
if (threadIdx.x < 16)
{
if ( ((x + 32) < VALID_DATA_W) && (y < VALID_DATA_H) )
{
int idx = Calculate_3D_Index(x + 32,y,z - (ENHANCEMENT_FILTER_D - 1)/2,VALID_DATA_W,VALID_DATA_H);
float4 filter_responses = Convolve_11x11_Enhancement_4firstfilters(s_Image, threadIdx.y + 8, threadIdx.x + 40);
Filter_Responses_1[idx] += filter_responses.x;
Filter_Responses_2[idx] += filter_responses.y;
Filter_Responses_3[idx] += filter_responses.z;
Filter_Responses_4[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4middlefilters(s_Image, threadIdx.y + 8, threadIdx.x + 40);
Filter_Responses_5[idx] += filter_responses.x;
Filter_Responses_6[idx] += filter_responses.y;
Filter_Responses_7[idx] += filter_responses.z;
Filter_Responses_8[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4lastfilters(s_Image, threadIdx.y + 8, threadIdx.x + 40);
Filter_Responses_9[idx] += filter_responses.x;
Filter_Responses_10[idx] += filter_responses.y;
Filter_Responses_11[idx] += filter_responses.z;
}
if ( ((x + 32) < VALID_DATA_W) && ((y + 16) < VALID_DATA_H) )
{
int idx = Calculate_3D_Index(x + 32,y + 16,z - (ENHANCEMENT_FILTER_D - 1)/2,VALID_DATA_W,VALID_DATA_H);
float4 filter_responses = Convolve_11x11_Enhancement_4firstfilters(s_Image, threadIdx.y + 24, threadIdx.x + 40);
Filter_Responses_1[idx] += filter_responses.x;
Filter_Responses_2[idx] += filter_responses.y;
Filter_Responses_3[idx] += filter_responses.z;
Filter_Responses_4[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4middlefilters(s_Image, threadIdx.y + 24, threadIdx.x + 40);
Filter_Responses_5[idx] += filter_responses.x;
Filter_Responses_6[idx] += filter_responses.y;
Filter_Responses_7[idx] += filter_responses.z;
Filter_Responses_8[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4lastfilters(s_Image, threadIdx.y + 24, threadIdx.x + 40);
Filter_Responses_9[idx] += filter_responses.x;
Filter_Responses_10[idx] += filter_responses.y;
Filter_Responses_11[idx] += filter_responses.z;
}
if ( ((x + 32) < VALID_DATA_W) && ((y + 32) < VALID_DATA_H) )
{
int idx = Calculate_3D_Index(x + 32,y + 32,z - (ENHANCEMENT_FILTER_D - 1)/2,VALID_DATA_W,VALID_DATA_H);
float4 filter_responses = Convolve_11x11_Enhancement_4firstfilters(s_Image, threadIdx.y + 40, threadIdx.x + 40);
Filter_Responses_1[idx] += filter_responses.x;
Filter_Responses_2[idx] += filter_responses.y;
Filter_Responses_3[idx] += filter_responses.z;
Filter_Responses_4[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4middlefilters(s_Image, threadIdx.y + 40, threadIdx.x + 40);
Filter_Responses_5[idx] += filter_responses.x;
Filter_Responses_6[idx] += filter_responses.y;
Filter_Responses_7[idx] += filter_responses.z;
Filter_Responses_8[idx] += filter_responses.w;
filter_responses = Convolve_11x11_Enhancement_4lastfilters(s_Image, threadIdx.y + 40, threadIdx.x + 40);
Filter_Responses_9[idx] += filter_responses.x;
Filter_Responses_10[idx] += filter_responses.y;
Filter_Responses_11[idx] += filter_responses.z;
}
}
}