My ConvolutionColumnKernel looks like:
__global__ void ConvolutionColumnKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
__shared__ uchar4 s_data_Input[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
//Read into shared memory
if((x < width) && (y < height))
{
unsigned char* pixelInput = (surfaceInput + y*pitchInput);
s_data_Input[threadIdx.x].x = pixelInput[smemPos + IMUL(x, COLUMN_TILE_W)]; //R
s_data_Input[threadIdx.x].y = pixelInput[smemPos + IMUL(x+1, COLUMN_TILE_W)]; //G
s_data_Input[threadIdx.x].z = pixelInput[smemPos + IMUL(x+2, COLUMN_TILE_W)]; //B
__syncthreads();
uchar3 convolutionResult;
for(int i=0; i<=KERNEL_W; i++)
{
convolutionResult.x += s_data_Input[i].x;
convolutionResult.y += s_data_Input[i].y;
convolutionResult.z += s_data_Input[i].z;
}
convolutionResult.x /= KERNEL_W;
convolutionResult.y /= KERNEL_W;
convolutionResult.z /= KERNEL_W;
uchar4* pixelOutput;
pixelOutput[threadIdx.x].x = convolutionResult.x;
pixelOutput[threadIdx.x].y = convolutionResult.y;
pixelOutput[threadIdx.x].z = convolutionResult.z;
pixelOutput[threadIdx.x].w = 1.0;
}
}
Originally my code looked like this, and it worked fine:
unsigned char* pixelOutput = (unsigned char*) (surfaceOutput + y*pitchOutput) + 4*x;
pixelOutput[0] = convolutionResult.x;
pixelOutput[1] = convolutionResult.y;
pixelOutput[2] = convolutionResult.z;
pixelOutput[3] = 1.0;
Now I tried to coalesce my code like this:
uchar4* pixelOutput;
pixelOutput[threadIdx.x].x = convolutionResult.x;
pixelOutput[threadIdx.x].y = convolutionResult.y;
pixelOutput[threadIdx.x].z = convolutionResult.z;
pixelOutput[threadIdx.x].w = 1.0;
Unfortunately it won’t work… Anyone knows what I’m doing wrong? I’m really stuck here… Every time I execute, my PC stalls External Image
Thanks in advance!