I coalesced my code, but for some reason it won’t output the correct image anymore… Can anyone help me out?
Output Image:
My code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define KERNEL_RADIUS 1
#define BLOCK_DIM 1
#define KERNEL_W (2 * KERNEL_RADIUS + 1)
#define ROW_TILE_W 128
#define ROW_TILE_H 128
#define COLUMN_TILE_W 16
#define COLUMN_TILE_H 48
#define IMUL(a,b) __mul24(a,b)
__global__ void ConvolutionRowKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
__shared__ uchar4 s_data_Input[KERNEL_RADIUS + BLOCK_DIM + KERNEL_RADIUS];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if((x < width) && (y < height))
{
unsigned char *pixelInput = (surfaceInput+y*pitchInput);
s_data_Input[threadIdx.x].x = pixelInput[4*x];
s_data_Input[threadIdx.x].y = pixelInput[4*x+1];
s_data_Input[threadIdx.x].z = pixelInput[4*x+2];
__syncthreads();
uchar3 convolutionResult;
for(int i=0; i <= KERNEL_W; i++)
{
convolutionResult.x += s_data_Input[i].x;
convolutionResult.y += s_data_Input[i].y;
convolutionResult.z += s_data_Input[i].z;
}
convolutionResult.x /= KERNEL_W;
convolutionResult.y /= KERNEL_W;
convolutionResult.z /= KERNEL_W;
uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
pixelOutput[threadIdx.x].x = convolutionResult.x;
pixelOutput[threadIdx.x].y = convolutionResult.y;
pixelOutput[threadIdx.x].z = convolutionResult.z;
pixelOutput[threadIdx.x].w = 1;
__syncthreads();
}
}
__global__ void ConvolutionColumnKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
__shared__ uchar4 s_data_Input[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
if((x < width) && (y < height))
{
unsigned char* pixelInput = (surfaceInput + y*pitchInput);
s_data_Input[threadIdx.x].x = pixelInput[smemPos + IMUL(x, COLUMN_TILE_W)]; //R
s_data_Input[threadIdx.x].y = pixelInput[smemPos + IMUL(x+1, COLUMN_TILE_W)]; //G
s_data_Input[threadIdx.x].z = pixelInput[smemPos + IMUL(x+2, COLUMN_TILE_W)]; //B
__syncthreads();
uchar3 convolutionResult;
for(int i=0; i<=KERNEL_W; i++)
{
convolutionResult.x += s_data_Input[i].x;
convolutionResult.y += s_data_Input[i].y;
convolutionResult.z += s_data_Input[i].z;
}
convolutionResult.x /= KERNEL_W;
convolutionResult.y /= KERNEL_W;
convolutionResult.z /= KERNEL_W;
uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
pixelOutput[threadIdx.x].x = convolutionResult.x;
pixelOutput[threadIdx.x].y = convolutionResult.y;
pixelOutput[threadIdx.x].z = convolutionResult.z;
pixelOutput[threadIdx.x].w = 1;
__syncthreads();
}
}
void cuda_kernel(void* pDataOutput, size_t pitchOutput, void* pDatainput, size_t pitchInput, int width, int height)
{
dim3 Db = dim3(BLOCK_DIM, BLOCK_DIM);
dim3 Dg = dim3((width+Db.x-1)/Db.x, (height+Db.y-1)/Db.y);
ConvolutionColumnKernel<<<Dg,Db>>>((unsigned char*)pDataOutput, pitchOutput, (unsigned char*)pDatainput, pitchInput, width, height);
ConvolutionRowKernel<<<Dg,Db>>>((unsigned char*)pDataOutput, pitchOutput, (unsigned char*)pDatainput, pitchInput, width, height);
}