2D Convolution Coalescing problems

I coalesced my code, but for some reason it won’t output the correct image anymore… Can anyone help me out?

Output Image:

My code:

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#define KERNEL_RADIUS 1

#define BLOCK_DIM 1

#define KERNEL_W (2 * KERNEL_RADIUS + 1)

#define ROW_TILE_W 128

#define ROW_TILE_H 128

#define COLUMN_TILE_W 16

#define COLUMN_TILE_H 48

#define IMUL(a,b) __mul24(a,b)

__global__ void ConvolutionRowKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
	// Horizontal box filter over RGBA pixels (4 bytes/pixel), first attempt.
	// NOTE(review): several problems in this version, pointed out in the replies:
	//  - each thread stages only its own pixel, so the KERNEL_RADIUS halo on
	//    either side of the tile is never loaded into shared memory;
	//  - shared memory is indexed by threadIdx.x only, so with a 2D block,
	//    threads with different threadIdx.y clobber each other (harmless only
	//    because BLOCK_DIM is currently 1);
	//  - convolutionResult is accumulated without ever being zeroed;
	//  - the sum loop runs i = 0..KERNEL_W INCLUSIVE (KERNEL_W+1 taps);
	//  - output is written at block-relative pixelOutput[threadIdx.x] instead
	//    of the global column x;
	//  - __syncthreads() sits inside a divergent if, which is undefined when
	//    some threads of the block fail the bounds test.
	__shared__ uchar4 s_data_Input[KERNEL_RADIUS + BLOCK_DIM + KERNEL_RADIUS];
	int x = blockIdx.x * blockDim.x + threadIdx.x;  // global pixel column
	int y = blockIdx.y * blockDim.y + threadIdx.y;  // global pixel row
	if((x < width) && (y < height))
	{	
		// Byte pointer to the start of row y (pitchInput is the row stride in bytes).
		unsigned char *pixelInput = (surfaceInput+y*pitchInput);
		// Stage this thread's pixel only (R, G, B; .w is left unset).
		s_data_Input[threadIdx.x].x = pixelInput[4*x];
		s_data_Input[threadIdx.x].y = pixelInput[4*x+1];
		s_data_Input[threadIdx.x].z = pixelInput[4*x+2];
		__syncthreads();
		uchar3 convolutionResult;  // BUG: never zero-initialized before the += loop
		for(int i=0; i <= KERNEL_W; i++)  // BUG: <= overruns by one tap
		{	
			convolutionResult.x += s_data_Input[i].x;
			convolutionResult.y += s_data_Input[i].y;
			convolutionResult.z += s_data_Input[i].z;
		}
		// Box filter normalization: divide the sum by the kernel width.
		convolutionResult.x /= KERNEL_W;
		convolutionResult.y /= KERNEL_W;
		convolutionResult.z /= KERNEL_W;
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		
		// BUG: should be pixelOutput[x] — threadIdx.x is block-relative.
		pixelOutput[threadIdx.x].x = convolutionResult.x;
		pixelOutput[threadIdx.x].y = convolutionResult.y;
		pixelOutput[threadIdx.x].z = convolutionResult.z;
		pixelOutput[threadIdx.x].w = 1;
		__syncthreads();
	}
}

__global__ void ConvolutionColumnKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
	// Vertical box filter, first attempt.
	// NOTE(review): shares the row kernel's bugs (single pixel staged per thread,
	// uninitialized accumulator, i <= KERNEL_W overrun, block-relative output
	// index, __syncthreads inside a divergent branch), plus: smemPos is a
	// shared-memory tile offset but is used below as a byte offset into the input
	// row, and the COLUMN_TILE_* dimensions bear no relation to the block
	// dimensions actually launched (see the reply below the code).
	__shared__ uchar4 s_data_Input[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	// Intended position of this thread's pixel inside the row-major shared tile,
	// shifted down by the KERNEL_RADIUS halo rows.
	int smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
	if((x < width) && (y < height))
	{
		unsigned char* pixelInput = (surfaceInput + y*pitchInput);
		// BUG: stores to s_data_Input[threadIdx.x] rather than smemPos, and the
		// load addresses mix tile coordinates with global byte offsets.
		s_data_Input[threadIdx.x].x = pixelInput[smemPos + IMUL(x, COLUMN_TILE_W)]; //R
		s_data_Input[threadIdx.x].y = pixelInput[smemPos + IMUL(x+1, COLUMN_TILE_W)]; //G
		s_data_Input[threadIdx.x].z = pixelInput[smemPos + IMUL(x+2, COLUMN_TILE_W)]; //B
		__syncthreads();
		uchar3 convolutionResult;  // BUG: never zero-initialized
		for(int i=0; i<=KERNEL_W; i++)  // BUG: one tap too many
		{
			convolutionResult.x += s_data_Input[i].x;
			convolutionResult.y += s_data_Input[i].y;
			convolutionResult.z += s_data_Input[i].z;
		}
		
		convolutionResult.x /= KERNEL_W;
		convolutionResult.y /= KERNEL_W;
		convolutionResult.z /= KERNEL_W;
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		// BUG: should be pixelOutput[x] — threadIdx.x is block-relative.
		pixelOutput[threadIdx.x].x = convolutionResult.x;
		pixelOutput[threadIdx.x].y = convolutionResult.y;
		pixelOutput[threadIdx.x].z = convolutionResult.z;
		pixelOutput[threadIdx.x].w = 1;
		__syncthreads();	
	}
}

void cuda_kernel(void* pDataOutput, size_t pitchOutput, void* pDatainput, size_t pitchInput, int width, int height)
{
	// Host launcher, first attempt.
	// NOTE(review): BLOCK_DIM is 1, so these are 1x1 thread blocks; also BOTH
	// kernels read the original input, so the second launch overwrites the
	// first pass's result rather than composing with it.
	dim3 Db = dim3(BLOCK_DIM, BLOCK_DIM);
	dim3 Dg = dim3((width+Db.x-1)/Db.x, (height+Db.y-1)/Db.y);  // ceil-div grid
	ConvolutionColumnKernel<<<Dg,Db>>>((unsigned char*)pDataOutput, pitchOutput, (unsigned char*)pDatainput, pitchInput, width, height);
	ConvolutionRowKernel<<<Dg,Db>>>((unsigned char*)pDataOutput, pitchOutput, (unsigned char*)pDatainput, pitchInput, width, height);
}

A few ideas: first, for the horizontal it looks like you are only copying one pixel to shared memory per thread, but to produce all your output pixels you need BLOCKSIZE + 2*KERNEL_RADIUS.

Second, for the horizontal, if you use a 2D block, as in Db = dim3(BLOCK_DIM, BLOCK_DIM), then threads with the same x value but different y values will clobber each other when they try to use shared memory indexed only on x. BLOCK_DIM is 1 right now so that’s not causing the image artifact but presumably you want to increase it.

Third, for the vertical, you seem to have the same problem where you are only copying one pixel per thread, but you need a total of COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS). There does not appear to be any relation between the tile dimensions and the block dimensions.

If all the uninitialized shared memory elements are zero, then you might get away with something that produces recognizable output, even though it’s not correct. But if some are nonzero they will corrupt your image in a very visible way, which is what I think you’re seeing.

Yeah well that’s indeed what I’m trying to achieve… The fact is that I want to read my pixels into shared memory, in parallel on all processors… I previously did it with a for loop, but that’s not optimal… Can you help me out with that?

Yeah, of course — this is just a standard value… Thanks for your explanation though…

Same as the horizontal filter…

I see, thanks for your help… It would be nice if you could explain to me how to avoid a for loop and make the processor cores read the needed pixels in parallel…

Grtz!

  • Flokky

One thing I’ve done before in copying data to shared memory, when the number of things to be copied is not the same as BLOCKSIZE is to use a loop like this:

for (int i=threadIdx.x; i < N; i += BLOCKSIZE) {

  // copy the ith pixel

}

__syncthreads();

This is useful any time a block has N units of work to do and N is not equal to BLOCKSIZE, such as copying data to shared memory including the neighbor pixels (where N would be BLOCKSIZE + 2*KERNEL_RADIUS), or for initializing a lookup table for example.

Even though it’s a for-loop, it’s still efficient because the threads cooperate, so each thread goes through the loop only N/BLOCKSIZE times.

In your case it could look something like this for the horizontal (I haven’t tested this).

__shared__ uchar4 s_data_Input[KERNEL_RADIUS + BLOCK_DIM + KERNEL_RADIUS];
	unsigned char *rowStart = surfaceInput + y*pitchInput;
	int basex = blockIdx.x * blockDim.x - KERNEL_RADIUS;  // global column of tile entry 0
	// Cooperative copy: the block's blockDim.x threads stride across all
	// BLOCK_DIM + 2*KERNEL_RADIUS tile entries (body plus halo).
	for (int copyidx=threadIdx.x; copyidx < BLOCK_DIM+2*KERNEL_RADIUS; copyidx += blockDim.x) {
		// Skip tile entries that fall outside the image.
		// (fixed: first test used 'copyIdx', which is not the declared loop variable)
		if (basex + copyidx < 0 || basex + copyidx >= width) {
			continue;
		}
		s_data_Input[copyidx].x = rowStart[4*(basex+copyidx)];
		s_data_Input[copyidx].y = rowStart[4*(basex+copyidx)+1];
		s_data_Input[copyidx].z = rowStart[4*(basex+copyidx)+2];
	}
	__syncthreads();
	// kill "excess" threads after copying, otherwise copy may have gaps from dead threads
	if((x >= width) || (y >= height)) {
		return;
	}

Still stuck here :unsure: I tweaked my code, but I’m still not getting the correct output… :wacko:

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#define KERNEL_RADIUS 1

#define BLOCK_DIM 1

#define KERNEL_W (2 * KERNEL_RADIUS + 1)

#define ROW_TILE_W 128

#define ROW_TILE_H 128

#define COLUMN_TILE_W 16

#define COLUMN_TILE_H 48

#define IMUL(a,b) __mul24(a,b)

__global__ void ConvolutionRowKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
	// Horizontal box filter, second attempt: now stages the halo cooperatively.
	// NOTE(review): remaining problems, fixed in the corrected version further
	// down the thread:
	//  - the copy loop is INSIDE the bounds test, so partial edge blocks may
	//    not stage the whole tile;
	//  - gx+i is not clamped to [0, width), so edge blocks read out of bounds;
	//  - convolutionResult is accumulated without being zero-initialized;
	//  - the sum loop runs i = 0..KERNEL_W inclusive and indexes s_data_Input[i]
	//    instead of s_data_Input[threadIdx.x+i];
	//  - output is written at block-relative threadIdx.x instead of global x;
	//  - __syncthreads() inside a divergent branch.
	__shared__ uchar4 s_data_Input[KERNEL_RADIUS + BLOCK_DIM + KERNEL_RADIUS];
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	if((x < width) && (y < height))
	{	
		unsigned char *pixelInput = (surfaceInput+y*pitchInput);
		int gx = blockIdx.x * blockDim.x - KERNEL_RADIUS;  // global column of tile entry 0
		// Cooperative staging of tile + halo (no bounds guard yet).
		for (int i=threadIdx.x; i<BLOCK_DIM+2*KERNEL_RADIUS; i+=BLOCK_DIM)
		{
			s_data_Input[i].x = pixelInput[4*(gx+i)]; //R
			s_data_Input[i].y = pixelInput[4*(gx+i)+1]; //G
			s_data_Input[i].z = pixelInput[4*(gx+i)+2]; //B
		}
		__syncthreads();
		uchar3 convolutionResult;  // BUG: never zero-initialized
		for(int i=0; i <= KERNEL_W; i++)  // BUG: one tap too many; not thread-dependent
		{	
			convolutionResult.x += s_data_Input[i].x;
			convolutionResult.y += s_data_Input[i].y;
			convolutionResult.z += s_data_Input[i].z;
		}
		convolutionResult.x /= KERNEL_W;
		convolutionResult.y /= KERNEL_W;
		convolutionResult.z /= KERNEL_W;
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		
		// BUG: should be pixelOutput[x].
		pixelOutput[threadIdx.x].x = convolutionResult.x;
		pixelOutput[threadIdx.x].y = convolutionResult.y;
		pixelOutput[threadIdx.x].z = convolutionResult.z;
		pixelOutput[threadIdx.x].w = 1;
		__syncthreads();
	}
}

__global__ void ConvolutionColumnKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
	// Vertical box filter, second attempt.
	// NOTE(review): the staging loop now strides over the whole tile, but it
	// still mixes shared-tile indices (i, smemPos) with global byte offsets, the
	// loop is inside the bounds test, the accumulator is uninitialized, the sum
	// loop overruns (<= KERNEL_W) and ignores the thread position, and output is
	// written at block-relative threadIdx.x. The 2D-tile version further down
	// the thread replaces this addressing scheme entirely.
	__shared__ uchar4 s_data_Input[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	// Intended tile position of this thread's pixel (row-major, halo-shifted).
	int smemPos = IMUL(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
	if((x < width) && (y < height))
	{
		unsigned char* pixelInput = (surfaceInput + y*pitchInput);
		int gx = blockIdx.x * blockDim.x - KERNEL_RADIUS;
		for (int i=threadIdx.x; i<COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS); i+=BLOCK_DIM)
		{
			// BUG: tile index i combined with smemPos and gx as a byte offset
			// into a single input row — does not address the intended pixels.
			s_data_Input[i].x = pixelInput[smemPos + IMUL((gx+i), COLUMN_TILE_W)]; //R
			s_data_Input[i].y = pixelInput[smemPos + IMUL((gx+i)+1, COLUMN_TILE_W)]; //G
			s_data_Input[i].z = pixelInput[smemPos + IMUL((gx+i)+2, COLUMN_TILE_W)]; //B
		}
		__syncthreads();
		uchar3 convolutionResult;  // BUG: never zero-initialized
		for(int i=0; i<=KERNEL_W; i++)  // BUG: one tap too many; not thread-dependent
		{
			convolutionResult.x += s_data_Input[i].x;
			convolutionResult.y += s_data_Input[i].y;
			convolutionResult.z += s_data_Input[i].z;
		}
		
		convolutionResult.x /= KERNEL_W;
		convolutionResult.y /= KERNEL_W;
		convolutionResult.z /= KERNEL_W;
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		// BUG: should be pixelOutput[x].
		pixelOutput[threadIdx.x].x = convolutionResult.x;
		pixelOutput[threadIdx.x].y = convolutionResult.y;
		pixelOutput[threadIdx.x].z = convolutionResult.z;
		pixelOutput[threadIdx.x].w = 1;
		__syncthreads();	
	}
}

void cuda_kernel(void* pDataOutput, size_t pitchOutput, void* pDatainput, size_t pitchInput, int width, int height)
{
	// Host launcher, second attempt (unchanged from the first).
	// NOTE(review): BLOCK_DIM is still 1 (1x1 blocks), and both kernels still
	// read the original input, so the second launch overwrites the first
	// pass's output instead of filtering it.
	dim3 Db = dim3(BLOCK_DIM, BLOCK_DIM);
	dim3 Dg = dim3((width+Db.x-1)/Db.x, (height+Db.y-1)/Db.y);  // ceil-div grid
	ConvolutionColumnKernel<<<Dg,Db>>>((unsigned char*)pDataOutput, pitchOutput, (unsigned char*)pDatainput, pitchInput, width, height);
	ConvolutionRowKernel<<<Dg,Db>>>((unsigned char*)pDataOutput, pitchOutput, (unsigned char*)pDatainput, pitchInput, width, height);
}

Here’s what I’ve got for the horizontal, with commentary added:

[codebox]__global__ void ConvolutionRowKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
// (fixed: the forum markup had stripped the underscores from __global__)
__shared__ uchar4 s_data_Input[KERNEL_RADIUS + BLOCK_DIM + KERNEL_RADIUS];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y;  // block height must always be 1, gridDim.y must always be height
unsigned char *pixelInput = (surfaceInput+y*pitchInput);
int gx = blockIdx.x * blockDim.x - KERNEL_RADIUS;
for (int i=threadIdx.x; i<BLOCK_DIM+2*KERNEL_RADIUS; i += BLOCK_DIM)
{
	if (gx+i < 0 || gx+i >= width) {  // theoretically this is necessary to prevent
		continue;  // accessing beyond array limits, but seems to have no effect
	}			 // kernel didn't crash without it, even with a moderately large radius
	s_data_Input[i].x = pixelInput[4*(gx+i)]; //R
	s_data_Input[i].y = pixelInput[4*(gx+i)+1]; //G
	s_data_Input[i].z = pixelInput[4*(gx+i)+2]; //B
}
__syncthreads();
// The condition for x and y being within bounds has moved below the copy, because otherwise, the partial blocks
// might not copy enough
// changed condition to only calculate those pixels that have full support within the image
// the left or right ends of s_data_Input might be uninitialized for edge blocks, so keeping this within bounds
// keeps us within valid pixel data
if(x >= KERNEL_RADIUS && x < width-KERNEL_RADIUS) {
	float3 convolutionResult;  // needs to be float3, or the sum may easily overflow unsigned char
	// int3 could perhaps work, but probably slower than float
	convolutionResult.x = 0;  // must be initialized to zero, or sum will contain junk
	convolutionResult.y = 0;
	convolutionResult.z = 0;
	for(int i=0; i < KERNEL_W; i++)  // up to but not including KERNEL_W.  Loop from zero up to and including KERNEL_W-1
	{	
		convolutionResult.x += s_data_Input[threadIdx.x+i].x;  // has to be dependent on thread id.
		convolutionResult.y += s_data_Input[threadIdx.x+i].y;
		convolutionResult.z += s_data_Input[threadIdx.x+i].z;
	}
	convolutionResult.x /= KERNEL_W;  // could be refactored so accumulation uses multiply-add, instead of 
	convolutionResult.y /= KERNEL_W;  // division at the end.  probably not an issue because I assume 
	convolutionResult.z /= KERNEL_W;  // eventually we'll extend to use a kernel anyway
	uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
	
	pixelOutput[x].x = convolutionResult.x;  // need to store at global x, not threadIdx.x, which is block relative
	pixelOutput[x].y = convolutionResult.y;
	pixelOutput[x].z = convolutionResult.z;
	pixelOutput[x].w = 1;
}
else if (x < width) {  // (optional) clobbering of the edge pixels instead of leaving them uninitialized
	uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
	pixelOutput[x].x = 0;
	pixelOutput[x].y = 0;
	pixelOutput[x].z = 0;
	pixelOutput[x].w = 1;
}
// synchronization at the end has no effect
}[/codebox]

Launched with a block height of 1, like so:

dim3 Db = dim3(BLOCK_DIM);

	dim3 Dg = dim3((width+Db.x-1)/Db.x, height);

	ConvolutionRowKernel<<<Dg, Db>>>(devOutput, pitch, devInput, pitch, width, height);

One thing I encountered, which has also caused headaches for me in the past, is getting confused between which values are common among threads in a block, versus which ones are specific to a thread. For example, whether I need to use i, or threadIdx.x+i. I have contemplated using some sort of perhaps hungarian notation to distinguish them, but so far I just strain to remember them all (and get errors when I forget).

I’ll take a look at the column convolution next.

This appears to work for the vertical:

[codebox]
__global__ void ConvolutionColumnKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
// (fixed: the forum markup had stripped the underscores from __global__)
__shared__ uchar4 s_data_Input[COLUMN_TILE_H+2*KERNEL_RADIUS][COLUMN_TILE_W];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned char *pixelInput = surfaceInput + pitchInput*(blockIdx.y*blockDim.y - KERNEL_RADIUS);
for (int srow = threadIdx.y; srow < COLUMN_TILE_H+2*KERNEL_RADIUS; srow += COLUMN_TILE_H) {
	if (srow < 0 || srow >= height) {
		continue;
	}
	s_data_Input[srow][threadIdx.x].x = pixelInput[srow*pitchInput + x*4];
	s_data_Input[srow][threadIdx.x].y = pixelInput[srow*pitchInput + x*4+1];
	s_data_Input[srow][threadIdx.x].z = pixelInput[srow*pitchInput + x*4+2];
}
__syncthreads();
if(x < width && y >= KERNEL_RADIUS && y < height-KERNEL_RADIUS) {
    float3 convolutionResult;
	convolutionResult.x = 0;
	convolutionResult.y = 0;
	convolutionResult.z = 0;
    for(int i=0; i<KERNEL_W; i++) {
        convolutionResult.x += s_data_Input[threadIdx.y+i][threadIdx.x].x;
        convolutionResult.y += s_data_Input[threadIdx.y+i][threadIdx.x].y;
        convolutionResult.z += s_data_Input[threadIdx.y+i][threadIdx.x].z;
    }
    convolutionResult.x /= KERNEL_W;
    convolutionResult.y /= KERNEL_W;
    convolutionResult.z /= KERNEL_W;
    uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
    pixelOutput[x].x = convolutionResult.x;  // fixed: was 'pixelOutput.x', a transcription typo
    pixelOutput[x].y = convolutionResult.y;
    pixelOutput[x].z = convolutionResult.z;
    pixelOutput[x].w = 1;
}
}[/codebox]

Some of the same problem, confusing thread-specific values with values that are common to all threads within a block, or block-relative buffers with globally addressed buffers.

I invoke it as follows:

#define COLUMN_TILE_W 16

#define COLUMN_TILE_H 16

...

		dim3 Db = dim3(COLUMN_TILE_W, COLUMN_TILE_H);

		dim3 Dg = dim3((width+Db.x-1)/Db.x, (height+Db.y-1)/Db.y);

		ConvolutionColumnKernel<<<Dg, Db>>>(devOutput, pitch, devInput, pitch, width, height);

Sorry for my late reaction, was trying to understand the code you suggested before asking some new questions…

Now I implemented the two kernels, but I still experience the same problem… If I combine the two kernels they do not work, so I guess I’m still doing something wrong in my execution grid or something…

My current implementation now looks like this:

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#define KERNEL_RADIUS 8

#define BLOCK_DIM 8

#define KERNEL_W (2 * KERNEL_RADIUS + 1)

#define COLUMN_TILE_W 16

#define COLUMN_TILE_H 16

__global__ void ConvolutionRowKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
	// Horizontal box filter of width KERNEL_W over RGBA pixels (4 bytes/pixel).
	// Launch contract: block = dim3(BLOCK_DIM) (height 1), grid = (ceil(width/BLOCK_DIM), height).
	__shared__ uchar4 s_data_Input[KERNEL_RADIUS + BLOCK_DIM + KERNEL_RADIUS];
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y; // block height must always be 1, gridDim.y must always be height
	unsigned char *pixelInput = (surfaceInput+y*pitchInput);
	int gx = blockIdx.x * blockDim.x - KERNEL_RADIUS;  // global column of tile entry 0
	// Cooperative copy of the tile plus its KERNEL_RADIUS halo on each side.
	for (int i=threadIdx.x; i<BLOCK_DIM+2*KERNEL_RADIUS; i += BLOCK_DIM)
	{ 
		// FIX: skip halo entries outside the image — the first and last blocks
		// otherwise read before the row start / past its end.
		if (gx+i < 0 || gx+i >= width)
		{
			continue;
		}
		s_data_Input[i].x = pixelInput[4*(gx+i)]; //R
		s_data_Input[i].y = pixelInput[4*(gx+i)+1]; //G
		s_data_Input[i].z = pixelInput[4*(gx+i)+2]; //B
	}
	// Barrier is outside any divergent branch: every thread reaches it.
	__syncthreads();
	// Only pixels with full kernel support inside the image are filtered.
	if(x >= KERNEL_RADIUS && x < width-KERNEL_RADIUS) 
	{
		float3 convolutionResult;  // float so the sum cannot overflow unsigned char
		convolutionResult.x = 0; 
		convolutionResult.y = 0;
		convolutionResult.z = 0;
		for(int i=0; i < KERNEL_W; i++) 
		{
			convolutionResult.x += s_data_Input[threadIdx.x+i].x;
			convolutionResult.y += s_data_Input[threadIdx.x+i].y;
			convolutionResult.z += s_data_Input[threadIdx.x+i].z;
		}
		convolutionResult.x /= KERNEL_W; 
		convolutionResult.y /= KERNEL_W; 
		convolutionResult.z /= KERNEL_W;
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		pixelOutput[x].x = convolutionResult.x; 
		pixelOutput[x].y = convolutionResult.y;
		pixelOutput[x].z = convolutionResult.z;
		pixelOutput[x].w = 1;
	}
	else if (x < width)
	{
		// FIX: write a defined value at the edge columns (no full kernel support)
		// so a following pass never reads uninitialized output memory.
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		pixelOutput[x].x = 0;
		pixelOutput[x].y = 0;
		pixelOutput[x].z = 0;
		pixelOutput[x].w = 1;
	}
}

__global__ void ConvolutionColumnKernel(unsigned char* surfaceOutput, size_t pitchOutput, unsigned char* surfaceInput, size_t pitchInput, int width, int height)
{
	// Vertical box filter of width KERNEL_W over RGBA pixels (4 bytes/pixel).
	// Launch contract: block = dim3(COLUMN_TILE_W, COLUMN_TILE_H).
	__shared__ uchar4 s_data_Input[COLUMN_TILE_H+2*KERNEL_RADIUS][COLUMN_TILE_W];
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	int y = blockIdx.y * blockDim.y + threadIdx.y;
	// pixelInput is based at the first halo row ABOVE this block's tile,
	// i.e. global row (blockIdx.y*blockDim.y - KERNEL_RADIUS).
	unsigned char *pixelInput = surfaceInput + pitchInput*(blockIdx.y*blockDim.y - KERNEL_RADIUS);
	// Cooperative copy of the tile plus its halo rows.
	for (int srow = threadIdx.y; srow < COLUMN_TILE_H+2*KERNEL_RADIUS; srow += COLUMN_TILE_H) 
	{
		// FIX: bound-check the GLOBAL row — srow is tile-relative, so the top
		// blocks otherwise read negative rows and the bottom blocks read past
		// height; also skip columns beyond the image width.
		int grow = blockIdx.y*blockDim.y - KERNEL_RADIUS + srow;
		if (x >= width || grow < 0 || grow >= height)
		{
			continue;
		}
		s_data_Input[srow][threadIdx.x].x = pixelInput[srow*pitchInput + x*4];
		s_data_Input[srow][threadIdx.x].y = pixelInput[srow*pitchInput + x*4+1];
		s_data_Input[srow][threadIdx.x].z = pixelInput[srow*pitchInput + x*4+2];
	}
	// Barrier is outside any divergent branch: every thread reaches it.
	__syncthreads();
	// Only pixels with full kernel support inside the image are filtered;
	// NOTE(review): the first/last KERNEL_RADIUS rows of the output are left
	// unwritten here (the row kernel zero-fills its edges, this one does not).
	if(x < width && y >= KERNEL_RADIUS && y < height-KERNEL_RADIUS) 
	{
		float3 convolutionResult;  // float so the sum cannot overflow unsigned char
		convolutionResult.x = 0;
		convolutionResult.y = 0;
		convolutionResult.z = 0;
		for(int i=0; i<KERNEL_W; i++) 
		{
			convolutionResult.x += s_data_Input[threadIdx.y+i][threadIdx.x].x;
			convolutionResult.y += s_data_Input[threadIdx.y+i][threadIdx.x].y;
			convolutionResult.z += s_data_Input[threadIdx.y+i][threadIdx.x].z;
		}
		convolutionResult.x /= KERNEL_W;
		convolutionResult.y /= KERNEL_W;
		convolutionResult.z /= KERNEL_W;
		uchar4* pixelOutput = (uchar4*)(surfaceOutput + y*pitchOutput);
		pixelOutput[x].x = convolutionResult.x;
		pixelOutput[x].y = convolutionResult.y;
		pixelOutput[x].z = convolutionResult.z;
		pixelOutput[x].w = 1;
	}
}

void cuda_kernel(void* pDataOutput, size_t pitchOutput, void* pDatainput, size_t pitchInput, int width, int height)
{
	// Separable box filter driver: horizontal pass, then vertical pass.
	// FIX: the vertical pass must consume the HORIZONTAL pass's output — both
	// kernels previously read pDatainput, so the second launch overwrote the
	// first. Stage the intermediate image in a pitched temporary buffer.
	// NOTE(review): allocating per call is simple but slow; hoist the temp
	// buffer out of this function if it is called every frame.
	unsigned char* pTemp = NULL;
	size_t pitchTemp = 0;
	if (cudaMallocPitch((void**)&pTemp, &pitchTemp, (size_t)width * 4, height) != cudaSuccess)
	{
		return;  // allocation failed; leave the output untouched
	}
	dim3 DbR = dim3(BLOCK_DIM);  // row kernel requires block height 1 ...
	dim3 DgR = dim3((width+DbR.x-1)/DbR.x, height);  // ... and one grid row per image row
	dim3 DbC = dim3(COLUMN_TILE_W, COLUMN_TILE_H);  // column kernel block must match its tile dims
	dim3 DgC = dim3((width+DbC.x-1)/DbC.x, (height+DbC.y-1)/DbC.y);
	ConvolutionRowKernel<<<DgR,DbR>>>(pTemp, pitchTemp, (unsigned char*)pDatainput, pitchInput, width, height);
	ConvolutionColumnKernel<<<DgC, DbC>>>((unsigned char*)pDataOutput, pitchOutput, pTemp, pitchTemp, width, height);
	cudaFree(pTemp);
}

I haven’t looked over the whole implementation, but the output of the first one has to be the input of the second one. Are they working for you individually?