Wrong output for processed images when running in non-emulation mode

Hello everybody, I’m working on some image-processing algorithms in CUDA. They worked fine on my old notebook in emulation mode, but now that I’ve moved to another CUDA-enabled PC, turning OFF the -deviceemu flag results in bad output images (scattered red, green and blue points, or colored horizontal lines). As I just said, the same programs gave the right output in emu mode. I’ll paste a kernel and the function calling it below:

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <cutil.h>

#define i_mult(a, b) __mul24(a, b)

#define KERNEL_RADIUS 8

#define KERNEL_WIDTH (2 * KERNEL_RADIUS + 1)

#define ROW_TILE_W 128

#define KERNEL_RADIUS_ALIGNED 16

#define COLUMN_TILE_W 16

#define COLUMN_TILE_H 48

inline int iDivUp(int a, int b){ return (a % b != 0) ? (a / b + 1) : (a / b); }

inline int iDivDown(int a, int b){ return a / b; }

inline int iAlignUp(int a, int b){ return (a % b != 0) ? (a - a % b + b) : a; }

inline int iAlignDown(int a, int b){ return a - a % b; }
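// Gaussian convolution coefficients stored in GPU constant memory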

__device__ __constant__ float d_kernel[KERNEL_WIDTH];

const unsigned int KERNEL_SIZE = KERNEL_WIDTH * sizeof(float);

__global__ void convolutionRow_GPU(float* do_r, float* do_g, float* do_b, float* di_r, float* di_g, float* di_b, int width, int height)

{
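// Per-channel shared-memory buffers: one ROW_TILE_W tile plus a KERNEL_RADIUS apron on each side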

__shared__ float shared_r[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];

__shared__ float shared_g[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];

__shared__ float shared_b[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];

const int tileStart = i_mult(blockIdx.x, ROW_TILE_W);

const int           tileEnd = tileStart + ROW_TILE_W - 1;

const int        apronStart = tileStart - KERNEL_RADIUS;

const int          apronEnd = tileEnd   + KERNEL_RADIUS;

const int tileEndClamped = min(tileEnd, width - 1);

const int apronStartClamped = max(apronStart, 0);

const int   apronEndClamped = min(apronEnd, width - 1);

const int rowStart = i_mult(blockIdx.y, width);

const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;

const int loadPos = apronStartAligned + threadIdx.x;
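// Load phase: copy the tile and its apron into shared memory, zero-padding outside the image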

if(loadPos >= apronStart)

{

	const int smemPos = loadPos - apronStart;

shared_r[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? di_r[rowStart + loadPos] : 0;

    shared_g[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? di_g[rowStart + loadPos] : 0;

    shared_b[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? di_b[rowStart + loadPos] : 0;

}

__syncthreads();
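// Compute phase: each thread accumulates one output pixel of the row tile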

const int writePos = tileStart + threadIdx.x;

if(writePos <= tileEndClamped)

{

	const int smemPos = writePos - apronStart;

    float sum_r = 0, sum_g = 0, sum_b = 0;

for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)

	{

		sum_r += shared_r[smemPos + k] * d_kernel[KERNEL_RADIUS - k];

        sum_g += shared_g[smemPos + k] * d_kernel[KERNEL_RADIUS - k];

        sum_b += shared_b[smemPos + k] * d_kernel[KERNEL_RADIUS - k];

	}

do_r[rowStart + writePos] = sum_r;

	do_g[rowStart + writePos] = sum_g;

	do_b[rowStart + writePos] = sum_b;

}

}

__global__ void convolutionColumn_GPU(float* do_r, float* do_g, float* do_b, float* di_r, float* di_g, float* di_b, int width, int height, int smemStride, int gmemStride)

{
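// Per-channel shared-memory buffers: a COLUMN_TILE_W-wide tile of COLUMN_TILE_H rows plus a KERNEL_RADIUS apron above and below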

__shared__ float shared_r[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];

__shared__ float shared_g[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];

__shared__ float shared_b[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];

const int tileStart = i_mult(blockIdx.y, COLUMN_TILE_H);

const int           tileEnd = tileStart + COLUMN_TILE_H - 1;

const int        apronStart = tileStart - KERNEL_RADIUS;

const int          apronEnd = tileEnd   + KERNEL_RADIUS;

const int tileEndClamped = min(tileEnd, height - 1);

const int apronStartClamped = max(apronStart, 0);

const int   apronEndClamped = min(apronEnd, height - 1);

const int columnStart = i_mult(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;

int smemPos = i_mult(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;

int gmemPos = i_mult(apronStart + threadIdx.y, width) + columnStart;
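// Load phase: step down the column in blockDim.y-row increments, filling the tile and its apron (zero padding outside the image)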

for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y)

{

	shared_r[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? di_r[gmemPos] : 0;

	shared_g[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? di_g[gmemPos] : 0;

	shared_b[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? di_b[gmemPos] : 0;

smemPos += smemStride;

    gmemPos += gmemStride;

}

__syncthreads();

smemPos = i_mult(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;

gmemPos = i_mult(tileStart + threadIdx.y , width) + columnStart;
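// Compute phase: convolve down the column using the data cached in shared memory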

for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y)

{

	float sum_r = 0, sum_g = 0, sum_b = 0;

for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)

	{	

		sum_r += shared_r[smemPos + i_mult(k, COLUMN_TILE_W)] * d_kernel[KERNEL_RADIUS - k];

		sum_g += shared_g[smemPos + i_mult(k, COLUMN_TILE_W)] * d_kernel[KERNEL_RADIUS - k];

		sum_b += shared_b[smemPos + i_mult(k, COLUMN_TILE_W)] * d_kernel[KERNEL_RADIUS - k];

	}

	do_r[gmemPos] = sum_r;

	do_g[gmemPos] = sum_g;

	do_b[gmemPos] = sum_b;

smemPos += smemStride;

    gmemPos += gmemStride;

}

}

extern "C" void ConvolutionSeparable(float* hi_r, float* hi_g, float* hi_b, float* ho_r, float* ho_g, float* ho_b, unsigned int width, unsigned int height)

{

const unsigned int mem_size = sizeof(float) * width * height;

float *di_r, *di_g, *di_b, *do_r, *do_g, *do_b, *h_kernel, kernel_sum = 0;

h_kernel = (float*)malloc(KERNEL_SIZE);

CUDA_SAFE_CALL(cudaMalloc((void **)&di_r, mem_size));

CUDA_SAFE_CALL(cudaMalloc((void **)&do_r, mem_size));

CUDA_SAFE_CALL(cudaMalloc((void **)&di_g, mem_size));

CUDA_SAFE_CALL(cudaMalloc((void **)&do_g, mem_size));

CUDA_SAFE_CALL(cudaMalloc((void **)&di_b, mem_size));

CUDA_SAFE_CALL(cudaMalloc((void **)&do_b, mem_size));

CUDA_SAFE_CALL(cudaMalloc((void **)&d_kernel, KERNEL_SIZE));
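// Build a Gaussian filter kernel on the host and normalize it to unit sum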

for(int i = 0; i < KERNEL_WIDTH; i++)

{

    float dist = (float)(i - KERNEL_RADIUS) / (float)KERNEL_RADIUS;

	h_kernel[i] = expf(- dist * dist / 2);

    kernel_sum += h_kernel[i];

}

for(int i = 0; i < KERNEL_WIDTH; i++)

    h_kernel[i] /= kernel_sum;

CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_kernel, h_kernel, KERNEL_SIZE));

CUDA_SAFE_CALL(cudaMemcpy(di_r, hi_r, mem_size, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(di_g, hi_g, mem_size, cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(di_b, hi_b, mem_size, cudaMemcpyHostToDevice));
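// Launch configuration: the row pass uses one block per ROW_TILE_W-pixel row segment,
// the column pass uses COLUMN_TILE_W x 8 thread blocks over COLUMN_TILE_W x COLUMN_TILE_H tiles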

dim3 blockGridRows(iDivUp(width, ROW_TILE_W), height);

dim3 blockGridColumns(iDivUp(width, COLUMN_TILE_W), iDivUp(height, COLUMN_TILE_H));

dim3 threadBlockRows(KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS);

dim3 threadBlockColumns(COLUMN_TILE_W, 8);

convolutionRow_GPU<<<blockGridRows, threadBlockRows>>>(do_r, do_g, do_b, di_r, di_g, di_b, width, height);

CUT_CHECK_ERROR("convolutionRow_GPU() execution failed\n");



CUDA_SAFE_CALL(cudaThreadSynchronize());
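// The column pass reads the row-filtered data from do_* and writes the final result back into di_*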

convolutionColumn_GPU<<<blockGridColumns, threadBlockColumns>>>(di_r, di_g, di_b, do_r, do_g, do_b, width, height, COLUMN_TILE_W * threadBlockColumns.y, width * threadBlockColumns.y);

CUT_CHECK_ERROR("convolutionColumn_GPU() execution failed\n");

CUDA_SAFE_CALL(cudaThreadSynchronize());

CUDA_SAFE_CALL(cudaMemcpy(ho_r, di_r, mem_size, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(ho_g, di_g, mem_size, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(ho_b, di_b, mem_size, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaFree(d_kernel));

CUDA_SAFE_CALL(cudaFree(di_r));

CUDA_SAFE_CALL(cudaFree(do_r));

CUDA_SAFE_CALL(cudaFree(di_g));

CUDA_SAFE_CALL(cudaFree(do_g));

CUDA_SAFE_CALL(cudaFree(di_b));

CUDA_SAFE_CALL(cudaFree(do_b));

free(h_kernel);

}[/codebox]

Basically this is the convolutionSeparable sample from the SDK extended to RGB images. In this particular case the output comes out identical to the input, and I just can’t understand why. I tried changing the nvcc options, but nothing changed. I’m working on an 8600GT board, and my nvcc options (at the moment) are:

[codebox]"C:\CUDA\bin\nvcc.exe" -ccbin "C:\Program Files\Microsoft Visual Studio 8\VC\bin" -O0 -arch sm_10 -code sm_10 --host-compilation C++ -c -m 32 -o "Debug\prova2.obj" -odir "Debug" -ext none -int none "c:\progetto uni\prova2\prova2.vcproj"[/codebox]

No runtime device/host errors… can somebody give me a hint about what’s happening? Thank you very much

Andrea