Hello everybody, i’m doing some image processing algorithms using CUDA, those algorithm worked fine on my old notebook in emulation mode, but now i moved to another cuda enabled pc and turning OFF the -deviceemu flag results in bad output images (scattered red green and blue points or colored horizontal lines). As i just said, those programs gave me the right output in emu mode. I’ll paste a kernel and the function caling it below:
[codebox]#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cutil.h>
#define i_mult(a, b) __mul24(a, b)
#define KERNEL_RADIUS 8
#define KERNEL_WIDTH (2 * KERNEL_RADIUS + 1)
#define ROW_TILE_W 128
#define KERNEL_RADIUS_ALIGNED 16
#define COLUMN_TILE_W 16
#define COLUMN_TILE_H 48
inline int iDivUp(int a, int b){ return (a % b != 0) ? (a / b + 1) : (a / b); }
inline int iDivDown(int a, int b){ return a / b; }
inline int iAlignUp(int a, int b){ return (a % b != 0) ? (a - a % b + b) : a; }
inline int iAlignDown(int a, int b){ return a - a % b; }
device constant float d_kernel[KERNEL_WIDTH];
const unsigned int KERNEL_SIZE = KERNEL_WIDTH * sizeof(float);
global void convolutionRow_GPU(float do_r, float do_g, float* do_b, float di_r, float di_g, float* di_b, int width, int height)
{
__shared__ float shared_r[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];
__shared__ float shared_g[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];
__shared__ float shared_b[KERNEL_RADIUS + ROW_TILE_W + KERNEL_RADIUS];
const int tileStart = i_mult(blockIdx.x, ROW_TILE_W);
const int tileEnd = tileStart + ROW_TILE_W - 1;
const int apronStart = tileStart - KERNEL_RADIUS;
const int apronEnd = tileEnd + KERNEL_RADIUS;
const int tileEndClamped = min(tileEnd, width - 1);
const int apronStartClamped = max(apronStart, 0);
const int apronEndClamped = min(apronEnd, width - 1);
const int rowStart = i_mult(blockIdx.y, width);
const int apronStartAligned = tileStart - KERNEL_RADIUS_ALIGNED;
const int loadPos = apronStartAligned + threadIdx.x;
if(loadPos >= apronStart)
{
const int smemPos = loadPos - apronStart;
shared_r[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? di_r[rowStart + loadPos] : 0;
shared_g[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? di_g[rowStart + loadPos] : 0;
shared_b[smemPos] = ((loadPos >= apronStartClamped) && (loadPos <= apronEndClamped)) ? di_b[rowStart + loadPos] : 0;
}
__syncthreads();
const int writePos = tileStart + threadIdx.x;
if(writePos <= tileEndClamped)
{
const int smemPos = writePos - apronStart;
float sum_r = 0, sum_g = 0, sum_b = 0;
for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
{
sum_r += shared_r[smemPos + k] * d_kernel[KERNEL_RADIUS - k];
sum_g += shared_g[smemPos + k] * d_kernel[KERNEL_RADIUS - k];
sum_b += shared_b[smemPos + k] * d_kernel[KERNEL_RADIUS - k];
}
do_r[rowStart + writePos] = sum_r;
do_g[rowStart + writePos] = sum_g;
do_b[rowStart + writePos] = sum_b;
}
}
global void convolutionColumn_GPU(float* do_r, float* do_g, float* do_b, float di_r, float di_g, float* di_b, int width, int height, int smemStride, int gmemStride)
{
__shared__ float shared_r[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
__shared__ float shared_g[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
__shared__ float shared_b[COLUMN_TILE_W * (KERNEL_RADIUS + COLUMN_TILE_H + KERNEL_RADIUS)];
const int tileStart = i_mult(blockIdx.y, COLUMN_TILE_H);
const int tileEnd = tileStart + COLUMN_TILE_H - 1;
const int apronStart = tileStart - KERNEL_RADIUS;
const int apronEnd = tileEnd + KERNEL_RADIUS;
const int tileEndClamped = min(tileEnd, height - 1);
const int apronStartClamped = max(apronStart, 0);
const int apronEndClamped = min(apronEnd, height - 1);
const int columnStart = i_mult(blockIdx.x, COLUMN_TILE_W) + threadIdx.x;
int smemPos = i_mult(threadIdx.y, COLUMN_TILE_W) + threadIdx.x;
int gmemPos = i_mult(apronStart + threadIdx.y, width) + columnStart;
for(int y = apronStart + threadIdx.y; y <= apronEnd; y += blockDim.y)
{
shared_r[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? di_r[gmemPos] : 0;
shared_g[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? di_g[gmemPos] : 0;
shared_b[smemPos] = ((y >= apronStartClamped) && (y <= apronEndClamped)) ? di_b[gmemPos] : 0;
smemPos += smemStride;
gmemPos += gmemStride;
}
__syncthreads();
smemPos = i_mult(threadIdx.y + KERNEL_RADIUS, COLUMN_TILE_W) + threadIdx.x;
gmemPos = i_mult(tileStart + threadIdx.y , width) + columnStart;
for(int y = tileStart + threadIdx.y; y <= tileEndClamped; y += blockDim.y)
{
float sum_r = 0, sum_g = 0, sum_b = 0;
for(int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++)
{
sum_r += shared_r[smemPos + i_mult(k, COLUMN_TILE_W)] * d_kernel[KERNEL_RADIUS - k];
sum_g += shared_g[smemPos + i_mult(k, COLUMN_TILE_W)] * d_kernel[KERNEL_RADIUS - k];
sum_b += shared_b[smemPos + i_mult(k, COLUMN_TILE_W)] * d_kernel[KERNEL_RADIUS - k];
}
do_r[gmemPos] = sum_r;
do_g[gmemPos] = sum_g;
do_b[gmemPos] = sum_b;
smemPos += smemStride;
gmemPos += gmemStride;
}
}
extern “C” void ConvolutionSeparable(float* hi_r, float* hi_g, float* hi_b, float* ho_r, float* ho_g, float* ho_b, unsigned int width, unsigned int height)
{
const unsigned int mem_size = sizeof(float) * width * height;
float *di_r, *di_g, *di_b, *do_r, *do_g, *do_b, *h_kernel, kernel_sum = 0;
h_kernel = (float*)malloc(KERNEL_SIZE);
CUDA_SAFE_CALL(cudaMalloc((void **)&di_r, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&do_r, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&di_g, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&do_g, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&di_b, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&do_b, mem_size));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_kernel, KERNEL_SIZE));
for(int i = 0; i < KERNEL_WIDTH; i++)
{
float dist = (float)(i - KERNEL_RADIUS) / (float)KERNEL_RADIUS;
h_kernel[i] = expf(- dist * dist / 2);
kernel_sum += h_kernel[i];
}
for(int i = 0; i < KERNEL_WIDTH; i++)
h_kernel[i] /= kernel_sum;
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_kernel, h_kernel, KERNEL_SIZE));
CUDA_SAFE_CALL(cudaMemcpy(di_r, hi_r, mem_size, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(di_g, hi_g, mem_size, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(di_b, hi_b, mem_size, cudaMemcpyHostToDevice));
dim3 blockGridRows(iDivUp(width, ROW_TILE_W), height);
dim3 blockGridColumns(iDivUp(width, COLUMN_TILE_W), iDivUp(height, COLUMN_TILE_H));
dim3 threadBlockRows(KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS);
dim3 threadBlockColumns(COLUMN_TILE_W, 8);
convolutionRow_GPU<<<blockGridRows, threadBlockRows>>>(do_r, do_g, do_b, di_r, di_g, di_b, width, height);
CUT_CHECK_ERROR("convolutionRow_GPU() execution failed\n");
CUDA_SAFE_CALL(cudaThreadSynchronize());
convolutionColumn_GPU<<<blockGridColumns, threadBlockColumns>>>(di_r, di_g, di_b, do_r, do_g, do_b, width, height, COLUMN_TILE_W * threadBlockColumns.y, width * threadBlockColumns.y);
CUT_CHECK_ERROR("convolutionColumn_GPU() execution failed\n");
CUDA_SAFE_CALL(cudaThreadSynchronize());
CUDA_SAFE_CALL(cudaMemcpy(ho_r, di_r, mem_size, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(ho_g, di_g, mem_size, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(ho_b, di_b, mem_size, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(d_kernel));
CUDA_SAFE_CALL(cudaFree(di_r));
CUDA_SAFE_CALL(cudaFree(do_r));
CUDA_SAFE_CALL(cudaFree(di_g));
CUDA_SAFE_CALL(cudaFree(do_g));
CUDA_SAFE_CALL(cudaFree(di_b));
CUDA_SAFE_CALL(cudaFree(do_b));
free(h_kernel);
}[/codebox]
Basically this is the convolutionSeparable test in the SDK extended for RGB images. In this particular case the output result and the input remains the same and i just can’t understand why. I tried to change the nvcc otions but nothing changed. I’m working on a 8600GT board and my nvcc options (at the moment) are:
[codebox]“C:\CUDA\bin\nvcc.exe” -ccbin “C:\Program Files\Microsoft Visual Studio 8\VC\bin” -O0 -arch sm_10 -code sm_10 --host-compilation C++ -c -m 32 -o “Debug\prova2.obj” -odir “Debug” -ext none -int none “c:\progetto uni\prova2\prova2.vcproj”[/codebox]
No runtime device/host errors… can somebody give me a hint about what’s happening? Thank you very much
Andrea