Hi!
I’ve got a quite similar problem. I wrote a small Thresholdfilter to test basic CUDA capabilities as ITK filter.
So this is my code so far:
Kernel
#ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_
#define _MYTHRESHOLDFILTERCUDAKERNEL_H_
#include "MyThresholdFilterCUDA.h"
/**
paints every pixel < threshold white
*/
__global__ void
thresholdFilter( unsigned short* d_InputImage, unsigned short* d_OutputImage, int width, int height, unsigned short threshold)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
//Check if pixel is in picture
if(bx*BLOCK_SIZE+tx<width && by*BLOCK_SIZE+ty<height)
{
int ind = width*(by*BLOCK_SIZE+ty-1)+(bx*BLOCK_SIZE+tx);
unsigned short pix = (d_InputImage[ind]);
if(pix<threshold)
{
d_OutputImage[ind] = 255;
}
else
{
d_OutputImage[ind] = pix;
}
}
}
#endif // #ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_
Host program:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
//CUDA Includes
#include <cutil_inline.h>
#include "MyThresholdFilterCUDAKernel.cu"
#include "MyThresholdFilterCUDA.h"
namespace thcuda{
void ThresholdCUDA(const unsigned short* inputImage,
unsigned short* outputImage, int width, int height, int insize, int outsize, int threshold)
{
//CUDA Code
cudaSetDevice( cutGetMaxGflopsDeviceId() );
unsigned short* d_InputImage;
// allocate device memory
cutilSafeCall(cudaMalloc((void**) &d_InputImage, insize));
//printf("malloc\n %d %d %d\n", inputImage[84*343+156], insize, outsize );
//copy data from host do device
cutilSafeCall(cudaMemcpy(d_InputImage, inputImage, insize,cudaMemcpyHostToDevice) );
// allocate device memory for result
unsigned short* d_OutputImage;
cutilSafeCall(cudaMalloc((void**) &d_OutputImage, insize));
// setup execution parameters
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
int gridx, gridy;
//calculate gridsize
gridx = width / BLOCK_SIZE;
if(width % BLOCK_SIZE != 0)
{
gridx += 1;
}
gridy = height / BLOCK_SIZE;
if(height % BLOCK_SIZE != 0)
{
gridy += 1;
}
dim3 grid(gridx, gridy);
// execute the kernel
thresholdFilter<<< grid, threads >>>(d_InputImage,d_OutputImage,width,height,threshold);
// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");
//copy result from device to host
cutilSafeCall(cudaMemcpy(outputImage, d_OutputImage, outsize,cudaMemcpyDeviceToHost) );
//printf("memcopyretour\n %d\n", ((unsigned short*)outputImage)[84*343+156] );
cutilSafeCall(cudaFree(d_InputImage));
cutilSafeCall(cudaFree(d_OutputImage));
cudaThreadExit();
}
}
Everytime I run my filter (as a member of an ITK Filterpipeline with a 8bit .TIF picture as data input) I get this error:
cutilCheckMsg cudaThreadSynchronize error: Kernel execution failed in file <MyThresholdFilterCUDA.cu>, line 57 : unspecified launch failure.
I found out, that the whole pipeline works just fine when I overwrite the InputImage with the new pixel value instead of writing it in the OutputImage (of course I also changed the deviceToHost memcopy of the result):
#ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_
#define _MYTHRESHOLDFILTERCUDAKERNEL_H_
#include "MyThresholdFilterCUDA.h"
/**
paints every pixel < threshold white
*/
__global__ void
thresholdFilter( unsigned short* d_InputImage, unsigned short* d_OutputImage, int width, int height, unsigned short threshold)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
//Check if pixel is in picture
if(bx*BLOCK_SIZE+tx<width && by*BLOCK_SIZE+ty<height)
{
int ind = width*(by*BLOCK_SIZE+ty-1)+(bx*BLOCK_SIZE+tx);
unsigned short pix = (d_InputImage[ind]);
if(pix<threshold)
{
d_InputImage[ind] = 255;
}
else
{
d_InputImage[ind] = pix;
}
}
}
#endif // #ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_
Of course this Kernel would do the job so far, but this is just a small example for me to get familar with the whole ITK/CUDA thing and I want to develop more complex (and usefull) filters in the future where I would need more than one image working…
I just can’t figure out where the problem lies. I’m using VC2005 and the newest CUDA (and also ITK, if that matters) version.