Problem with "unspecified launch failure"

Hi,

I’m trying to use CUDA to parallelize a program I have. Specifically I have been having problems with my kernel below

global void
kernel1( float* g_newWeights, float** g_weight1, unsigned int rfSize ,float *event)
{

// write data to global memory
const unsigned int tid = threadIdx.x;
int i;
float x;

g_newWeights[tid] = 0;

for(i=0; i<rfSize; i++){
x += g_weight1[tid][i]*event[i];
}

g_newWeights[tid] = x;

__syncthreads();

}

When I run my code with the above kernel, I get

cutilCheckMsg cudaThreadSynchronize error: Kernel execution failed in file <hpm.cu>, line 122 : unspecified launch failure.

where line 122 is the line that contains ‘x += g_weight1[tid][i]*event[i];’. Now if I replace x with something like ‘x=tid’, I have NO problem at all.

Also, if I run in emulation mode, the above code works just perfectly. Any ideas would help! Thanks!

How did you get ULF to tell you what line inside the kernel is failing? I want to know how to do that!

Anyways, you are probably initialzing your ** incorrectly on the GPU. It is such a pain in the but to do, that I won’t even attempt to repeat it here (search the forums for old posts on this topic if you really want to see). It likely works in device emulation because you’ve done the normal mistake with ** initialization on the GPU and have host pointers transferred to the GPU. GPU reading host * = ULF.

You are much better off allocating your 2D array as a flat * using cudaMallocPitch and indexing it like : d_data[j*pitch + i]

So I tried getting rid of the ** alltogether, replacing my code with this:

global void
//kernel1( float* g_newWeights, float** g_weight1, unsigned int rfSize ,float event)
kernel1( float
g_newWeights, unsigned int rfSize ,float *event)
{

// write data to global memory
const unsigned int tid = threadIdx.x;
int i;
float x;

g_newWeights[tid] = 0;

for(i=0; i<rfSize; i++){

//x += g_weight1[tid][i]*event[i];
x += event[i];
}

g_newWeights[tid] = x;

__syncthreads();

}

Where I am simply trying to accumulate all the values from the event array into a single place of the g_newWeights array, which is indexed by the thread-ID. However, I still get the EXACT same problem. Here is my host code that is called from my main():

extern “C” void runCalcUnitOutput( unsigned int rfSize, unsigned int numModules, float* newWeights,float *event)
{
unsigned int z,y;

cudaSetDevice( cutGetMaxGflopsDeviceId() );

const unsigned int num_threads =  numModules;
const unsigned int mem_size = sizeof(float) * numModules * rfSize;
const unsigned int result_mem_size = sizeof(float) * numModules;
const unsigned int mem_size2 = sizeof(float) * numModules;

float* d_event;
float* d_newWeights;

cutilSafeCall(cudaMalloc((void**) &d_newWeights,result_mem_size));
cutilSafeCall(cudaMalloc((void**) &d_event,mem_size2));
cutilSafeCall(cudaMemcpy(d_event, event, mem_size2, cudaMemcpyHostToDevice) );

// setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);

// execute the kernel
kernel1<<< grid, threads >>>(d_newWeights, rfSize, event);

// check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed");

// copy results from device to host
cutilSafeCall(cudaMemcpy(newWeights, d_newWeights, result_mem_size,cudaMemcpyDeviceToHost));

cutilSafeCall(cudaFree(d_newWeights));
cutilSafeCall(cudaFree(d_event));
cudaThreadExit();

}

So my question is, am I maybe not copying over the event array to the device correctly? Because I believe I am following the tutorials/examples I have looked at. If not that, is there something wrong with my result array (newWeights)?

Thanks for the help, I am sure that using the ** isnt the best way to go about it, but even removed, I still have problems.

SOLVED:

So the problem was I was passing a host-variable parameter to the kernel instead of the device parameter (that is, i was sending h_events rather than d_events).

However, I also got rid of the double pointer, it is true, they are best avoided. That got rid of some annoying warnings.

Thanks for the help, if someone wants a more thorough explaination of the problem/solution, let me know!

Hi!

I’ve got a quite similar problem. I wrote a small Thresholdfilter to test basic CUDA capabilities as ITK filter.

So this is my code so far:

Kernel

#ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_

#define _MYTHRESHOLDFILTERCUDAKERNEL_H_

#include "MyThresholdFilterCUDA.h"

/**

	paints every pixel < threshold white

*/

__global__ void 

thresholdFilter( unsigned short* d_InputImage, unsigned short* d_OutputImage, int width, int height, unsigned short threshold)

{

	

	// Block index

	int bx = blockIdx.x;

	int by = blockIdx.y;

	// Thread index

	int tx = threadIdx.x;

	int ty = threadIdx.y;

	//Check if pixel is in picture

	if(bx*BLOCK_SIZE+tx<width && by*BLOCK_SIZE+ty<height)

	{

		int ind = width*(by*BLOCK_SIZE+ty-1)+(bx*BLOCK_SIZE+tx);

		unsigned short pix = (d_InputImage[ind]);

		if(pix<threshold)

		{

			d_OutputImage[ind] = 255;

		}

		else

		{

			d_OutputImage[ind] = pix;

		}

	}

}

#endif // #ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_

Host program:

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

//CUDA Includes

#include <cutil_inline.h>

#include "MyThresholdFilterCUDAKernel.cu"

#include "MyThresholdFilterCUDA.h"

namespace thcuda{

	void ThresholdCUDA(const unsigned short*  inputImage, 

		unsigned short* outputImage, int width, int height, int insize, int outsize, int threshold)

	{

		//CUDA Code

		cudaSetDevice( cutGetMaxGflopsDeviceId() );

		

		unsigned short* d_InputImage;

		// allocate device memory

		cutilSafeCall(cudaMalloc((void**) &d_InputImage, insize));

		//printf("malloc\n %d %d %d\n", inputImage[84*343+156], insize, outsize );

		

		//copy data from host do device

		cutilSafeCall(cudaMemcpy(d_InputImage, inputImage, insize,cudaMemcpyHostToDevice) );

	

		// allocate device memory for result

		unsigned short* d_OutputImage;

		cutilSafeCall(cudaMalloc((void**) &d_OutputImage, insize));

		// setup execution parameters

		dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

		int gridx, gridy;

		//calculate gridsize

		gridx = width / BLOCK_SIZE;

		if(width % BLOCK_SIZE != 0)

		{

			gridx += 1;

		}

		gridy = height / BLOCK_SIZE;

		if(height % BLOCK_SIZE != 0)

		{

			gridy += 1;

		}

		dim3 grid(gridx, gridy);

		// execute the kernel

		thresholdFilter<<< grid, threads >>>(d_InputImage,d_OutputImage,width,height,threshold);

		// check if kernel execution generated and error

		cutilCheckMsg("Kernel execution failed");

		//copy result from device to host

		cutilSafeCall(cudaMemcpy(outputImage, d_OutputImage, outsize,cudaMemcpyDeviceToHost) );

		//printf("memcopyretour\n %d\n", ((unsigned short*)outputImage)[84*343+156] );

		cutilSafeCall(cudaFree(d_InputImage));

		cutilSafeCall(cudaFree(d_OutputImage));

		cudaThreadExit();

	}

}

Everytime I run my filter (as a member of an ITK Filterpipeline with a 8bit .TIF picture as data input) I get this error:

cutilCheckMsg cudaThreadSynchronize error: Kernel execution failed in file <MyThresholdFilterCUDA.cu>, line 57 : unspecified launch failure.

I found out, that the whole pipeline works just fine when I overwrite the InputImage with the new pixel value instead of writing it in the OutputImage (of course I also changed the deviceToHost memcopy of the result):

#ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_

#define _MYTHRESHOLDFILTERCUDAKERNEL_H_

#include "MyThresholdFilterCUDA.h"

/**

	paints every pixel < threshold white

*/

__global__ void 

thresholdFilter( unsigned short* d_InputImage, unsigned short* d_OutputImage, int width, int height, unsigned short threshold)

{

	

	// Block index

	int bx = blockIdx.x;

	int by = blockIdx.y;

	// Thread index

	int tx = threadIdx.x;

	int ty = threadIdx.y;

	//Check if pixel is in picture

	if(bx*BLOCK_SIZE+tx<width && by*BLOCK_SIZE+ty<height)

	{

		int ind = width*(by*BLOCK_SIZE+ty-1)+(bx*BLOCK_SIZE+tx);

		unsigned short pix = (d_InputImage[ind]);

		if(pix<threshold)

		{

			d_InputImage[ind] = 255;

		}

		else

		{

			d_InputImage[ind] = pix;

		}

	}

}

#endif // #ifndef _MYTHRESHOLDFILTERCUDAKERNEL_H_

Of course this Kernel would do the job so far, but this is just a small example for me to get familar with the whole ITK/CUDA thing and I want to develop more complex (and usefull) filters in the future where I would need more than one image working…

I just can’t figure out where the problem lies. I’m using VC2005 and the newest CUDA (and also ITK, if that matters) version.