"unspecified launch failure" - ERROR

Hi Everyone,

My System Specifications :

OS : Open SUSE 11.4 (x86_64)

Graphics card : NVIDIA GeForce GTX 465

Driver Installed : 275.09.07

GCC Compiler Version : 4.3

CUDA Toolkit(s) installed : cudatoolkit_4.0.17_linux_64_suse11.2.run

cudatools_4.0.17_linux_64

I’m running the following code on my system :

void dummy_call()

{

    cudaEvent_t event;

    cudaError_t err;

err = cudaSetDevice(0);

    assert(cudaSuccess == err);

err = cudaEventCreate(&event);

    assert(cudaSuccess == err);

/* Query an event that hasn't been recorded */

    err = cudaEventQuery(event);

    printf("Query unrecorded event: \t\t%s\n", cudaGetErrorString(err));

/* Record the event */

    err = cudaEventRecord(event, 0);

    assert(cudaSuccess == err);

/* Query the event again, we now expect cudaErrorNotReady */

    err = cudaEventQuery(event);

    printf("Query recorded but not occured event: \t%s\n", cudaGetErrorString(err));

// Disparity map computation.

    dim3 num_threads(100, 100, 1); 

    dim3 num_blocks(10, 10, 10); 

    simple_kernel_call <<<num_blocks, num_threads>>> (thread_index);

    cudaStreamSynchronize(0);

/* Query the event again, we now expect cudaSuccess */

    err = cudaEventQuery(event);

    printf("Query recorded and occured event: \t%s\n", cudaGetErrorString(err));

}

__global__ void simple_kernel_call(float *thread_index_array)

{

    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

    int idy = (blockIdx.y * blockDim.y) + threadIdx.y;

    int idz = (blockIdx.z * blockDim.z) + threadIdx.z;

long index = (idx * block_height * block_depth)  + ( idy * block_depth ) + idz ;

    thread_index[index] = index;

    __syncthreads();

}

Can anyone please shed some light ???

Where am doing the mistake ???

Bhanu Kiran Challa

Mostly unspecified launch failure means segmentation fault for the host code. May be you might need to check your code. Did you check what values for block_height, block_depth and size of the thread_index.

Moreover, starting a new thread doesn’t mean that your question will be answered[old thread]. Unless you provide details about all the variables used by the code, its difficult to point out the error.

Mostly unspecified launch failure in device code is similar to segmentation fault in the host code. May be you might need to check your code. Did you check what values for block_height, block_depth and size of the thread_index.

Moreover, starting a new thread doesn’t mean that your question will be answered[old thread]. Unless you provide details about all the variables used by the code, its difficult to point out the error.

Hi Veda,

Thanks for your reply. I’ve gone through the code and couldn’t find any memory leaks. The Hardware Specification tells me that I can launch 1024 threads per block and I have around

I’m posting the full code here. Please go through it and point where I’m doing wrong.

#include <assert.h>

#include <stdio.h>

#include <stdlib.h>

void cuda_Init(int, int, int);

void cuda_Process();

void cuda_CleanUp();

void cuda_TestDownload(unsigned int *);

__device__ int width;

__device__ int height;

__device__ int depth;

__device__ unsigned int *thread_index;

int main(int argc, char *argv[])

{

	int data_width = 640;

	int data_height = 480;

	int data_depth = 32;

	cuda_Init(data_width, data_height, data_depth);

	cuda_Process(); 

	cuda_TestDownload(thread_index);

	cuda_CleanUp();

return EXIT_SUCCESS;

}

__global__ void launch_test_kernel()

{

    int idx = (blockIdx.x * blockDim.x) + threadIdx.x;

    int idy = (blockIdx.y * blockDim.y) + threadIdx.y;

    int idz = (blockIdx.z * blockDim.z) + threadIdx.z;

    unsigned int index = 0; 

index = (idx * height * depth)  + ( idy * depth ) + idz ;

    thread_index[index] = index;

}

void cuda_Init( int iw, int ih, int id)

{

    width = iw;

    height = ih;

    depth = id;

cudaMalloc( (void **) &thread_index, width * height * depth * sizeof(unsigned int));

    cudaMemset(thread_index, 0, width * height * depth * sizeof(unsigned int) );

}

void cuda_Process()

{

    cudaEvent_t event;

    cudaError_t err;

err = cudaSetDevice(0);

    assert(cudaSuccess == err);

err = cudaEventCreate(&event);

    assert(cudaSuccess == err);

/* Query an event that hasn't been recorded */

    err = cudaEventQuery(event);

    cudaStreamSynchronize(0);

    printf("Query unrecorded event: \t\t%s\n", cudaGetErrorString(err));

/* Record the event */

    err = cudaEventRecord(event, 0);

    assert(cudaSuccess == err);

/* Query the event again, we now expect cudaErrorNotReady */

    err = cudaEventQuery(event);

    cudaStreamSynchronize(0);

    printf("Query recorded but not occured event: \t%s\n", cudaGetErrorString(err));

// Disparity map computation.

    dim3 num_threads(4, 6, 32); 

    dim3 num_blocks(160, 80, 1); 

    launch_test_kernel <<<num_blocks, num_threads>>> ();

    cudaStreamSynchronize(0);

/* Query the event again, we now expect cudaSuccess */

    err = cudaEventQuery(event);

    cudaStreamSynchronize(0);

    printf("Query recorded and occured event: \t%s\n", cudaGetErrorString(err));

}

void cuda_TestDownload(unsigned int *arg_thread_index)

{

	cudaMemcpy(arg_thread_index, thread_index, width * height * depth * sizeof(unsigned int), cudaMemcpyDeviceToHost);

}

void cuda_CleanUp()

{

    cudaFree(thread_index);

}

Bhanu Kiran Challa

You code is neither correct nor incorrect. It is just not what it needs to be…

Hi Mr. Sarnath,

Could you plz elaborate your reply …

Bhanu Kiran Challa

I was just replying in terms of your signature… Nothing else…
It was just for fun. Pls dont take it seriously…

The cudaMemcpy() in cuda_TestDownload() goes from a device pointer to a device pointer. You need to allocate memory on the host instead on order to copy the results back.

Got it !

Thanks for the reply …

Bhanu Kiran Challa

CHeers! Now the glass is full…