Mostly unspecified launch failure means segmentation fault for the host code. May be you might need to check your code. Did you check what values for block_height, block_depth and size of the thread_index.
Moreover, starting a new thread doesn’t mean that your question will be answered[old thread]. Unless you provide details about all the variables used by the code, its difficult to point out the error.
Mostly unspecified launch failure in device code is similar to segmentation fault in the host code. May be you might need to check your code. Did you check what values for block_height, block_depth and size of the thread_index.
Moreover, starting a new thread doesn’t mean that your question will be answered[old thread]. Unless you provide details about all the variables used by the code, its difficult to point out the error.
Thanks for your reply. I’ve gone through the code and couldn’t find any memory leaks. The Hardware Specification tells me that I can launch 1024 threads per block and I have around
I’m posting the full code here. Please go through it and point where I’m doing wrong.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
void cuda_Init(int, int, int);
void cuda_Process();
void cuda_CleanUp();
void cuda_TestDownload(unsigned int *);
__device__ int width;
__device__ int height;
__device__ int depth;
__device__ unsigned int *thread_index;
int main(int argc, char *argv[])
{
int data_width = 640;
int data_height = 480;
int data_depth = 32;
cuda_Init(data_width, data_height, data_depth);
cuda_Process();
cuda_TestDownload(thread_index);
cuda_CleanUp();
return EXIT_SUCCESS;
}
__global__ void launch_test_kernel()
{
int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
int idy = (blockIdx.y * blockDim.y) + threadIdx.y;
int idz = (blockIdx.z * blockDim.z) + threadIdx.z;
unsigned int index = 0;
index = (idx * height * depth) + ( idy * depth ) + idz ;
thread_index[index] = index;
}
void cuda_Init( int iw, int ih, int id)
{
width = iw;
height = ih;
depth = id;
cudaMalloc( (void **) &thread_index, width * height * depth * sizeof(unsigned int));
cudaMemset(thread_index, 0, width * height * depth * sizeof(unsigned int) );
}
void cuda_Process()
{
cudaEvent_t event;
cudaError_t err;
err = cudaSetDevice(0);
assert(cudaSuccess == err);
err = cudaEventCreate(&event);
assert(cudaSuccess == err);
/* Query an event that hasn't been recorded */
err = cudaEventQuery(event);
cudaStreamSynchronize(0);
printf("Query unrecorded event: \t\t%s\n", cudaGetErrorString(err));
/* Record the event */
err = cudaEventRecord(event, 0);
assert(cudaSuccess == err);
/* Query the event again, we now expect cudaErrorNotReady */
err = cudaEventQuery(event);
cudaStreamSynchronize(0);
printf("Query recorded but not occured event: \t%s\n", cudaGetErrorString(err));
// Disparity map computation.
dim3 num_threads(4, 6, 32);
dim3 num_blocks(160, 80, 1);
launch_test_kernel <<<num_blocks, num_threads>>> ();
cudaStreamSynchronize(0);
/* Query the event again, we now expect cudaSuccess */
err = cudaEventQuery(event);
cudaStreamSynchronize(0);
printf("Query recorded and occured event: \t%s\n", cudaGetErrorString(err));
}
void cuda_TestDownload(unsigned int *arg_thread_index)
{
cudaMemcpy(arg_thread_index, thread_index, width * height * depth * sizeof(unsigned int), cudaMemcpyDeviceToHost);
}
void cuda_CleanUp()
{
cudaFree(thread_index);
}
The cudaMemcpy() in cuda_TestDownload() goes from a device pointer to a device pointer. You need to allocate memory on the host instead on order to copy the results back.