Hello,
I’m having an issue where CUDA fails to run my kernel depending on the number of iterations of one of the loops in a nested loop structure. With a few iterations it works fine, but if the number of iterations is increased, CUDA issues the error message “unspecified launch failure”.
Here is the calling program:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
// includes, kernels
#include <template_kernel.cu>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
    runTest( argc, argv);
    CUT_EXIT(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest( int argc, char** argv)
{
    CUT_CHECK_DEVICE();

    unsigned int num_threads = 64;
    unsigned int len_DelT = 801;

    // allocate host memory
    float* TimeData = (float*) malloc(sizeof(float) * 16400000);

    // initialize host memory
    for(int i = 0; i < 16400000; i++)
        TimeData[i] = i;

    // allocate device memory
    float* d_TimeData;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_TimeData, sizeof(float) * 16400000));

    // copy host memory to device
    CUDA_SAFE_CALL( cudaMemcpy( d_TimeData, TimeData, sizeof(float) * 16400000, cudaMemcpyHostToDevice));

    // allocate device memory for result
    float* d_tss_r;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_tss_r, sizeof(float) * 16400 * len_DelT));

    // setup execution parameters
    dim3 grid( 1, 1, 1);
    dim3 threads( num_threads, 1, 1);

    // execute the kernel
    testKernel<<< grid, threads, 0 >>>(d_tss_r, d_TimeData, 16400, len_DelT, 1000);

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // allocate memory for the result on the host side
    float* h_tss_r_host = (float*) malloc(sizeof(float) * 16400 * len_DelT);

    // copy result from device to host
    CUDA_SAFE_CALL( cudaMemcpy( h_tss_r_host, d_tss_r, sizeof(float) * 16400 * len_DelT, cudaMemcpyDeviceToHost));

    for(int k = 0; k < 10; k++)
    {
        printf("h_tss_r_host[%d] = %g\n", k, h_tss_r_host[k]);
    }

    // cleanup memory
    free(TimeData);
    CUDA_SAFE_CALL( cudaFree(d_TimeData));
    CUDA_SAFE_CALL( cudaFree(d_tss_r));
    free(h_tss_r_host);
}
And here is the kernel:
__global__ void testKernel( float* d_tss_r, float* d_TimeData, unsigned int pts, int len_DelT, unsigned int scans)
{
    __device__ float *d_rpc;
    __device__ int r_index, c_index, k;

    // nested loops left over from the larger program; after stripping it down
    // they no longer contribute to the output
    for(r_index = 0; r_index < 16400; r_index++)
    {
        for(k = 0; k < len_DelT; k++)
        {
            d_rpc = d_TimeData + r_index;
            for(c_index = 0; c_index < scans; c_index++)
            {
                d_rpc += pts;
            }
        }
    }

    d_tss_r[threadIdx.x] = 1.0;
}
This is part of a larger program that I have stripped down as much as possible while still reproducing the error, so in its current state it does not do much and the nested loops do not contribute to the output. If I run the program with a small value for len_DelT (around 100 or less), everything is fine. However, if len_DelT is larger (such as the 801 in the code above), CUDA fails with the message quoted above. Note that with len_DelT = 801 the innermost loop body executes roughly 16,400 × 801 × 1,000 ≈ 1.3 × 10^10 times in a single kernel launch.
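In case it is useful, below is a rough sketch of the explicit error check I would put after the launch in place of CUT_CHECK_ERROR, using only the runtime calls cudaThreadSynchronize(), cudaGetLastError() and cudaGetErrorString(); it would drop into runTest() right after the kernel launch shown above:

    // launch as before
    testKernel<<< grid, threads, 0 >>>(d_tss_r, d_TimeData, 16400, len_DelT, 1000);

    // block until the kernel has actually finished, so the error we read back
    // reflects the kernel's execution and not just the asynchronous launch
    cudaError_t err = cudaThreadSynchronize();
    if (err == cudaSuccess)
        err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "Kernel failed: %s\n", cudaGetErrorString(err));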
Any help is appreciated.
Scott