another unspecified launcher failure

agarg · March 14, 2009, 9:23am

I’ve read a few of these posts, and I realize that my “unspecified launch failure” probably equates to segmentation fault.

So, can someone help me look at the code and debug this?

I ran it through the emulator, and all is fine for all N. When I run it on the gpu, it works fine for smaller N, but fails for larger N.

I’ve been looking at the code all day … I’m sure some in this forum would be able to spot the problem fairly quickly.

thanks in advance.

Btw, I’m on OSX 10.5, cuda 2.0, and running the kernel on the secondary gpu (9600GT is display, 9400 is cuda kernel)

[codebox]

#include <stdio.h>

#include <stdlib.h>

#include <getopt.h>

#include <cutil.h> // CUDA utilities CUDA_SAFE_CALL

#define _DEBUG

unsigned int N = 1024*1024;// the # of elems in the array (default: 1M)

float *A; // array A on the host

float *B; // array B on the host

float *A_d; // array A on the gpu

float *B_d; // array B on the gpu

int seed = 100; // random number generator seed

int MAX_BLOCKS_DIM1 = 0; // the maximum number of blocks per grid (1st D)

int MAX_BLOCKS_DIM2 = 0; // the maximum number of blocks per grid (2nd D)

int MAX_THREADS = 0; // the maximum number of threads per block

int total_blocks = 0; // the total number of blocks needed

int numblocks_dim1 = 0; // the number of blocks per grid (1st D)

int numblocks_dim2 = 0; // the number of blocks per grid (2nd D)

int numthreads = 0; // the number of threads per block

int amount = 8; // the number to increment by

struct cudaDeviceProp *dev;// the CUDA device properties struct

static struct option long_options =

{

{“num”, required_argument, 0, ‘n’},

{0, 0, 0, 0}

};

/*

CUDA Kernel to increment a value in an array
by some value.
@param *a pointer to the array in GPU memory
@param amt the value to increment by
@param N the number of elements in the array

*/

global

void increment(float *a, int amt, unsigned int n)

{

unsigned int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

unsigned int threadIndex = blockIndex * blockDim.x + threadIdx.x;

if (threadIndex < n) {

  a[threadIndex] = a[threadIndex] + amt;

}

/*

usage
Print usage and then exit.

*/

void usage(int argc, char **argv)

{

fprintf(stderr, “usage: %s [options]\n”,argv[0]);

fprintf(stderr, “\n”);

fprintf(stderr, " -n,–num N the number of elements to process.\n");

fprintf(stderr, " default is %u.\n", N);

fprintf(stderr, " -h,–help this help screen.\n");

fprintf(stderr, “\n”);

exit(1);

}

/*

main program
@param argc the number of arguments passed to the program
@param **argv the arguments passed in

*/

int main(int argc, char **argv)

{

int count;

int active;

int i;

srand(seed);

// get the device properties so we know how to divide the work

dev = (struct cudaDeviceProp *)malloc(sizeof(struct cudaDeviceProp));

if (!dev) {

  fprintf(stderr, "*** OOM: could not allocate cudaDeviceProp ***\n");

  exit(1);

}

CUDA_SAFE_CALL(cudaGetDeviceCount(&count));

cudaGetDevice(&active);

printf(“total number of devices: %d\n”, count);

printf(“active device is %d\n”, active);

for (i = 0; i < count; i++) {

  cudaGetDeviceProperties(dev, i);

  printf("device %d: name: %s\n", i, dev->name);

}

active = 1;

printf(“setting active device to %d\n”, active);

cudaSetDevice(active);

cudaGetDevice(&active);

printf(“active device is %d\n”, active);

cudaGetDeviceProperties(dev, active);

MAX_THREADS = dev->maxThreadsPerBlock;

MAX_BLOCKS_DIM1 = dev->maxGridSize[0];

MAX_BLOCKS_DIM2 = dev->maxGridSize[1];

// get input (N)

int option_index = 0;

int c = 0;

while (1) {

  c = getopt_long (argc, argv, "n:h", long_options, &option_index);

  if (c == -1) break;

  switch © {

     case 'n':

        N = (unsigned int)atof(optarg);

        break;

     case '?':

     default:

        usage(argc,argv);

  }

}

// divide the work

// each element in N should be operated on by a thread

numthreads = MAX_THREADS;

total_blocks = N / numthreads + (N % numthreads == 0 ? 0 : 1);

numblocks_dim2 = 0;

numblocks_dim1 = 0;

// more than the maximum number of blocks in 1D, so go to 2D

if (total_blocks > MAX_BLOCKS_DIM1) {

  numblocks_dim1 = MAX_BLOCKS_DIM1;

  numblocks_dim2 = total_blocks / MAX_BLOCKS_DIM1 +

                   (total_blocks % MAX_BLOCKS_DIM1 == 0 ? 0 : 1);

} else {

  numblocks_dim2 = 0;

  numblocks_dim1 = total_blocks;

}

printf(“device name: %s\n”, dev->name);

printf(“total global memory: %u\n”, dev->totalGlobalMem);

printf(“N = %u\n”, N);

printf(“numthreads = %d; MAX = %d\n”, numthreads, MAX_THREADS);

printf(“total_blocks = %d; MAX = %d\n”, total_blocks, MAX_BLOCKS_DIM1);

printf(“numblocks_dim2 = %d\n”, numblocks_dim2);

printf(“numblocks_dim1 = %d\n”, numblocks_dim1);

unsigned int calculated_N, extra_blocks, extra_threads;

if (numblocks_dim2 > 0) {

  calculated_N = numblocks_dim1 * numthreads * numblocks_dim2;

  extra_blocks = numblocks_dim1 * numblocks_dim2 - total_blocks;

} else {

  calculated_N = numblocks_dim1 * numthreads;

  extra_blocks = 0;

}

extra_threads = calculated_N - N;

printf(“calculated_N = %u\n”, calculated_N);

printf(“extra_threads = %u\n”, extra_threads);

printf(“extra_blocks = %u\n”, extra_blocks);

// allocate host memory for the arrays

A = (float *)malloc(N * sizeof(float));

if (!A) {

  fprintf(stderr, "*** OOM: could not allocate A ***\n");

  exit(1);

}

B = (float *)malloc(N * sizeof(float));

if (!A) {

  fprintf(stderr, "*** OOM: could not allocate B ***\n");

  exit(1);

}

CUDA_SAFE_CALL(cudaMalloc((void **)&A_d, N*sizeof(float)));

// CUDA_SAFE_CALL(cudaMalloc((void **)&B_d, N*sizeof(float)));

// initialize A

printf(“initializing A with random floats\n”);

for (i = 0; i < N; i++) {

  A[i] = (float) rand()/RAND_MAX;

  B[i] = A[i];

}

printf(“copying %d elements from host->gpu\n”,N);

CUDA_SAFE_CALL(cudaMemcpy(A_d,A,N*sizeof(float),cudaMemcpyHo

stToDevice));

if (numblocks_dim2 == 0) numblocks_dim2 = 1; // dimension is 1-indexed?

dim3 grid_blocks(numblocks_dim1,numblocks_dim2);

dim3 block_threads(numthreads);

printf(“running cuda kernel with %dx%d blocks/grid and %d threads/block\n”,

       numblocks_dim1,numblocks_dim2,numthreads);

increment<<<grid_blocks, block_threads>>>(A_d,amount,N);

printf(“blocking until gpu work is complete\n”);

cudaThreadSynchronize();

cudaError_t err = cudaGetLastError();

if( err != 0 ) {

  printf( "** cuda error: %s ** \n", cudaGetErrorString(err));

}

CUT_CHECK_ERROR(“Kernel execution failed”);

// printf(“copying %d elements from gpu->gpu\n”,N);

// CUDA_SAFE_CALL(cudaMemcpy(B_d,A_d,N*sizeof(float),cudaMemcpy

DeviceToDevice));

// printf(“copying %d elements from gpu->host\n”,N);

// CUDA_SAFE_CALL(cudaMemcpy(B,B_d,N*sizeof(float),cudaMemcpyDe

viceToHost));

printf(“copying %d elements from gpu->host\n”,N);

CUDA_SAFE_CALL(cudaMemcpy(A,A_d,N*sizeof(float),cudaMemcpyDe

viceToHost));

unsigned int errors = 0;

for (i = 0; i < N; i++) {

  if (A[i] != B[i] + amount) {

     errors++;

  }

}

printf(“number of errors: %u / %u\n”,errors,N);

if (errors == N) {

  printf("everything failed! what's the problem?\n");

}

free(A);

free(B);

CUDA_SAFE_CALL(cudaFree(A_d));

CUDA_SAFE_CALL(cudaFree(B_d));

}

[/codebox]

my output is:

** GOOD RUN / SMALL N **

total number of devices: 2

active device is 0

device 0: name: GeForce 9600M GT

device 1: name: GeForce 9400M

setting active device to 1

active device is 1

device name: GeForce 9400M

total global memory: 266010624

N = 1048576

numthreads = 512; MAX = 512

total_blocks = 2048; MAX = 65535

numblocks_dim2 = 0

numblocks_dim1 = 2048

calculated_N = 1048576

extra_threads = 0

extra_blocks = 0

initializing A with random floats

copying 1048576 elements from host->gpu

running cuda kernel with 2048x1 blocks/grid and 512 threads/block

blocking until gpu work is complete

copying 1048576 elements from gpu->host

number of errors: 0 / 1048576

** BAD RUN / LARGER N **

total number of devices: 2

active device is 0

device 0: name: GeForce 9600M GT

device 1: name: GeForce 9400M

setting active device to 1

active device is 1

device name: GeForce 9400M

total global memory: 266010624

N = 266010624

numthreads = 512; MAX = 512

total_blocks = 519552; MAX = 65535

numblocks_dim2 = 8

numblocks_dim1 = 65535

calculated_N = 268431360

extra_threads = 2420736

extra_blocks = 4728

initializing A with random floats

copying 266010624 elements from host->gpu

running cuda kernel with 65535x8 blocks/grid and 512 threads/block

blocking until gpu work is complete

** cuda error: unspecified launch failure **

copying 266010624 elements from gpu->host

number of errors: 266010624 / 266010624

everything failed! what’s the problem?

Jamie_K · March 15, 2009, 10:38pm

It looks like you’re using an array of N = roughly 256 million floats, but you have only about 256M bytes of memory available. I would have expected cudaMalloc(N*4) to have failed. It’s probably worth investigating why it didn’t complain at that point.

Topic		Replies	Views
Causes of unspecified launch failure CUDA Programming and Performance	8	8965	July 9, 2009
Wrong output when adding blocks what am I doing wrong? CUDA Programming and Performance	13	12076	December 4, 2007
unspecified launch failure CUDA Programming and Performance	2	9068	January 21, 2009
Need help with cuda error: "unspecified launch failure" CUDA Programming and Performance	0	923	July 28, 2011
using cudaMalloc and cudaFree within a loop unspecified launch failure! CUDA Programming and Performance	21	37833	April 23, 2009
Unspecified launch failure CUDA Programming and Performance	9	1210	November 5, 2010
Unspecified launch failure and garbage values: Please help CUDA Programming and Performance	4	4423	July 5, 2010
"unspecified launch failure" with large size (but within 256 bytes) kernel arguments CUDA Programming and Performance	0	4777	September 22, 2009
CUDA Bug: "CUDA error: unspecified launch failure" CUDA Programming and Performance	7	11833	March 11, 2011
"unspecified launch failure" runtime failure CUDA Programming and Performance	6	3383	May 9, 2009

another unspecified launcher failure

Related topics