another unspecified launcher failure

I’ve read a few of these posts, and I realize that my “unspecified launch failure” probably equates to segmentation fault.

So, can someone help me look at the code and debug this?

I ran it through the emulator, and all is fine for all N. When I run it on the gpu, it works fine for smaller N, but fails for larger N.

I’ve been looking at the code all day … I’m sure some in this forum would be able to spot the problem fairly quickly.

thanks in advance.

Btw, I’m on OSX 10.5, cuda 2.0, and running the kernel on the secondary gpu (9600GT is display, 9400 is cuda kernel)

[codebox]

#include <stdio.h>

#include <stdlib.h>

#include <getopt.h>

#include <cutil.h> // CUDA utilities CUDA_SAFE_CALL

#define _DEBUG

unsigned int N = 1024*1024;// the # of elems in the array (default: 1M)

float *A; // array A on the host

float *B; // array B on the host

float *A_d; // array A on the gpu

float *B_d; // array B on the gpu

int seed = 100; // random number generator seed

int MAX_BLOCKS_DIM1 = 0; // the maximum number of blocks per grid (1st D)

int MAX_BLOCKS_DIM2 = 0; // the maximum number of blocks per grid (2nd D)

int MAX_THREADS = 0; // the maximum number of threads per block

int total_blocks = 0; // the total number of blocks needed

int numblocks_dim1 = 0; // the number of blocks per grid (1st D)

int numblocks_dim2 = 0; // the number of blocks per grid (2nd D)

int numthreads = 0; // the number of threads per block

int amount = 8; // the number to increment by

struct cudaDeviceProp *dev;// the CUDA device properties struct

static struct option long_options =

{

{“num”, required_argument, 0, ‘n’},

{0, 0, 0, 0}

};

/*

  • CUDA Kernel to increment a value in an array

  • by some value.

  • @param *a pointer to the array in GPU memory

  • @param amt the value to increment by

  • @param N the number of elements in the array

*/

global

void increment(float *a, int amt, unsigned int n)

{

unsigned int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;

unsigned int threadIndex = blockIndex * blockDim.x + threadIdx.x;

if (threadIndex < n) {

  a[threadIndex] = a[threadIndex] + amt;

}

}

/*

  • usage

  • Print usage and then exit.

*/

void usage(int argc, char **argv)

{

fprintf(stderr, “usage: %s [options]\n”,argv[0]);

fprintf(stderr, “\n”);

fprintf(stderr, " -n,–num N the number of elements to process.\n");

fprintf(stderr, " default is %u.\n", N);

fprintf(stderr, " -h,–help this help screen.\n");

fprintf(stderr, “\n”);

exit(1);

}

/*

  • main program

  • @param argc the number of arguments passed to the program

  • @param **argv the arguments passed in

*/

int main(int argc, char **argv)

{

int count;

int active;

int i;

srand(seed);

// get the device properties so we know how to divide the work

dev = (struct cudaDeviceProp *)malloc(sizeof(struct cudaDeviceProp));

if (!dev) {

  fprintf(stderr, "*** OOM: could not allocate cudaDeviceProp ***\n");

  exit(1);

}

CUDA_SAFE_CALL(cudaGetDeviceCount(&count));

cudaGetDevice(&active);

printf(“total number of devices: %d\n”, count);

printf(“active device is %d\n”, active);

for (i = 0; i < count; i++) {

  cudaGetDeviceProperties(dev, i);

  printf("device %d: name: %s\n", i, dev->name);

}

active = 1;

printf(“setting active device to %d\n”, active);

cudaSetDevice(active);

cudaGetDevice(&active);

printf(“active device is %d\n”, active);

cudaGetDeviceProperties(dev, active);

MAX_THREADS = dev->maxThreadsPerBlock;

MAX_BLOCKS_DIM1 = dev->maxGridSize[0];

MAX_BLOCKS_DIM2 = dev->maxGridSize[1];

// get input (N)

int option_index = 0;

int c = 0;

while (1) {

  c = getopt_long (argc, argv, "n:h", long_options, &option_index);

  if (c == -1) break;

  switch © {

     case 'n':

        N = (unsigned int)atof(optarg);

        break;

     case '?':

     default:

        usage(argc,argv);

  }

}

// divide the work

// each element in N should be operated on by a thread

numthreads = MAX_THREADS;

total_blocks = N / numthreads + (N % numthreads == 0 ? 0 : 1);

numblocks_dim2 = 0;

numblocks_dim1 = 0;

// more than the maximum number of blocks in 1D, so go to 2D

if (total_blocks > MAX_BLOCKS_DIM1) {

  numblocks_dim1 = MAX_BLOCKS_DIM1;

  numblocks_dim2 = total_blocks / MAX_BLOCKS_DIM1 +

                   (total_blocks % MAX_BLOCKS_DIM1 == 0 ? 0 : 1);

} else {

  numblocks_dim2 = 0;

  numblocks_dim1 = total_blocks;

}

printf(“device name: %s\n”, dev->name);

printf(“total global memory: %u\n”, dev->totalGlobalMem);

printf(“N = %u\n”, N);

printf(“numthreads = %d; MAX = %d\n”, numthreads, MAX_THREADS);

printf(“total_blocks = %d; MAX = %d\n”, total_blocks, MAX_BLOCKS_DIM1);

printf(“numblocks_dim2 = %d\n”, numblocks_dim2);

printf(“numblocks_dim1 = %d\n”, numblocks_dim1);

unsigned int calculated_N, extra_blocks, extra_threads;

if (numblocks_dim2 > 0) {

  calculated_N = numblocks_dim1 * numthreads * numblocks_dim2;

  extra_blocks = numblocks_dim1 * numblocks_dim2 - total_blocks;

} else {

  calculated_N = numblocks_dim1 * numthreads;

  extra_blocks = 0;

}

extra_threads = calculated_N - N;

printf(“calculated_N = %u\n”, calculated_N);

printf(“extra_threads = %u\n”, extra_threads);

printf(“extra_blocks = %u\n”, extra_blocks);

// allocate host memory for the arrays

A = (float *)malloc(N * sizeof(float));

if (!A) {

  fprintf(stderr, "*** OOM: could not allocate A ***\n");

  exit(1);

}

B = (float *)malloc(N * sizeof(float));

if (!A) {

  fprintf(stderr, "*** OOM: could not allocate B ***\n");

  exit(1);

}

CUDA_SAFE_CALL(cudaMalloc((void **)&A_d, N*sizeof(float)));

// CUDA_SAFE_CALL(cudaMalloc((void **)&B_d, N*sizeof(float)));

// initialize A

printf(“initializing A with random floats\n”);

for (i = 0; i < N; i++) {

  A[i] = (float) rand()/RAND_MAX;

  B[i] = A[i];

}

printf(“copying %d elements from host->gpu\n”,N);

CUDA_SAFE_CALL(cudaMemcpy(A_d,A,N*sizeof(float),cudaMemcpyHo

stToDevice));

if (numblocks_dim2 == 0) numblocks_dim2 = 1; // dimension is 1-indexed?

dim3 grid_blocks(numblocks_dim1,numblocks_dim2);

dim3 block_threads(numthreads);

printf(“running cuda kernel with %dx%d blocks/grid and %d threads/block\n”,

       numblocks_dim1,numblocks_dim2,numthreads);

increment<<<grid_blocks, block_threads>>>(A_d,amount,N);

printf(“blocking until gpu work is complete\n”);

cudaThreadSynchronize();

cudaError_t err = cudaGetLastError();

if( err != 0 ) {

  printf( "** cuda error: %s ** \n", cudaGetErrorString(err));

}

CUT_CHECK_ERROR(“Kernel execution failed”);

// printf(“copying %d elements from gpu->gpu\n”,N);

// CUDA_SAFE_CALL(cudaMemcpy(B_d,A_d,N*sizeof(float),cudaMemcpy

DeviceToDevice));

// printf(“copying %d elements from gpu->host\n”,N);

// CUDA_SAFE_CALL(cudaMemcpy(B,B_d,N*sizeof(float),cudaMemcpyDe

viceToHost));

printf(“copying %d elements from gpu->host\n”,N);

CUDA_SAFE_CALL(cudaMemcpy(A,A_d,N*sizeof(float),cudaMemcpyDe

viceToHost));

unsigned int errors = 0;

for (i = 0; i < N; i++) {

  if (A[i] != B[i] + amount) {

     errors++;

  }

}

printf(“number of errors: %u / %u\n”,errors,N);

if (errors == N) {

  printf("everything failed! what's the problem?\n");

}

free(A);

free(B);

CUDA_SAFE_CALL(cudaFree(A_d));

CUDA_SAFE_CALL(cudaFree(B_d));

}

[/codebox]

my output is:

** GOOD RUN / SMALL N **

total number of devices: 2

active device is 0

device 0: name: GeForce 9600M GT

device 1: name: GeForce 9400M

setting active device to 1

active device is 1

device name: GeForce 9400M

total global memory: 266010624

N = 1048576

numthreads = 512; MAX = 512

total_blocks = 2048; MAX = 65535

numblocks_dim2 = 0

numblocks_dim1 = 2048

calculated_N = 1048576

extra_threads = 0

extra_blocks = 0

initializing A with random floats

copying 1048576 elements from host->gpu

running cuda kernel with 2048x1 blocks/grid and 512 threads/block

blocking until gpu work is complete

copying 1048576 elements from gpu->host

number of errors: 0 / 1048576

** BAD RUN / LARGER N **

total number of devices: 2

active device is 0

device 0: name: GeForce 9600M GT

device 1: name: GeForce 9400M

setting active device to 1

active device is 1

device name: GeForce 9400M

total global memory: 266010624

N = 266010624

numthreads = 512; MAX = 512

total_blocks = 519552; MAX = 65535

numblocks_dim2 = 8

numblocks_dim1 = 65535

calculated_N = 268431360

extra_threads = 2420736

extra_blocks = 4728

initializing A with random floats

copying 266010624 elements from host->gpu

running cuda kernel with 65535x8 blocks/grid and 512 threads/block

blocking until gpu work is complete

** cuda error: unspecified launch failure **

copying 266010624 elements from gpu->host

number of errors: 266010624 / 266010624

everything failed! what’s the problem?

It looks like you’re using an array of N = roughly 256 million floats, but you have only about 256M bytes of memory available. I would have expected cudaMalloc(N*4) to have failed. It’s probably worth investigating why it didn’t complain at that point.