kernel fails to launch (example included) stripped down self-contained example

The following is a stripped down self-contained example of program in which

my kernel fails to launch. Actually, the code below runs. There is a single line

when uncommented then causes the kernel to fail to launch.

So not only would I like to know what I did wrong, but also how to debug

such occurrences.

thanks in advance,


P.S. Linux system running Fedora 6

#define _DEBUG

#include <stdlib.h>

#include <stdio.h>

#include <cutil.h>

#define DEVALLOC(ptr,size) CUDA_SAFE_CALL( cudaMalloc((void **)& ptr, size))

#define DEVMEMSET(ptr,size,val) CUDA_SAFE_CALL(cudaMemset(ptr, val, size) )

#define HOST2DEV(hst,dev,size) CUDA_SAFE_CALL( cudaMemcpy(dev,  hst,   size, cudaMemcpyHostToDevice) );

#define DEV2HOST(dev,hst,size) CUDA_SAFE_CALL( cudaMemcpy(hst,  dev,   size, cudaMemcpyDeviceToHost) );

#define DEVDECLALLOC(name, type, size) \

  type * name; \

  DEVALLOC(name, size * sizeof(type))


#define MAXL 10

typedef struct {

  int d_lShapeCnt[MAXL];

  int4 *d_lShapes[MAXL];


  int *d_lNxtInterCnt[MAXL];

  int *d_lNxtInterOffset[MAXL];

  int d_lNxtInterTot[MAXL];

  int4 *d_lNxtIntersect[MAXL];


  int *d_lPrvInterCnt[MAXL];

  int *d_lPrvInterOffset[MAXL];

  int d_lPrvInterTot[MAXL];

  int4 *d_lPrvIntersect[MAXL];



  int *d_lBthInterCnt[MAXL];

  int *d_lBthInterOffset[MAXL];

  int d_lBthInterTot[MAXL];

  int4 *d_lBthIntersect[MAXL];

} devtbl;

__global__ void d_eps_tot( int n, int *inp, int *out, int *tot) {

  int sum = 0;

  for (int i=0; i<n; i++) {

    out[i] = sum;

    sum += inp[i];


  // uncomment the following line and the kernel will fail to launch

  //*tot = sum;


int main(int argc, char **argv) {

 int gw = 25000;

  int nl = 10;


 int  lShapeCnt[MAXL];

  int4 *lShapes[MAXL];

 for (int l = 0; l<nl; l++) {

    lShapeCnt[l] = gw / 1000;

    lShapes[l] = (int4 *)malloc(lShapeCnt[l] * sizeof(int4));


 devtbl *h_tbl = (devtbl *)malloc(sizeof(devtbl));

  DEVDECLALLOC(d_tbl, devtbl, 1);

 for(int l = 0; l<nl; l++) {

   h_tbl->d_lShapeCnt[l] = lShapeCnt[l];


    // Put shapes to device

    DEVALLOC(h_tbl->d_lShapes[l],              h_tbl->d_lShapeCnt[l] * sizeof(int4));

    DEVMEMSET(h_tbl->d_lShapes[l],             h_tbl->d_lShapeCnt[l] * sizeof(int4), 0xff);

    HOST2DEV(lShapes[l], h_tbl->d_lShapes[l],  h_tbl->d_lShapeCnt[l] * sizeof(int4));


    // allocate shape intersection count arrays

    DEVALLOC(h_tbl->d_lNxtInterCnt[l],  h_tbl->d_lShapeCnt[l] * sizeof(int));

    DEVALLOC(h_tbl->d_lPrvInterCnt[l],  h_tbl->d_lShapeCnt[l] * sizeof(int));

    DEVMEMSET(h_tbl->d_lPrvInterCnt[l],  h_tbl->d_lShapeCnt[l] * sizeof(int), 0xff);

    DEVMEMSET(h_tbl->d_lNxtInterCnt[l],  h_tbl->d_lShapeCnt[l] * sizeof(int), 0xff);


    // allocate shape offset indexing arrays

    DEVALLOC(h_tbl->d_lNxtInterOffset[l],  h_tbl->d_lShapeCnt[l] * sizeof(int));

    DEVALLOC(h_tbl->d_lPrvInterOffset[l],  h_tbl->d_lShapeCnt[l] * sizeof(int));

   // allocate shape offset indexing arrays

    DEVALLOC(h_tbl->d_lBthInterCnt[l],  h_tbl->d_lShapeCnt[l] * sizeof(int));

    DEVALLOC(h_tbl->d_lBthInterOffset[l],  h_tbl->d_lShapeCnt[l] * sizeof(int));


  for(int l = 0; l<nl-1; l++) {

     d_eps_tot<<<1,1>>>( lShapeCnt[l],





      CUT_CHECK_ERROR("d_eps 1");






      CUT_CHECK_ERROR("d_eps 2");


  CUT_EXIT(argc, argv);


//*tot = sum;
have you tried tot[0] = sum? i don’t know…

Yes, i tried tot[0], same thing.
It’s such a small program i thought someone (NVIDIA?)
would try it and figure it out.

P.S. If anyone has any ideas I can also be reached at:
Damir Jamsek
Research Staff Member
IBM Austin Research Lab
Austin Texas

d_lNxtInterTot and d_lPrvInterTot are not in device memory. They should be allocated on the device, when writing to it by device code.

Very embarrassing,

Thank you,


Actually, they are in device memory, in the struct d_tbl,

my mistake was passing the pointer from h_tbl,

still a simple embarassing mistake,

sorry to take up everyone bandwidth on

my lack of attention,