The following is a stripped down self-contained example of program in which
my kernel fails to launch. Actually, the code below runs. There is a single line
when uncommented then causes the kernel to fail to launch.
So not only would I like to know what I did wrong, but also how to debug
such occurrences.
thanks in advance,
–kushu
P.S. Linux system running Fedora 6
#define _DEBUG
#include <stdlib.h>
#include <stdio.h>
#include <cutil.h>
#define DEVALLOC(ptr,size) CUDA_SAFE_CALL( cudaMalloc((void **)& ptr, size))
#define DEVMEMSET(ptr,size,val) CUDA_SAFE_CALL(cudaMemset(ptr, val, size) )
#define HOST2DEV(hst,dev,size) CUDA_SAFE_CALL( cudaMemcpy(dev, hst, size, cudaMemcpyHostToDevice) );
#define DEV2HOST(dev,hst,size) CUDA_SAFE_CALL( cudaMemcpy(hst, dev, size, cudaMemcpyDeviceToHost) );
#define DEVDECLALLOC(name, type, size) \
type * name; \
DEVALLOC(name, size * sizeof(type))
#define MAXL 10
typedef struct {
int d_lShapeCnt[MAXL];
int4 *d_lShapes[MAXL];
int *d_lNxtInterCnt[MAXL];
int *d_lNxtInterOffset[MAXL];
int d_lNxtInterTot[MAXL];
int4 *d_lNxtIntersect[MAXL];
int *d_lPrvInterCnt[MAXL];
int *d_lPrvInterOffset[MAXL];
int d_lPrvInterTot[MAXL];
int4 *d_lPrvIntersect[MAXL];
int *d_lBthInterCnt[MAXL];
int *d_lBthInterOffset[MAXL];
int d_lBthInterTot[MAXL];
int4 *d_lBthIntersect[MAXL];
} devtbl;
__global__ void d_eps_tot( int n, int *inp, int *out, int *tot) {
int sum = 0;
for (int i=0; i<n; i++) {
out[i] = sum;
sum += inp[i];
}
// uncomment the following line and the kernel will fail to launch
//*tot = sum;
}
int main(int argc, char **argv) {
int gw = 25000;
int nl = 10;
CUT_DEVICE_INIT();
int lShapeCnt[MAXL];
int4 *lShapes[MAXL];
for (int l = 0; l<nl; l++) {
lShapeCnt[l] = gw / 1000;
lShapes[l] = (int4 *)malloc(lShapeCnt[l] * sizeof(int4));
}
devtbl *h_tbl = (devtbl *)malloc(sizeof(devtbl));
DEVDECLALLOC(d_tbl, devtbl, 1);
for(int l = 0; l<nl; l++) {
h_tbl->d_lShapeCnt[l] = lShapeCnt[l];
// Put shapes to device
DEVALLOC(h_tbl->d_lShapes[l], h_tbl->d_lShapeCnt[l] * sizeof(int4));
DEVMEMSET(h_tbl->d_lShapes[l], h_tbl->d_lShapeCnt[l] * sizeof(int4), 0xff);
HOST2DEV(lShapes[l], h_tbl->d_lShapes[l], h_tbl->d_lShapeCnt[l] * sizeof(int4));
// allocate shape intersection count arrays
DEVALLOC(h_tbl->d_lNxtInterCnt[l], h_tbl->d_lShapeCnt[l] * sizeof(int));
DEVALLOC(h_tbl->d_lPrvInterCnt[l], h_tbl->d_lShapeCnt[l] * sizeof(int));
DEVMEMSET(h_tbl->d_lPrvInterCnt[l], h_tbl->d_lShapeCnt[l] * sizeof(int), 0xff);
DEVMEMSET(h_tbl->d_lNxtInterCnt[l], h_tbl->d_lShapeCnt[l] * sizeof(int), 0xff);
// allocate shape offset indexing arrays
DEVALLOC(h_tbl->d_lNxtInterOffset[l], h_tbl->d_lShapeCnt[l] * sizeof(int));
DEVALLOC(h_tbl->d_lPrvInterOffset[l], h_tbl->d_lShapeCnt[l] * sizeof(int));
// allocate shape offset indexing arrays
DEVALLOC(h_tbl->d_lBthInterCnt[l], h_tbl->d_lShapeCnt[l] * sizeof(int));
DEVALLOC(h_tbl->d_lBthInterOffset[l], h_tbl->d_lShapeCnt[l] * sizeof(int));
}
for(int l = 0; l<nl-1; l++) {
d_eps_tot<<<1,1>>>( lShapeCnt[l],
h_tbl->d_lNxtInterCnt[l],
h_tbl->d_lNxtInterOffset[l],
h_tbl->d_lNxtInterTot+l
);
CUT_CHECK_ERROR("d_eps 1");
d_eps_tot<<<1,1>>>(lShapeCnt[l+1],
h_tbl->d_lPrvInterCnt[l+1],
h_tbl->d_lPrvInterOffset[l+1],
h_tbl->d_lPrvInterTot+l+1
);
CUT_CHECK_ERROR("d_eps 2");
}
CUT_EXIT(argc, argv);
}