The following example is derived from a more complex piece of code that failed.
The following kernel fails at launch, and I would like to understand why and
how to diagnose this kind of problem.
//FAILS
/**
 * Serial exclusive prefix sum that also reports the grand total.
 *
 * For each i in [0, n), out[i] receives inp[0] + ... + inp[i-1]
 * (0 for i == 0). Afterwards the sum of all n inputs is stored in *tot.
 *
 * The loop is not partitioned across threads: every launched thread would
 * run the whole scan over the same data, so this is only meaningful with a
 * single-thread launch.
 *
 * @param inp  n input values.
 * @param out  n output slots; may be the same buffer as inp.
 * @param n    element count; when n <= 0 nothing is written to out and the
 *             reported total is 0.
 * @param tot  destination for the total; skipped when null.
 *
 * NOTE(review): as a __global__ kernel, inp, out and tot presumably all have
 * to refer to device-accessible memory. A host pointer (or an offset past the
 * end of the allocation) passed as tot would be one possible cause of a
 * launch failure -- confirm at the call site.
 */
__global__ void d_eps_tot(int *inp, int *out, int n, int *tot) {
    int running = 0;
    for (int i = 0; i < n; i++) {
        /* Read before writing so an in-place call (inp == out) stays correct. */
        const int value = inp[i];
        out[i] = running;
        running += value;
    }
    /* Tolerate a null destination instead of faulting on the store. */
    if (tot) {
        *tot = running;
    }
}
But the following works.
//OK
/**
 * Serial exclusive prefix sum.
 *
 * For each i in [0, n), out[i] receives inp[0] + ... + inp[i-1]
 * (0 for i == 0). Unlike d_eps_tot, this variant takes no pointer for the
 * grand total and does not store it anywhere.
 *
 * The loop is not partitioned across threads: every launched thread would
 * run the whole scan over the same data, so this is only meaningful with a
 * single-thread launch.
 *
 * @param inp  n input values.
 * @param out  n output slots; may be the same buffer as inp.
 * @param n    element count; when n <= 0 nothing is written.
 */
__global__ void d_eps(int *inp, int *out, int n) {
    int running = 0;
    for (int i = 0; i < n; i++) {
        /* Read before writing so an in-place call (inp == out) stays correct. */
        const int value = inp[i];
        out[i] = running;
        running += value;
    }
}
I invoke the kernels as follows and get a failure reported at "d_eps 2".
If both launches use d_eps, it works fine;
if both launches use d_eps_tot, then the "d_eps 1" check fails.
/*
 * For every index l in [0, nl - 1): run d_eps over the "Nxt" arrays of l,
 * then d_eps_tot over the "Prv" arrays of l + 1. Each kernel is launched
 * with one block of one thread, and CUT_CHECK_ERROR is invoked after each
 * launch with a label naming it.
 */
for (int l = 0; l < nl - 1; l++) {
    const int next = l + 1;

    HMSGI("loop3", l);

    /* This launch passes no total pointer (d_eps takes three arguments). */
    d_eps<<<1, 1>>>(h_tbl->d_lNxtInterCnt[l],
                    h_tbl->d_lNxtInterOffset[l],
                    lShapeCnt[l]);
    CUT_CHECK_ERROR("d_eps 1");

    /*
     * NOTE(review): the last argument is pointer arithmetic on
     * h_tbl->d_lPrvInterTot, and d_eps_tot stores through it. It presumably
     * has to be a device allocation holding at least nl ints -- confirm how
     * d_lPrvInterTot is allocated.
     */
    d_eps_tot<<<1, 1>>>(h_tbl->d_lPrvInterCnt[next],
                        h_tbl->d_lPrvInterOffset[next],
                        lShapeCnt[next],
                        h_tbl->d_lPrvInterTot + next);
    CUT_CHECK_ERROR("d_eps 2");
}
Any help is appreciated.
–kushu