I have set up the code below using dynamic parallelism trying to reproduce a problem I had on a larger project and I bumped into the issues below.

I have a four Kepler K20c, OS Windows 7, CUDA 5.0. If I set GPU #0, then I receive

```
GPUassert: uncorrectable ECC error encountered C:/Users/Matlab_User/Desktop/StackOverflow/StackOverflow/kernel.cu 57
```

where line 57 is the allocation of d_x. If I set GPU #1-#3, then I receive

```
GPUassert: unspecified launch failure C:/Users/Matlab_User/Desktop/StackOverflow/StackOverflow/kernel.cu 64
```

for the kernel launch.

I have the following questions:

- What could be the issue of the ECC problem? Hardware problem on GPU #0?
- What could be the issue of the unspecified launch failure?

Thanks a lot in advance.

```
#include <stdio.h>
#include <conio.h>
#define K 6
#define BLOCK_SIZE 256
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { getch(); exit(code); }
}
}
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
__global__ void child_kernel(double* P1, const double cc_diff1)
{
int m = threadIdx.x;
P1[m] = K*K-(cc_diff1-(m-K))*(cc_diff1-(m-K));
}
__global__ void parent_kernel(const double* __restrict__ x, int M)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if(i<M) {
double cc_points1=2.*x[i];
double r_cc_points1=rint(cc_points1);
const double cc_diff1 = cc_points1-r_cc_points1;
double* P1 = new double[13];
dim3 dimBlock(2*K+1,2*K+1); dim3 dimGrid(1,1);
child_kernel<<<dimGrid,dimBlock>>>(P1,cc_diff1);
for(int m=0; m<2*K+1; m++)
printf("%f %f\n",P1[m],K*K-(cc_diff1-(m-K))*(cc_diff1-(m-K)));
}
}
int main() {
const int M = 190;
gpuErrchk(cudaSetDevice(1));
double* x = (double*)malloc(M*sizeof(double));
for (int i=0; i<M; i++)
x[i] = i;
double* d_x; gpuErrchk(cudaMalloc((void**)&d_x,M*sizeof(double)));
gpuErrchk(cudaMemcpy(d_x,x,M*sizeof(double),cudaMemcpyHostToDevice));
dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(iDivUp(M,BLOCK_SIZE));
parent_kernel<<<dimGrid,dimBlock>>>(x,M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
return 0;
}
```