Need help with cuda error: "unspecified launch failure"

Hello,

I’m an undergraduate researcher relatively new to CUDA and have been attempting to port snippets of fortran code

to CUDA using fortran/CUDA C mixed language techniques. So far I’ve found google extremely helpful in learning CUDA

however I’ve run into an extremely annoying error that I can’t rid of. I’m under the impression that

“unspecified launch failure” translates into segmentation fault so I’ve run my program with debugging tools to find

the source of the problem.

this is my program output concerning the following snippet of code using the cuda-memcheck debugging tool with the --continue flag to catch as many memory errors as possible.

CUDA error: unspecified launch failure at line 177

========= Invalid __global__ read of size 4

=========     at 0x000000b0 in /work/jahad/campari_GPU_2/source/inner_loops_en_gpo.cu:222:vrspl_vbulk_kern_

=========     by thread (0,0,0) in block (0,0,0)

=========     Address 0xdb1f3400 is out of bounds

=========

========= ERROR SUMMARY: 1 error

the CUDA error output is due to my own cuda error processing function, listed in the source code below.

the rest of the output was given by cuda-memcheck.

In the case that variable information would help,

dvec is a N by 3 matrix that was flattened to a N*3 size vector in the host code successfully.

svec is a 3 by 1 correction term

d2 is a N sized vector

d1 is a N sized vector

id1 is an N sized vector

extern "C" void vrspl_vbulk_(float* dvec,float* svec,float* d2,float* d1,float* id1,int* N)

{

    size_t sizealloc = (*N)*sizeof(float);

    float* svecd;

    float* d2d;

    float* d1d;

    float* id1d;

    float* dvecd;

cudaErrCheck(cudaMalloc((void**)&svecd,3*sizeof(float)),164);

    cudaErrCheck(cudaMalloc((void**)&d2d, sizealloc),165);

    cudaErrCheck(cudaMalloc((void**)&d1d, sizealloc),166);

    cudaErrCheck(cudaMalloc((void**)&id1d, sizealloc),167);

    cudaErrCheck(cudaMalloc((void**)&dvecd,3*sizealloc),168);

cudaErrCheck(cudaMemcpy(svecd,svec,3*sizeof(float), cudaMemcpyHostToDevice),170);

    cudaErrCheck(cudaMemcpy(dvecd,dvec,3*sizealloc,cudaMemcpyHostToDevice),171);

dim3 dimBlock(32,1);

    dim3 dimGrid(*N/dimBlock.x + 1,1);

    vrspl_vbulk_kern_<<<dimGrid,dimBlock>>>(dvecd,svecd,d2d,d1d,id1d,N);

    cudaErrCheck(cudaGetLastError(),176);

    cudaErrCheck(cudaThreadSynchronize(),177);

cudaErrCheck(cudaMemcpy(dvec,dvecd,(size_t)(3*(int)sizealloc),cudaMemcpyDeviceToHost),181);

    cudaErrCheck(cudaMemcpy(d1,d1d,sizealloc,cudaMemcpyDeviceToHost),179);

    cudaErrCheck(cudaMemcpy(d2,d2d,sizealloc,cudaMemcpyDeviceToHost),178);

    cudaErrCheck(cudaMemcpy(id1,id1d,sizealloc,cudaMemcpyDeviceToHost),180);

cudaErrCheck(cudaFree(dvecd),183);

    cudaErrCheck(cudaFree(svecd),184);

    cudaErrCheck(cudaFree(d2d),185);

    cudaErrCheck(cudaFree(d1d),186);

    cudaErrCheck(cudaFree(id1d),187);

}

/////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////INTERNAL FUNCTIONS///////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////////////////////////////

void cudaErrCheck(cudaError_t err, int line_num)

{

  if(err != cudaSuccess)

  {

    // print the CUDA error message and exit

    printf("CUDA error: %s at line %i\n", cudaGetErrorString(err), line_num);

    exit(-1);

  }

}

/////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////KERNELS//////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////////////////////////////

__global__ void vrspl_vbulk_kern_(float* dvec,float* svec,float* d2,float* d1,float* id1,int* N)

{

    int i = (blockIdx.x * blockDim.x) + threadIdx.x;

    if(i <(*N))

    {

      dvec[i] = dvec[i] + svec[0];

    }

    else if(i<2*(*N))

    {

      dvec[i] = dvec[i] + svec[1];

    }

    else if(i<3*(*N))

      dvec[i] = dvec[i] + svec[2];

    if(i < (*N))

    {

      d2[i] = dvec[i]*dvec[i] + dvec[i+*N]*dvec[i+*N] + dvec[i+*N*2]*dvec[i+*N*2];

      d1[i] = sqrt(d2[i]);

      id1[i] = 1.0/d1[i];

    }

}

to clarify the error

line 222 in the mem-check output specifies the following section of code

if(i <(*N))

    {

      dvec[i] = dvec[i] + svec[0];

    }

I have tried commenting out various portions of code, commenting out if statements, changing how the inputs are allocated etc etc

I would appreciate any sort of help with this problem.

In advance,

thank you for your time