Hello
I am trying to use an array of pointers in a kernel. The code works well if I compile for compute capability 1.x, but if I use -arch=sm_20, the kernel crashes with the message “unknown error” on my 64-bit Windows platform (with a GTX 480), while my Linux platform (with a GTX 460) reports “unspecified launch error”. The code works with -arch=sm_20 if I compile with debug information (-G), which disables optimization. The code also works when compiled for compute capability 2.0 if I build for a 32-bit platform instead.
I have written a small program that reproduces the error. The program crashes only if all three of the lines
debugvector[0]=jcptr[0];
j=threadIdx.x>=5000;
debugvector[1]=zlocal[j].re;
are present. Removing either debugvector[0]=jcptr[0] or debugvector[1]=zlocal[j].re, or setting j=0 (which is the value it always has at run time), makes the code run without crashing. In all cases where the code does not crash, the operations produce the correct values (provided real values are put into the vectors).
It would really be helpful if someone could explain what is actually going wrong in this example and how to solve the problem.
Below follows the code that replicates the error on my platforms. All parameters are the same as they are in the real program. I was using CUDA 3.2 when I tested this code.
#define CUDAREAL float //reproduces with both double and float
#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
/* Complex value: real part `re` and imaginary part `im`, both of type CUDAREAL. */
typedef struct {
    CUDAREAL re;
    CUDAREAL im;
} dcmplx;
/*
 * Terminates the program if a CUDA runtime call reported a failure.
 *
 * error - status code returned by a CUDA runtime API call.
 *
 * For any value other than cudaSuccess, the runtime's description of the
 * error is printed to stderr and the process exits with status 1.
 * For cudaSuccess the function returns without doing anything.
 */
void cudasafe(cudaError_t error)
{
    if (error != cudaSuccess) {
        /* Print the message through "%s" instead of using it as the format
           string, so a '%' in the text cannot be misinterpreted, and end
           the line so the message is not glued to later output. */
        fprintf(stderr, "%s\n", cudaGetErrorString(error));
        exit(1);
    }
}
/*---------------------------------------------------------*/
/*
 * Minimal reproduction kernel for the crash the author reports when building
 * with -arch=sm_20 (optimized, 64-bit).
 *
 * jcptr2      - array of int pointers; only element [3] is read.
 * debugvector - output array; every thread of every block stores to
 *               elements [0] and [1].
 *
 * Thread 0 of each block copies jcptr2[3] into a __shared__ pointer; after
 * the barrier all threads of the block dereference that pointer.
 *
 * NOTE(review): zlocal is never written anywhere in this kernel, so the value
 * stored into debugvector[1] is indeterminate.
 */
__global__ void kerneltest(int** jcptr2,CUDAREAL* debugvector)
{
int j;
__shared__ int *jcptr; //block-wide copy of jcptr2[3], written by thread 0 only
__shared__ dcmplx zlocal[2]; //never initialized in this kernel
if(threadIdx.x==0){
jcptr=jcptr2[3]; //the last element in the pointer array
}
__syncthreads(); //make thread 0's write to jcptr visible to the whole block before it is dereferenced
debugvector[0]=jcptr[0]; //the reported error occurs only if this statement and the zlocal[j].re read below are both present
j=threadIdx.x>=5000; //required for reproducing the error. Always 0 for the 32-thread blocks launched in main, but the compiler cannot know this
debugvector[1]=zlocal[j].re; //the reported error occurs only if this statement and the jcptr[0] read above are both present
}
/*---------------------------------------------------------*/
/*
 * Host driver for the reproduction.
 *
 * Allocates a 2000-element CUDAREAL output buffer and four int arrays on the
 * device, uploads a table holding the four array pointers, launches
 * kerneltest on 85 blocks of 32 threads, waits for it to finish, and then
 * releases every allocation.
 *
 * Returns 0 on success. Any CUDA failure terminates the process through
 * cudasafe(); a failed host allocation returns 1.
 */
int main()
{
    const int numArrays = 4;
    const int sizes[] = {10, 100, 1156, 9732};  /* element count of each device int array */

    /* Host-side staging copy of the device pointer table. */
    int** jcptr2 = (int**)calloc(numArrays, sizeof(int*));
    if (jcptr2 == NULL) {
        fprintf(stderr, "Out of host memory\n");
        return 1;
    }

    int** gjcptr2 = NULL;           /* device copy of the pointer table */
    CUDAREAL* ddebugvector = NULL;  /* device output buffer */

    cudasafe(cudaMalloc((void**)&ddebugvector, 2000 * sizeof(CUDAREAL)));
    cudasafe(cudaMalloc((void**)&gjcptr2, numArrays * sizeof(int*)));

    for (int k = 0; k < numArrays; k++) {
        cudasafe(cudaMalloc((void**)&jcptr2[k], sizes[k] * sizeof(int)));
        /* The kernel reads from these arrays, so give them defined contents
           instead of whatever cudaMalloc happened to return. */
        cudasafe(cudaMemset(jcptr2[k], 0, sizes[k] * sizeof(int)));
    }

    /* Upload the table of device pointers so the kernel can index it. */
    cudasafe(cudaMemcpy(gjcptr2, jcptr2, numArrays * sizeof(int*), cudaMemcpyHostToDevice));
    printf("Ok so far\n");

    kerneltest<<<85, 32>>>(gjcptr2, ddebugvector);
    /* A launch does not return a status; launch-configuration errors are
       only visible through cudaGetLastError(). */
    cudasafe(cudaGetLastError());
    /* Errors raised while the kernel executes surface at the next
       synchronizing call. cudaThreadSynchronize() is kept because the code
       targets CUDA 3.2; from CUDA 4.0 on, cudaDeviceSynchronize() replaces it. */
    cudasafe(cudaThreadSynchronize());

    /* Release device and host allocations. */
    for (int k = 0; k < numArrays; k++) {
        cudasafe(cudaFree(jcptr2[k]));
    }
    cudasafe(cudaFree(gjcptr2));
    cudasafe(cudaFree(ddebugvector));
    free(jcptr2);
    return 0;
}
Thank you