Following is the code snippet:
{
R *r, *r_d;
P *p;
N *n; // R is the main struct. P & N are sub structures of R. Also, N is linkedlist
// The linked list N is created inside CUDA by using the arrays which are copied from host
double *nAr,*nAr_d;
int numNAr;
nAr = (double *) malloc(1*sizeof(double));
while(!feof(fp))
{
nAr = (double *)realloc(nAr, (i+1)*sizeof(double));
fscanf(fp, "%lf", &nAr[i]);
i=i+1;
}
fclose(fp);
numNAr = i/2;
cudaMalloc( (void **) &nAr_d, numNAr * 2 * sizeof(double));
cudaMemcpy(nAr_d, nAr, numNAr * 2* sizeof(double), cudaMemcpyHostToDevice);
AssignKernel2<<<1,1>>> (root_dev, nAr_d, numNAr);
cudaThreadSynchronize();
....
}
The code for AssignKernel is as follows:
__global__ void AssignKernel2(R *r_d,double *nAr_d,int numNAr_d)
{
int i,iN, dInc, numNb, nT=0;
double bSX, bSY, bSZ;
N *n;
P *p;
p = r_d->p;
r_d->nNKM = 0;
p->nC = 0;
r_d->nNQ = (N *)NULL;
n = (N *) malloc(1*sizeof(N));
r_dev->nKs = (N **) malloc(((numNAr_d) + 5) * sizeof( N *));
for(i=0; i<numNAr_d; i++)
{
p->nC = para->nC + 1;
if(nT >= r_d->nNKM)
{
ENKs(r_d, r_d->nNKM + 1);
}
n->mT.idx = nT;
n->numNb = 2;
n->x = nAr_d[2*i];
n->y = nAr_d[2*i+1];
n->x = n->x * p->b;
n->y = n->y * p->b;
}
}
The AssignKernel2 is the 2nd cuda kernel. There is no error in 1st cuda kernel call.
The cuda-gdb shows following information:
215 while(fgetc(fp)!='.');
(cuda-gdb) step 10
dim3 (this=0x7fffe70672b0, vx=1, vy=1, vz=1) at /usr/local/cuda/bin/../include/vector_types.h:497
497 __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
(cuda-gdb) step
dim3 (this=0x7fffe70672c0, vx=1, vy=1, vz=1) at /usr/local/cuda/bin/../include/vector_types.h:497
497 __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
(cuda-gdb) list
492 /*DEVICE_BUILTIN*/
493 struct dim3
494 {
495 unsigned int x, y, z;
496 #if defined(__cplusplus)
497 __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
498 __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
499 __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
500 #endif /* __cplusplus */
501 };
(cuda-gdb) step
0x0000000000401800 in __device_stub__Z12AssignKernel1P5_rootPd (__par0=0x0, __par1=0x7fffe70676e0) at Main.cudafe1.stub.c:1
1 #include "crt/host_runtime.h"
(cuda-gdb) step
0x0000000000401810 2 #include "Main.fatbin.c"
(cuda-gdb) step
0x00000000004017a0 in ?? () at ll_kernel.cu:114
warning: Source file is more recent than executable.
114 /*
(cuda-gdb) step
0x00000000004017b0 1 #include "crt/host_runtime.h"
(cuda-gdb) step
0x00002b3bf5963930 in ?? () from /usr/local/cuda/lib64/libcudart.so.4
(cuda-gdb) step
Single stepping until exit from function cudaSetupArgument,
which has no line number information.
[Launch of CUDA Kernel 0 (AssignKernel1<<<(1,1,1),(1,1,1)>>>) on Device 0]
Number of Ns = 100
[Launch of CUDA Kernel 1 (AssignKernel2<<<(1,1,1),(1,1,1)>>>) on Device 0]
Failed to read the virtual PC on CUDA device 0 (error=10).
(cuda-gdb) step
Failed to read the virtual PC on CUDA device 0 (error=10).
(cuda-gdb) continue
Continuing.
Failed to read the virtual PC on CUDA device 0 (error=10).
(cuda-gdb) q
The program is running. Exit anyway? (y or n) y
Here its not showing any line number, which is causing the error. How to get that info?
The cuda-memcheck output is as follows:
# cuda-memcheck ./gpu_debug.exe
========= CUDA-MEMCHECK
Number of Ns = 100
========= Error: process didn't terminate successfully
========= Out-of-range Shared or Local Address
========= in ll_kernel.cu:AssignKernel2
========= by thread (0,0,0) in block (0,0,0)
=========
========= ERROR SUMMARY: 1 error
Please let me know how to diagnose it further.