How to track down a Segmentation Fault in Big Programs

Ok, so I have about 6k lines of code that I’ve been converting from fortran to cuda (the old LSODA ODE solver), and it compiles and runs just fine in device emulation mode. However, in regular mode it compiles, but crashes at runtime with only a mysterious “Segmentation fault” message. Question is, what is the best way to track down the cause of this segfault? I presume that I’ve got code trying to access host memory from within the device, but beyond that, I have only a vague idea of what the cause could be. Are there any sneaky ways to pinpoint the cause? Oh, and just to make the situation more complicated, I’m playing with functors, templates, and structs in the program. So lots of nasty little places for problems to hide, :D

Thanks much,

Paul

EDIT
Oh, and to make matters WORSE, when I try compiling with -g -G, it takes a REAAAAALY long time. Like it hasn’t finished yet. So I don’t know if it’s hanging, or just really slow. For some reason the non emulation build takes a lot longer than the emu build.

Ok, it took 12 minutes to do it, but nvcc finally built the release with debug info. The segfault is happening at line 83 in my main.

9 

 10 //#define EMULATION_MODE

 11 //#define use_export	// uncomment this if project is built using a compiler that

 12 // supports the C++ keyword "export".  If not using such a 

 13 // compiler, be sure not to add cuLsoda.cc to the target, or

 14 // you will get redefinition errors.

 15 

 16 #include <stdio.h>

 17 #include <math.h>

 18 #include "cuLsoda_kernel.cu"

 19 

 20 

 21 int main(void)   /* Main program */ 

 22 {

 23 	

 24	 /* Local variables */

 25	  double *t, *y/*[3]*/;

 26	  int *jt;

 27	  int *neq, *liw, *lrw;

 28	  double *atol/*[3]*/;

 29	  int *itol, *iopt;

 30	  double *rtol;

 31	  int *iout;

 32	  double *tout;

 33	  int *itask, *iwork/*[23]*/;

 34	  double *rwork/*[70]*/;

 35 	 int *istate;

 36 	/* End Local Block */

 37 

 38 	cudaMallocHost((void**)&t,sizeof(double));

 39 	cudaMallocHost((void**)&y,sizeof(double)*3);

 40 	cudaMallocHost((void**)&jt,sizeof(int));

 41 	cudaMallocHost((void**)&neq,sizeof(int));

 42 	cudaMallocHost((void**)&liw,sizeof(int));		

 43 	cudaMallocHost((void**)&lrw,sizeof(int));	

 44 	cudaMallocHost((void**)&atol,sizeof(double)*13);

 45 	cudaMallocHost((void**)&itol,sizeof(int));

 46 	cudaMallocHost((void**)&iopt,sizeof(int));

 47 	cudaMallocHost((void**)&rtol,sizeof(double));

 48 	cudaMallocHost((void**)&iout,sizeof(int));

 49 	cudaMallocHost((void**)&tout,sizeof(double));

 50 	cudaMallocHost((void**)&itask,sizeof(int));

 51 	cudaMallocHost((void**)&iwork,sizeof(int)*23);	

 52 	cudaMallocHost((void**)&rwork,sizeof(double)*70);

 53 	cudaMallocHost((void**)&istate,sizeof(int));

 54 	

 55 	/* Pointers to Device versions of Local variables */

 56 	double	*_Dt;

 57 	double	*_Dy;	// [3]

 58 	int	*_Djt;

 59 	int	*_Dneq;

 60 	int	*_Dliw;

 61 	int	*_Dlrw;

 62	 double	*_Datol;	//[3]

 63	 int	*_Ditol;

 64 	int	*_Diopt;

 65	 double	*_Drtol;

 66	 int	*_Diout;

 67	 double	*_Dtout;

 68	 int	*_Ditask;

 69 	int	*_Diwork;	// [23]

 70	 double	*_Drwork;	// [70]

 71 	int	*_Distate;

 72 	/* End Pointer Block */

 73 	

 74 	

 75 	

 76 	/* Method instantiations for Derivative and Jacobian functions to send to template */

 77 	myFex fex;

 78 	myJex jex;

 79 

 80 	

 81 	/* Assignment of initial values to locals */

 82	 *neq = 3;

 83 	y[0] = 1.;

 84	 y[1] = 0.;

 85	 y[2] = 0.;

 86	 *t = 0.;

 87	 *tout = .4;

 88 	*itol = 2;

 89	 *rtol = 1e-4;

 90	 atol[0] = 1e-6;

 91	 atol[1] = 1e-10;

 92	 atol[2] = 1e-6;

 93	 *itask = 1;

 94	 *istate = 1;

 95	 *iopt = 0;

 96	 *lrw = 70;

 97	 *liw = 23;

 98	 *jt = 2;

 99 	

100 	/* Allocate device memory for each of the pointers, and copy the values from local to device */

101 	cudaMalloc((void**)&_Dt,sizeof(double));		cudaMemcpy(_Dt,t,sizeof(double),cudaMemcpyHostToDevice);

102 	cudaMalloc((void**)&_Dy,sizeof(double)*3);		cudaMemcpy(_Dy,y,sizeof(double)*3,cudaMemcpyHostToDevice);

103 	cudaMalloc((void**)&_Djt,sizeof(int));			cudaMemcpy(_Djt,jt,sizeof(int),cudaMemcpyHostToDevice);

104 	cudaMalloc((void**)&_Dneq,sizeof(int));			cudaMemcpy(_Dneq,neq,sizeof(int),cudaMemcpyHostToDevice);

105 	cudaMalloc((void**)&_Dliw,sizeof(int));			cudaMemcpy(_Dliw,liw,sizeof(int),cudaMemcpyHostToDevice);

106 	cudaMalloc((void**)&_Dlrw,sizeof(int));			cudaMemcpy(_Dlrw,lrw,sizeof(int),cudaMemcpyHostToDevice);

107 	cudaMalloc((void**)&_Datol,sizeof(double)*13);	cudaMemcpy(_Datol,atol,sizeof(double)*13,cudaMemcpyHostToDevice);

108 	cudaMalloc((void**)&_Ditol,sizeof(int));		cudaMemcpy(_Ditol,itol,sizeof(int),cudaMemcpyHostToDevice);

109 	cudaMalloc((void**)&_Diopt,sizeof(int));		cudaMemcpy(_Diopt,iopt,sizeof(int),cudaMemcpyHostToDevice);

110 	cudaMalloc((void**)&_Drtol,sizeof(double));		cudaMemcpy(_Drtol,rtol,sizeof(double),cudaMemcpyHostToDevice);

111 	cudaMalloc((void**)&_Diout,sizeof(int));		cudaMemcpy(_Diout,iout,sizeof(int),cudaMemcpyHostToDevice);

112 	cudaMalloc((void**)&_Dtout,sizeof(double));		cudaMemcpy(_Dtout,tout,sizeof(double),cudaMemcpyHostToDevice);

113 	cudaMalloc((void**)&_Ditask,sizeof(int));		cudaMemcpy(_Ditask,itask,sizeof(int),cudaMemcpyHostToDevice);

114 	cudaMalloc((void**)&_Diwork,sizeof(int)*23);	cudaMemcpy(_Diwork,iwork,sizeof(int)*23,cudaMemcpyHostToDevice);

115 	cudaMalloc((void**)&_Drwork,sizeof(double)*70);	cudaMemcpy(_Drwork,rwork,sizeof(double)*70,cudaMemcpyHostToDevice);

116 	cudaMalloc((void**)&_Distate,sizeof(int));		cudaMemcpy(_Distate,istate,sizeof(int),cudaMemcpyHostToDevice);

117 	/* End Allocation and Copy Block */

118 	

119	 for (*iout = 1; *iout <= 12; ++*iout) {

120 	

121 		cuLsoda<<<1,1>>>(fex, _Dneq, _Dy, _Dt, _Dtout, _Ditol, _Drtol, _Datol, _Ditask, _Distate, _Diopt, _Drwork, _Dlrw, _Diwork, _Dliw, jex, _Djt);

122 

123 		/* Copy memory back from Device to Host */

124 		cudaMemcpy(t,_Dt,sizeof(double),cudaMemcpyDeviceToHost);

125 		cudaMemcpy(y,_Dy,sizeof(double)*3,cudaMemcpyDeviceToHost);

126 		cudaMemcpy(jt,_Djt,sizeof(int),cudaMemcpyDeviceToHost);

127 		cudaMemcpy(neq,_Dneq,sizeof(int),cudaMemcpyDeviceToHost);

128 		cudaMemcpy(liw,_Dliw,sizeof(int),cudaMemcpyDeviceToHost);

129 		cudaMemcpy(lrw,_Dlrw,sizeof(int),cudaMemcpyDeviceToHost);

130 		cudaMemcpy(atol,_Datol,sizeof(double)*13,cudaMemcpyDeviceToHost);

131 		cudaMemcpy(itol,_Ditol,sizeof(int),cudaMemcpyDeviceToHost);

132 		cudaMemcpy(iopt,_Diopt,sizeof(int),cudaMemcpyDeviceToHost);

133 		cudaMemcpy(rtol,_Drtol,sizeof(double),cudaMemcpyDeviceToHost);

134 		cudaMemcpy(tout,_Dtout,sizeof(double),cudaMemcpyDeviceToHost);

135 		cudaMemcpy(itask,_Ditask,sizeof(int),cudaMemcpyDeviceToHost);

136 		cudaMemcpy(iwork,_Diwork,sizeof(int)*23,cudaMemcpyDeviceToHost);

137 		cudaMemcpy(rwork,_Drwork,sizeof(double)*70,cudaMemcpyDeviceToHost);

138 		cudaMemcpy(istate,_Distate,sizeof(int),cudaMemcpyDeviceToHost);

139 		/* End Copy Block */

140 

141 

142 		printf("Exit: \t\t\tAt t =\t%G\ty = \t%g\t%g\t%g\n", *t, y[0], y[1], y[2]);

143 		if (istate < 0) {

144 			goto L80;

145 		}

146 		/* L40: */

147 		*tout *= 10.; 

148 		cudaMemcpy(_Dtout,tout,sizeof(double),cudaMemcpyHostToDevice);

149	 }

150	  printf("Number of Steps:  %i\nNo. f-s: %i\nNo. J-s = %i\nMethod Last Used = %i\nLast switch was at t = %g\n",iwork[10],iwork[11],iwork[12],iwork[18],rwork[14]);

151 	

152 L80:

153	  printf( "STOP istate is < 0\n");

154	 return 0;

155 } /* MAIN__ */

I don’t get what is wrong here. Can someone enlighten me?

Thanks,

Paul

My guess would be that one of your cuda calls is failing in the beginning of your program and the variable y is never getting set. I would start adding in some error checking code around your cuda calls.

turns out that I can’t access memory that has been allocated with cudaMallocHost as I do in line 83 (as an array). I changed the allocations to just plain old malloc’s, and no more seg fault.

That seems very odd, can you access them at all (maybe by dereferencing the base pointer?)

I may have another cause of this problem. I am getting seg faults during cudaMemcpy when I use cudaMallocHost and cudaMemcpy host-to-host on large data sizes. The program runs fine on smaller data sizes.

SOLVED: I was allocating some arrays for debugging on the stack instead of heap. The limit on the size of the stack was causing my seg faults.