Segmentation fault at compile time

When I compile my code, I get:

Segmentation fault
make: *** [all] Error 255

I don’t know what’s wrong with my code, so I’m posting it below.

__global__ void SQUFOF( long long* f, long long* g)
{
int k = threadIdx.x + blockIdx.x*blockDim.x;
long long N;
long long Q[3],P[2],r,sq,b;
int i,dis;

if( k < 256 )
{
	k++;
	dis = 0;
	k++;
	N = (long long)k*g[0];
	// initial stage
	Q[0] = 1 ;
	P[0] = sqrtf((double)N);

	Q[1] = N - P[0]*P[0];
	r = P[0];
	
	// test squareness
	if(r*r==N)
	{
		f[k-1] = r;
	}
	else
	{
	
	// find a square form
		for( i = 0 ; i < 1000 ; i++)
		{
			dis++;
			sq = sqrtf((double)Q[1]);
			if((sq*sq)==Q[1])break;
		
			b = (r+P[0])/Q[1];

			P[1] = b*Q[1] - P[0];
			Q[2] = Q[0] + b*(P[0] - P[1]); // suppose P[0]-P[1] > 0
			//update
			
			P[0] = P[1];
			Q[0] = Q[1];
			Q[1] = Q[2];
			
		}
	
	// reverse to find symmetry point 
	Q[2] = sqrtf((double)Q[1]);
	Q[1] = Q[2]*Q[0];
	Q[0] = Q[2];

	//reduce
	P[0] = P[0]+Q[0]*((r-P[0])/Q[0]);
	Q[1] = (N-P[0]*P[0])/Q[0];
	i = 0;
	
	while( i !=dis)
	{
		i++;
		b = (r+P[0])/Q[1];

		P[1] = b*Q[1] - P[0];
		Q[2] = Q[0] + b*(P[0] - P[1]);
		
		//update
		Q[0] = Q[1];
		Q[1] = Q[2];
		if(P[0]==P[1])break;			
		P[0] = P[1];
	}

	if(N%(long long)k==0)N/=(long long)k;
	Q[2]=Q[0];
	/*if(N%Q[0]==0)	f[k-1]=Q[0];
	else			f[k-1]=1;
	*/
	
	}
	
}

}

Actually, the problem is that if I uncomment the code in the last few lines, I get the compile error, but if I leave it commented out, the code compiles and runs.

Hi,

I assume you call the kernel with blockDim 256 and an array of 256 elements for f. In that case your commented-out statements go past the end of the allocated array: k ends up at most 257, because it is initially limited to 255 and then incremented twice, so writing f[256] would cause an error.
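To make the index arithmetic concrete, here is a minimal sketch (my assumptions: a launch of <<<1,256>>> and a device array long long f[256]):

[codebox]// Sketch of the index arithmetic only, assuming <<<1,256>>> and
// a device array long long f[256] -- not the full SQUFOF kernel.
__global__ void boundsSketch(long long* f)
{
	int k = threadIdx.x + blockIdx.x*blockDim.x;	// 0..255
	if( k < 256 )
	{
		k++;	// k is now 1..256
		k++;	// k is now 2..257
		f[k-1] = k;	// writes f[1]..f[256]; the thread with the
				// highest index writes one element past the
				// end of a 256-element array
	}
}[/codebox]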

Further, for some reason ptxas fails when compiling for a target < 2.0 in release mode, but it will compile in (Nsight) debug mode (-G0).

I can’t figure that one out yet, but it should be reported as a bug.

After a few changes the kernel produces output, too! (compiled in debug mode)

[codebox]#include <cuda.h>
#include <cutil_inline.h>

typedef double fltype;

//#define CUPRINTF
#ifdef CUPRINTF
#include <cuprintf.cu>
#endif

__global__ void SQUFOF( long long* f, long long g)
{
	int k = threadIdx.x + blockIdx.x*blockDim.x;
	long long N;
	long long Q[3],P[2],r,sq,b;
	int i,dis;

	if( k < 256 )
	{
		k++;
		dis = 0;
		//k++;
		N = (long long)k*g;
#ifdef CUPRINTF
		if (k==1)
			cuPrintf("k: %d; N: 0x%lx%lx\n",k,(long)(N>>32),(long)N);
#endif
		// initial stage
		Q[0] = 1 ;
		P[0] = sqrt((fltype)N);
#ifdef CUPRINTF
		if (k==1)
			cuPrintf("k: %d; P[0]: 0x%ld\n",k,(long)P[0]);
#endif
		Q[1] = N - P[0]*P[0];
		r = P[0];

		// test squareness
		if(r*r==N)
			f[k-1] = r;
		else
		{
			// find a square form
			for( i = 0 ; i < 1000 ; i++)
			{
				dis++;
				sq = sqrt((fltype)Q[1]);
				if((sq*sq)==Q[1])break;

				b = (r+P[0])/Q[1];
				P[1] = b*Q[1] - P[0];
				Q[2] = Q[0] + b*(P[0] - P[1]); // suppose P[0]-P[1] > 0

				//update
				P[0] = P[1];
				Q[0] = Q[1];
				Q[1] = Q[2];
			}

			// reverse to find symmetry point
			Q[2] = sqrt((fltype)Q[1]);
			Q[1] = Q[2]*Q[0];
			Q[0] = Q[2];

			//reduce
			P[0] = P[0]+Q[0]*((r-P[0])/Q[0]);
			Q[1] = (N-P[0]*P[0])/Q[0];

			for( i=0; i++ !=dis; )
			{
				b = (r+P[0])/Q[1];
				P[1] = b*Q[1] - P[0];
				Q[2] = Q[0] + b*(P[0] - P[1]);

				//update
				Q[0] = Q[1];
				Q[1] = Q[2];
				if(P[0]==P[1])break;
				P[0] = P[1];
			}

			if((N%(long long)k)==0) N/=(long long)k;
			Q[2]=Q[0];
			//long long n=Q[0];
			//long long rem=N%n;
			//long long rem=0;
			if((N%Q[0])==0)	f[k-1]=Q[0];
			else		f[k-1]=1;
			//if(rem==0)	f[k-1]=Q[0];
			//f[k-1]=rem?1:n;
			//f[k-1]=N;
			//if (k==1) f[0]=N;
		}
	}
}

int callSQUFOF(long long *divisor, long long n) // assume divisor to point to array(256) in host memory
{
	long long *divs;
	cudaMalloc(&divs,256*sizeof(divs[0]));
	cudaMemset(divs,0,256*sizeof(divs[0]));
#ifdef CUPRINTF
	cudaPrintfInit();
#endif
	SQUFOF<<<1,256>>>(divs,n);
	cutilCheckMsg( "Kernel execution failed" );
	cudaThreadSynchronize();
#ifdef CUPRINTF
	cudaPrintfDisplay(stdout, true);
	cudaPrintfEnd();
#endif
	cudaMemcpy(divisor,divs,256*sizeof(divs[0]),cudaMemcpyDeviceToHost);

	// compact the divisors > 1 to the front of the array
	int el=0;
	for( int n=0; n<256; n++ )
	{
		if (divisor[n]<=1) continue;
		if (n>el++)
		{
			divisor[el-1]=divisor[n];
			divisor[n]=0;
		}
	}
	return el;
}

int main()
{
	long long fact[256],num;
	int facts=callSQUFOF(fact,num=(long long)rand()<<46 | (long long)rand()<<32 | (long long)rand()<<16 | (long long)rand());
	printf("%lld (0x%llx), root %lld has %d factors:\n",num,num,(long long)sqrt((double)num),facts);
	for(int n=0;n<facts;n++)
		printf("Fact  %d: %lld\n",n,fact[n]);
	return 0;
}[/codebox]

Output:

[codebox]2894065343293316 (0xa482318be6784), root 53796517 has 42 factors:
Fact 0: 4
Fact 1: 45599
Fact 2: 102865052
Fact 3: 182396
Fact 4: 28134583
Fact 5: 4
Fact 6: 182396
Fact 7: 617
Fact 8: 179295268
Fact 9: 112538332
Fact 10: 3932
Fact 11: 2426044
Fact 12: 2468
Fact 13: 45599
Fact 14: 45599
Fact 15: 44823817
Fact 16: 26161
Fact 17: 26161
Fact 18: 28134583
Fact 19: 2468
Fact 20: 179295268
Fact 21: 2426044
Fact 22: 2426044
Fact 23: 4
Fact 24: 45599
Fact 25: 4
Fact 26: 2426044
Fact 27: 112538332
Fact 28: 2468
Fact 29: 3932
Fact 30: 26161
Fact 31: 45599
Fact 32: 983
Fact 33: 182396
Fact 34: 104644
Fact 35: 45599
Fact 36: 112538332
Fact 37: 64565348
Fact 38: 104644
Fact 39: 25716263
Fact 40: 2426044
Fact 41: 28134583
Press any key to continue . . .[/codebox]

Jan

Hi, thanks for your reply.

I had thought about the array bounds problem; I actually launch the kernel with <<<8,32>>>. Even with some code commented out, something unreasonable happens. I had thought the origin of the problem was the code I commented out, but if I uncomment it and comment out some other code instead, it runs.

Another curious problem is why a segmentation fault occurs at compile time at all; I assumed it would happen at runtime.

You are quite right that the bounds problem is not likely to be the cause of the compiler’s memory-access fault (in ptxas.exe).

As I mentioned in my (edited) post, the kernel compiles with the code uncommented when targeting SM20 (Fermi, release and debug) or SM13 (200 series, debug only), but ptxas has an access fault in the other configurations. I have tried numerous variations of the code (after taking care of the bounds problem) without success.

So I think there is a bug in the compiler, and the problem should be reported to NVIDIA.

Compiler output VS2008:

[codebox]1>------ Build started: Project: yetanothertest2, Configuration: Release x64 ------
1>Compiling with CUDA Build Rule...
1>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.1\bin\nvcc.exe" -gencode=arch=compute_13,code=\"sm_13,compute_13\" --machine 64 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 9.0\VC\bin" -Xcompiler "/EHsc /W3 /nologo /Ox /Zi /MT " -I"./" -I"C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK/C/common/inc" -I"C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK/shared/inc" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v3.1\include" -maxrregcount=64 --ptxas-options=-v --compile -o "x64\Release/akernel.cu.obj" akernel.cu
1>akernel.cu
1>tmpxft_000016a0_00000000-3_akernel.cudafe1.gpu
1>tmpxft_000016a0_00000000-8_akernel.cudafe2.gpu
1>ptxas info : Compiling entry function '_Z6SQUFOFPxx' for 'sm_13'
1>Internal error
1>nvcc error : 'ptxas' died with status 0xC0000005 (ACCESS_VIOLATION)
1>Linking...
1>LINK : fatal error LNK1181: cannot open input file '.\x64\Release\akernel.cu.obj'
1>Build log was saved at "file://c:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\src\yetanothertest2\x64\Release\BuildLog.htm"
1>yetanothertest2 - 1 error(s), 0 warning(s)
[/codebox]

The code runs, in my opinion, with one of the increments (k++) commented out, which also takes care of the bounds problem.

I have added example code with this modification and with sqrtf((double)…) replaced by sqrt((double)…).

It appears to do a correct factorization (with some repetition), although I have not investigated whether all factors are found.
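A side note on the sqrtf-to-sqrt change: sqrtf takes a float argument, so the roughly 52-bit N is squeezed through a 24-bit mantissa before the root is taken, and the truncated root can be off by a few units, which is enough to break the exact-square tests (r*r==N and sq*sq==Q[1]). A small host-side sketch of the effect (illustrative; the sample number is taken from the output above):

[codebox]#include <stdio.h>
#include <math.h>

int main()
{
	long long N = 2894065343293316LL;		// sample number from the output above
	long long r_f = (long long)sqrtf((double)N);	// argument narrowed to float first
	long long r_d = (long long)sqrt((double)N);	// full double precision
	printf("sqrtf: %lld\nsqrt : %lld\n", r_f, r_d);
	// r_d is exact here because N fits in a double's 53-bit mantissa;
	// r_f may differ by a few units because float keeps only ~24 bits.
	return 0;
}[/codebox]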

Thank you for bringing this issue to our attention. I am able to reproduce this problem with the CUDA 3.1 toolchain on WinXP64 plus VS2005. I am unable to reproduce this issue with a recent internal toolchain, so it looks like the problem may already be fixed. I will follow up with our compiler team.

Thanks for your experiments on the code; I have also tried many times but failed.

I tried to find which section of the code causes the compile-time segmentation fault, and I found that the for loop (i = 0; i < 1000; i++) is the likely reason. If I comment out the for loop and the break statement, it compiles successfully. So I am wondering why the compiler gives me a segmentation fault just because of adding a for loop?

I found that you can comment out just the last assignment in the 1000-iteration loop and get a successful compile, but it did no good in the end.

I tried older ptxas.exe versions, same result. Tried moving the 1000-iteration loop to a separate device function, didn’t help.

Tried demanding fewer registers (since masking one assignment made a difference), no good.

Tried several more options for ptxas, but unless I use -G0, no good.

Put Q in shared memory, no good.

Reworked the P and Q arrays into vectors, and I must have forgotten some of the things I tried. But I can’t figure it out; nothing explicable comes to mind.

I could run the code in debug mode, so I worked on it a bit. The result is enclosed; maybe it helps you.

In my opinion, the main performance gain would come from putting more threads to work on the 1000-iteration loop. Increasing the loop count yields more factors.

Let’s hope the 3.2 release will be out shortly, or someone will be so kind as to post a fixed ptxas.exe!

Final note: NVCC does generate a .ptx file. You could try to run that in driver mode, letting the driver do the final compilation (a rough sketch follows below). Perhaps bug fixes are put through more quickly there than in the development tools.
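For reference, a rough sketch of the driver-mode route (my assumptions: nvcc’s intermediate output is saved as akernel.ptx, the mangled name _Z6SQUFOFPxx is taken from the build log above, and cuLaunchKernel is the CUDA 4.0+ launch call; on a 3.x driver you would use cuParamSetv/cuLaunchGrid instead):

[codebox]#include <stdio.h>
#include <cuda.h>	// driver API

#define CHECK(x) do { CUresult r = (x); if (r != CUDA_SUCCESS) { \
	printf("driver API error %d at line %d\n", (int)r, __LINE__); \
	return 1; } } while (0)

int main()
{
	CHECK(cuInit(0));
	CUdevice dev;	CHECK(cuDeviceGet(&dev, 0));
	CUcontext ctx;	CHECK(cuCtxCreate(&ctx, 0, dev));

	// Load the .ptx emitted by nvcc; the driver JIT-compiles it for
	// the installed GPU, bypassing the offline ptxas that crashes here.
	CUmodule mod;	CHECK(cuModuleLoad(&mod, "akernel.ptx"));
	CUfunction fn;	CHECK(cuModuleGetFunction(&fn, mod, "_Z6SQUFOFPxx"));

	CUdeviceptr divs;
	CHECK(cuMemAlloc(&divs, 256*sizeof(long long)));
	CHECK(cuMemsetD8(divs, 0, 256*sizeof(long long)));

	long long g = 2894065343293316LL;	// sample input
	void* args[] = { &divs, &g };
	CHECK(cuLaunchKernel(fn, 1,1,1, 256,1,1, 0, 0, args, 0));
	CHECK(cuCtxSynchronize());

	long long divisor[256];
	CHECK(cuMemcpyDtoH(divisor, divs, sizeof(divisor)));
	printf("divisor[0] = %lld\n", divisor[0]);

	CHECK(cuMemFree(divs));
	CHECK(cuModuleUnload(mod));
	CHECK(cuCtxDestroy(ctx));
	return 0;
}[/codebox]

Whether the JIT in the driver contains the same bug is of course the open question.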

Thanks jan.heckman for doing the experiments.

I have another question about the code now. The CPU version and the GPU version of the algorithm are almost identical, and they give the same results for small integers. However, when I use bigger integers as input, the results of the CPU and the GPU diverge.

It is strange. After trying many possibilities, I suspect the GPU behaves abnormally when the operations cause integer overflow. Since this is just my assumption, would somebody test the code?
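A minimal host/device comparison of the same 64-bit arithmetic would look something like this (a sketch; the overflowing expression is made up for illustration and is not taken from the SQUFOF code):

[codebox]#include <stdio.h>
#include <cuda.h>

// Run a deliberately overflowing 64-bit expression on the device...
__global__ void overflowTest(const long long* in, long long* out, int n)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x;
	if (i < n)
		out[i] = in[i]*in[i] + in[i];	// overflows for large in[i]
}

int main()
{
	const int n = 256;
	long long h_in[n], h_out[n];
	for (int i = 0; i < n; i++)
		h_in[i] = (1LL<<62) + i;	// large enough to overflow when squared

	long long *d_in, *d_out;
	cudaMalloc(&d_in, n*sizeof(long long));
	cudaMalloc(&d_out, n*sizeof(long long));
	cudaMemcpy(d_in, h_in, n*sizeof(long long), cudaMemcpyHostToDevice);

	overflowTest<<<1,n>>>(d_in, d_out, n);
	cudaMemcpy(h_out, d_out, n*sizeof(long long), cudaMemcpyDeviceToHost);

	// ...and the same expression on the host. In practice both wrap
	// mod 2^64, though signed overflow is formally undefined in C, so
	// this is a practical check, not a guarantee.
	int mismatches = 0;
	for (int i = 0; i < n; i++)
		if (h_out[i] != h_in[i]*h_in[i] + h_in[i]) mismatches++;

	printf("%d mismatches out of %d\n", mismatches, n);
	return 0;
}[/codebox]

If this reports no mismatches, 64-bit overflow wraps the same way on host and device, and the divergence is more likely caused by the floating-point square roots (see the sqrtf note above).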
