Pointer to pointer strange behaviour cudaSafeCall Runtime Api error: Unknown error

Hi,

I have the following function:

__device__ int algo(unsigned int n0,unsigned int n,unsigned int* ST1,  unsigned int* ans)

{

	

	unsigned int ELM, ELM1;

	unsigned int a, e, p;

	unsigned int * D[6] = { d_D1, d_D2, d_D3, d_D4, d_D5, d_D6 };

	ELM = ST1[n0]+n;  //local.37

	ELM1 = n;	   //local.33

	

		

	

		

	for (a = 0; a < 6; a++)  //index of ans

	{		 

		for (e = ELM1; e < ELM; e++)  // e = n ELM1 <= ELM

		{

			for (p = 0; p < 6; p++)

		   {

			  if (D[p][e] == ans[a])

			  {

				  

				  goto one;

			

	

			  }   

		   }

		

		}

		  

	 

		   return 0;

			   	

one:;

	}

		  

	  

		   

		   return 1; 

}

d_D1, d_D2, d_D3,d_D4,d_D5 and d_D6 are declared as constant device constant unsigned int d_D1[442];

Now when I run the kernel I get back the error: cudaSafeCall Runtime Api error: Unknown error, if I change return 1 with return 0, so to have two return 0, everything works.

if I swap the return 0 with the return 1, I got the some error, seems that the function does not like the return 1, and I guess that it has to do with the pointer to pointer unsigned int * D[6] = { d_D1, d_D2, d_D3, d_D4, d_D5, d_D6 };

If I change the code to:

__device__ int algo(unsigned int n0,unsigned int n,unsigned int* ST1,  unsigned int* ans)

{

	

	unsigned int ELM, ELM1;

	unsigned int a, e, p;

	

	   ELM = ST1[n0]+n;  //local.37

	ELM1 = n;	   //local.33

	

	

		

	

		

	for (a = 0; a < 6; a++)  //index of ans

	{		 

		for (e = ELM1; e < ELM; e++)  // e = n ELM1 <= ELM

	  {

			

			  if (d_D1[e] == ans[a]) 

		 {

				goto one;

		  }   

		   else if (d_D2[e] == ans[a])

		 {

				goto one;

		  }

			  else if (d_D3[e] == ans[a])

		 {

				goto one;

		 }

			  else if (d_D4[e] == ans[a])

		{

				goto one;

		}

			else if (d_D5[e] == ans[a])

		{

				goto one;

		}

			else if (d_D6[e] == ans[a])

		{

				goto one;

		}

		

	 }

		  

			 return 0;

		   

			   	

one:;

	}

		  

	  

		   return 1; 

}

Everything works. Any suggestions?

Thanks.

Does it work in emulation mode?

Well, because N is 13000000, I did not test it in full, however, yes tracing through the function I did not have problems

Maybe compiler bug, need to inspect ptx output.

I now compiled it with the option --ptxas-options=-v, but where do I find the file with the ptx instructions?

here is the pxt file:

	.version 1.4

.target sm_11, map_f64_to_f32

// compiled with C:\CUDA\bin/../open64/lib//be.exe

// nvopencc 3.0 built on 2010-02-23

//-----------------------------------------------------------

// Compiling sample.compute_11.cpp3.i (C:/Users/ADMINI~1/AppData/Local/Temp/ccBI#.a09260)

//-----------------------------------------------------------

//-----------------------------------------------------------

// Options:

//-----------------------------------------------------------

//  Target:ptx, ISA:sm_11, Endian:little, Pointer Size:32

//  -O3	(Optimization level)

//  -g0	(Debug level)

//  -m2	(Report advisories)

//-----------------------------------------------------------

.file	1	"sample.compute_11.cudafe2.gpu"

.file	2	"C:\CUDA\include\cuPrintf.cu"

.file	3	"C:\Program Files\Microsoft Visual Studio 9.0\VC\INCLUDE\crtdefs.h"

.file	4	"C:\CUDA\include\crt/device_runtime.h"

.file	5	"C:\CUDA\include\host_defines.h"

.file	6	"C:\CUDA\include\builtin_types.h"

.file	7	"c:\cuda\include\device_types.h"

.file	8	"c:\cuda\include\driver_types.h"

.file	9	"c:\cuda\include\texture_types.h"

.file	10	"c:\cuda\include\vector_types.h"

.file	11	"c:\cuda\include\host_defines.h"

.file	12	"C:\CUDA\include\device_launch_parameters.h"

.file	13	"c:\cuda\include\crt\storage_class.h"

.file	14	"C:\Program Files\Microsoft Visual Studio 9.0\VC\INCLUDE\time.h"

.file	15	"c:/Users/Administrator/Desktop/example1/example1/Pot_49_2_better/Pot_49_2_better/sample.cu"

.file	16	"C:\CUDA\include\common_functions.h"

.file	17	"c:\cuda\include\crt/func_macro.h"

.file	18	"c:\cuda\include\math_functions.h"

.file	19	"c:\cuda\include\device_functions.h"

.file	20	"c:\cuda\include\math_constants.h"

.file	21	"c:\cuda\include\sm_11_atomic_functions.h"

.file	22	"c:\cuda\include\sm_12_atomic_functions.h"

.file	23	"c:\cuda\include\sm_13_double_functions.h"

.file	24	"c:\cuda\include\common_types.h"

.file	25	"c:\cuda\include\sm_20_atomic_functions.h"

.file	26	"c:\cuda\include\sm_20_intrinsics.h"

.file	27	"c:\cuda\include\texture_fetch_functions.h"

.file	28	"c:\cuda\include\math_functions_dbl_ptx1.h"

.global .align 8 .b8 restrictRules[8];

.const .align 4 .b8 d_D1[1768];

.const .align 4 .b8 d_D2[1768];

.const .align 4 .b8 d_D3[1768];

.const .align 4 .b8 d_D4[1768];

.const .align 4 .b8 d_D5[1768];

.const .align 4 .b8 d_D6[1768];

.const .align 4 .b8 d_ST1[84];

.const .u32 globalPrintfBuffer = 0;

.const .s32 printfBufferLength = 0;

.global .u32 printfBufferPtr = 0;

.entry _Z17CheckCombinationsPiS_S_S_S_S_j (

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A1,

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A2,

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A3,

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A4,

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A5,

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A6,

	.param .u32 __cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_N)

{

.reg .u32 %r<183>;

.reg .pred %p<32>;

.local .align 4 .b8 __cuda___cuda_ans_1610720[24];

.local .align 4 .b8 __cuda___cuda_D_4010744[24];

.local .align 4 .b8 __cuda___cuda_D_6410768[24];

.loc	15	329	0

$LBB1__Z17CheckCombinationsPiS_S_S_S_S_j:

cvt.u32.u16 	%r1, %ntid.x;

cvt.u32.u16 	%r2, %ctaid.x;

mul24.lo.u32 	%r3, %r1, %r2;

cvt.u32.u16 	%r4, %tid.x;

add.u32 	%r5, %r4, %r3;

ld.param.u32 	%r6, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_N];

setp.le.u32 	%p1, %r6, %r5;

@%p1 bra 	$LBB55__Z17CheckCombinationsPiS_S_S_S_S_j;

.loc	15	522	0

mov.u32 	%r7, 13983815;

sub.u32 	%r8, %r7, %r5;

mov.u32 	%r9, 12271512;

setp.ge.u32 	%p2, %r8, %r9;

mov.s32 	%r10, 48;

@%p2 bra 	$Lt_0_59650;

$Lt_0_43522:

// Loop body line 523

.loc	15	523	0

sub.u32 	%r10, %r10, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

sub.u32 	%r13, %r10, 3;

sub.u32 	%r14, %r10, 5;

sub.u32 	%r15, %r10, 4;

mul.lo.u32 	%r16, %r14, %r15;

shr.u32 	%r17, %r16, 1;

mul.lo.u32 	%r18, %r13, %r17;

mov.u32 	%r19, -1431655765;

mul.hi.u32 	%r20, %r18, %r19;

shr.u32 	%r21, %r20, 1;

mul.lo.u32 	%r22, %r12, %r21;

shr.u32 	%r23, %r22, 2;

mul.lo.u32 	%r24, %r11, %r23;

mov.u32 	%r25, -858993459;

mul.hi.u32 	%r26, %r24, %r25;

shr.u32 	%r27, %r26, 2;

mul.lo.u32 	%r28, %r27, %r10;

mov.u32 	%r29, -1431655765;

mul.hi.u32 	%r30, %r28, %r29;

shr.u32 	%r31, %r30, 2;

setp.lt.u32 	%p3, %r8, %r31;

@%p3 bra 	$Lt_0_43522;

bra.uni 	$Lt_0_43010;

$Lt_0_59650:

mov.u32 	%r31, 12271512;

$Lt_0_43010:

mov.s32 	%r32, %r10;

.loc	15	543	0

st.local.u32 	[__cuda___cuda_ans_1610720+0], %r32;

.loc	15	544	0

add.u32 	%r33, %r5, %r31;

mov.u32 	%r34, 13983815;

sub.u32 	%r35, %r34, %r33;

.loc	15	520	0

sub.u32 	%r10, %r32, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

sub.u32 	%r13, %r10, 3;

sub.u32 	%r15, %r10, 4;

mul.lo.u32 	%r36, %r13, %r15;

shr.u32 	%r37, %r36, 1;

mul.lo.u32 	%r38, %r12, %r37;

mov.u32 	%r39, -1431655765;

mul.hi.u32 	%r40, %r38, %r39;

shr.u32 	%r41, %r40, 1;

mul.lo.u32 	%r42, %r11, %r41;

shr.u32 	%r43, %r42, 2;

mul.lo.u32 	%r44, %r43, %r10;

mov.u32 	%r45, -858993459;

mul.hi.u32 	%r46, %r44, %r45;

shr.u32 	%r47, %r46, 2;

setp.le.u32 	%p4, %r47, %r35;

@%p4 bra 	$Lt_0_44034;

$Lt_0_44546:

// Loop body line 523

.loc	15	523	0

sub.u32 	%r10, %r10, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

sub.u32 	%r13, %r10, 3;

sub.u32 	%r15, %r10, 4;

mul.lo.u32 	%r48, %r13, %r15;

shr.u32 	%r49, %r48, 1;

mul.lo.u32 	%r50, %r12, %r49;

mov.u32 	%r51, -1431655765;

mul.hi.u32 	%r52, %r50, %r51;

shr.u32 	%r53, %r52, 1;

mul.lo.u32 	%r54, %r11, %r53;

shr.u32 	%r55, %r54, 2;

mul.lo.u32 	%r56, %r55, %r10;

mov.u32 	%r57, -858993459;

mul.hi.u32 	%r58, %r56, %r57;

shr.u32 	%r47, %r58, 2;

setp.gt.u32 	%p5, %r47, %r35;

@%p5 bra 	$Lt_0_44546;

$Lt_0_44034:

mov.s32 	%r59, %r10;

.loc	15	549	0

st.local.u32 	[__cuda___cuda_ans_1610720+4], %r59;

.loc	15	550	0

sub.u32 	%r35, %r35, %r47;

.loc	15	520	0

sub.u32 	%r10, %r59, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

sub.u32 	%r13, %r10, 3;

mul.lo.u32 	%r60, %r12, %r13;

shr.u32 	%r61, %r60, 1;

mul.lo.u32 	%r62, %r11, %r61;

mov.u32 	%r63, -1431655765;

mul.hi.u32 	%r64, %r62, %r63;

shr.u32 	%r65, %r64, 1;

mul.lo.u32 	%r66, %r65, %r10;

shr.u32 	%r67, %r66, 2;

setp.le.u32 	%p6, %r67, %r35;

@%p6 bra 	$Lt_0_45058;

$Lt_0_45570:

// Loop body line 523

.loc	15	523	0

sub.u32 	%r10, %r10, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

sub.u32 	%r13, %r10, 3;

mul.lo.u32 	%r68, %r12, %r13;

shr.u32 	%r69, %r68, 1;

mul.lo.u32 	%r70, %r11, %r69;

mov.u32 	%r71, -1431655765;

mul.hi.u32 	%r72, %r70, %r71;

shr.u32 	%r73, %r72, 1;

mul.lo.u32 	%r74, %r73, %r10;

shr.u32 	%r67, %r74, 2;

setp.gt.u32 	%p7, %r67, %r35;

@%p7 bra 	$Lt_0_45570;

$Lt_0_45058:

mov.s32 	%r75, %r10;

.loc	15	555	0

st.local.u32 	[__cuda___cuda_ans_1610720+8], %r75;

.loc	15	556	0

sub.u32 	%r35, %r35, %r67;

.loc	15	520	0

sub.u32 	%r10, %r75, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

mul.lo.u32 	%r76, %r11, %r12;

shr.u32 	%r77, %r76, 1;

mul.lo.u32 	%r78, %r77, %r10;

mov.u32 	%r79, -1431655765;

mul.hi.u32 	%r80, %r78, %r79;

shr.u32 	%r81, %r80, 1;

setp.le.u32 	%p8, %r81, %r35;

@%p8 bra 	$Lt_0_46082;

$Lt_0_46594:

// Loop body line 523

.loc	15	523	0

sub.u32 	%r10, %r10, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

sub.u32 	%r12, %r10, 2;

mul.lo.u32 	%r82, %r11, %r12;

shr.u32 	%r83, %r82, 1;

mul.lo.u32 	%r84, %r83, %r10;

mov.u32 	%r85, -1431655765;

mul.hi.u32 	%r86, %r84, %r85;

shr.u32 	%r81, %r86, 1;

setp.gt.u32 	%p9, %r81, %r35;

@%p9 bra 	$Lt_0_46594;

$Lt_0_46082:

mov.s32 	%r87, %r10;

.loc	15	561	0

st.local.u32 	[__cuda___cuda_ans_1610720+12], %r87;

.loc	15	562	0

sub.u32 	%r35, %r35, %r81;

.loc	15	520	0

sub.u32 	%r10, %r87, 1;

.loc	15	522	0

sub.u32 	%r11, %r10, 1;

mul.lo.u32 	%r88, %r11, %r10;

shr.u32 	%r89, %r88, 1;

setp.le.u32 	%p10, %r89, %r35;

@%p10 bra 	$Lt_0_47106;

mul.lo.u32 	%r90, %r10, -1;

$Lt_0_47618:

// Loop body line 523

.loc	15	523	0

sub.u32 	%r10, %r10, 1;

add.u32 	%r90, %r90, 1;

.loc	15	522	0

mul.lo.u32 	%r91, %r10, %r10;

add.u32 	%r92, %r90, %r91;

shr.u32 	%r93, %r92, 1;

setp.lt.u32 	%p11, %r35, %r93;

@%p11 bra 	$Lt_0_47618;

sub.u32 	%r11, %r10, 1;

mul.lo.u32 	%r94, %r11, %r10;

shr.u32 	%r89, %r94, 1;

$Lt_0_47106:

mov.s32 	%r95, %r10;

.loc	15	567	0

st.local.u32 	[__cuda___cuda_ans_1610720+16], %r95;

.loc	15	568	0

sub.u32 	%r35, %r35, %r89;

.loc	15	520	0

sub.u32 	%r10, %r95, 1;

.loc	15	522	0

setp.ge.u32 	%p12, %r35, %r10;

@%p12 bra 	$Lt_0_49154;

$Lt_0_49666:

// Loop body line 522, nesting depth: 1, estimated iterations: unknown

.loc	15	523	0

sub.u32 	%r10, %r10, 1;

.loc	15	522	0

setp.lt.u32 	%p13, %r35, %r10;

@%p13 bra 	$Lt_0_49666;

$Lt_0_49154:

.loc	15	573	0

st.local.u32 	[__cuda___cuda_ans_1610720+20], %r10;

.loc	15	583	0

mov.u32 	%r96, 49;

sub.u32 	%r97, %r96, %r32;

st.local.u32 	[__cuda___cuda_ans_1610720+0], %r97;

.loc	15	584	0

mov.u32 	%r98, 49;

sub.u32 	%r99, %r98, %r59;

st.local.u32 	[__cuda___cuda_ans_1610720+4], %r99;

.loc	15	585	0

mov.u32 	%r100, 49;

sub.u32 	%r101, %r100, %r75;

st.local.u32 	[__cuda___cuda_ans_1610720+8], %r101;

.loc	15	586	0

mov.u32 	%r102, 49;

sub.u32 	%r103, %r102, %r87;

st.local.u32 	[__cuda___cuda_ans_1610720+12], %r103;

.loc	15	587	0

mov.u32 	%r104, 49;

sub.u32 	%r105, %r104, %r95;

st.local.u32 	[__cuda___cuda_ans_1610720+16], %r105;

.loc	15	588	0

mov.u32 	%r106, 49;

sub.u32 	%r107, %r106, %r10;

st.local.u32 	[__cuda___cuda_ans_1610720+20], %r107;

.loc	15	413	0

mov.u32 	%r108, 0;

mov.u32 	%r109, 0;

mov.u32 	%r110, __cuda___cuda_D_4010744;

mov.u32 	%r111, d_D6;

mov.u32 	%r112, d_D4;

mov.u32 	%r113, d_D2;

mov.u32 	%r114, __cuda___cuda_ans_1610720;

mov.u32 	%r115, d_ST1;

mov.u32 	%r116, d_D5;

mov.u32 	%r117, d_D3;

mov.u32 	%r118, d_D1;

$L_0_40450:

// Loop body line 415

.loc	15	415	0

add.u32 	%r119, %r108, %r115;

ld.const.u32 	%r120, [%r119+0];

mov.u32 	%r121, 0;

setp.eq.u32 	%p14, %r120, %r121;

@%p14 bra 	$Lt_0_55042;

// Part of loop body line 415, head labeled $L_0_40450

.loc	15	223	0

st.local.u32 	[__cuda___cuda_D_4010744+0], %r118;

st.local.u32 	[__cuda___cuda_D_4010744+4], %r113;

st.local.u32 	[__cuda___cuda_D_4010744+8], %r117;

st.local.u32 	[__cuda___cuda_D_4010744+12], %r112;

st.local.u32 	[__cuda___cuda_D_4010744+16], %r116;

st.local.u32 	[__cuda___cuda_D_4010744+20], %r111;

add.u32 	%r122, %r120, %r109;

setp.gt.u32 	%p15, %r122, %r109;

mov.u32 	%r123, 0;

$Lt_0_52226:

// Loop body line 223, nesting depth: 1, estimated iterations: 6

.loc	15	233	0

mov.s32 	%r124, %r109;

@!%p15 bra 	$Lt_0_52482;

// Part of loop body line 223, head labeled $Lt_0_52226

mov.u32 	%r125, 1;

max.u32 	%r126, %r120, %r125;

mul.lo.u32 	%r127, %r123, 4;

add.u32 	%r128, %r127, %r114;

ld.local.u32 	%r129, [%r128+0];

// Part of loop body line 223, head labeled $Lt_0_52226

mov.s32 	%r130, %r126;

$Lt_0_52994:

// Loop body line 233, nesting depth: 2, estimated iterations: unknown

mul.lo.u32 	%r131, %r124, 4;

mov.u32 	%r132, 0;

$Lt_0_53762:

// Loop body line 233, nesting depth: 3, estimated iterations: 6

.loc	15	240	0

mul.lo.u32 	%r133, %r132, 4;

add.u32 	%r134, %r110, %r133;

ld.local.u32 	%r135, [%r134+0];

add.u32 	%r136, %r135, %r131;

ld.global.u32 	%r137, [%r136+0];

setp.eq.u32 	%p16, %r137, %r129;

@%p16 bra 	$Lt_0_1282;

// Part of loop body line 233, head labeled $Lt_0_53762

.loc	15	235	0

add.u32 	%r132, %r132, 1;

mov.u32 	%r138, 6;

setp.ne.u32 	%p17, %r132, %r138;

@%p17 bra 	$Lt_0_53762;

// Part of loop body line 233, head labeled $Lt_0_52994

.loc	15	233	0

add.u32 	%r124, %r124, 1;

setp.gt.u32 	%p18, %r122, %r124;

@%p18 bra 	$Lt_0_52994;

$Lt_0_52482:

// Part of loop body line 415, head labeled $L_0_40450

mov.s32 	%r139, 0;

bra.uni 	$Lt_0_1026;

$Lt_0_1282:

// Part of loop body line 223, head labeled $Lt_0_52226

.loc	15	231	0

add.u32 	%r123, %r123, 1;

mov.u32 	%r140, 6;

setp.ne.u32 	%p19, %r123, %r140;

@%p19 bra 	$Lt_0_52226;

// Part of loop body line 415, head labeled $L_0_40450

mov.s32 	%r139, 1;

$Lt_0_1026:

// Part of loop body line 415, head labeled $L_0_40450

.loc	15	423	0

mov.u32 	%r141, 1;

setp.ne.s32 	%p20, %r139, %r141;

@%p20 bra 	$Lt_0_55042;

bra.uni 	$LBB55__Z17CheckCombinationsPiS_S_S_S_S_j;

$Lt_0_55042:

$Lt_0_51202:

// Part of loop body line 415, head labeled $L_0_40450

.loc	15	427	0

add.u32 	%r108, %r108, 8;

.loc	15	428	0

add.u32 	%r109, %r109, 49;

.loc	15	415	0

mov.u32 	%r142, 68;

setp.gt.u32 	%p21, %r108, %r142;

@%p21 bra 	$L_0_41218;

// Part of loop body line 415, head labeled $L_0_40450

mov.u32 	%r143, 440;

setp.le.u32 	%p22, %r109, %r143;

@%p22 bra 	$L_0_40450;

$L_0_41218:

.loc	15	433	0

mov.u32 	%r108, 4;

mov.u32 	%r109, 0;

mov.u32 	%r144, __cuda___cuda_D_6410768;

$L_0_41474:

// Loop body line 435

.loc	15	435	0

add.u32 	%r145, %r108, %r115;

ld.const.u32 	%r146, [%r145+0];

mov.u32 	%r147, 0;

setp.eq.u32 	%p23, %r146, %r147;

@%p23 bra 	$Lt_0_59138;

// Part of loop body line 435, head labeled $L_0_41474

.loc	15	269	0

st.local.u32 	[__cuda___cuda_D_6410768+0], %r118;

st.local.u32 	[__cuda___cuda_D_6410768+4], %r113;

st.local.u32 	[__cuda___cuda_D_6410768+8], %r117;

st.local.u32 	[__cuda___cuda_D_6410768+12], %r112;

st.local.u32 	[__cuda___cuda_D_6410768+16], %r116;

st.local.u32 	[__cuda___cuda_D_6410768+20], %r111;

add.u32 	%r122, %r146, %r109;

setp.gt.u32 	%p15, %r122, %r109;

mov.u32 	%r148, 0;

$Lt_0_56322:

// Loop body line 269, nesting depth: 1, estimated iterations: 6

.loc	15	280	0

mov.s32 	%r149, %r109;

@!%p15 bra 	$Lt_0_56578;

// Part of loop body line 269, head labeled $Lt_0_56322

mov.u32 	%r150, 1;

max.u32 	%r151, %r146, %r150;

mul.lo.u32 	%r152, %r148, 4;

add.u32 	%r153, %r152, %r114;

ld.local.u32 	%r154, [%r153+0];

// Part of loop body line 269, head labeled $Lt_0_56322

mov.s32 	%r155, %r151;

$Lt_0_57090:

// Loop body line 280, nesting depth: 2, estimated iterations: unknown

mul.lo.u32 	%r156, %r149, 4;

mov.u32 	%r157, 0;

$Lt_0_57858:

// Loop body line 280, nesting depth: 3, estimated iterations: 6

.loc	15	287	0

mul.lo.u32 	%r158, %r157, 4;

add.u32 	%r159, %r144, %r158;

ld.local.u32 	%r160, [%r159+0];

add.u32 	%r161, %r160, %r156;

ld.global.u32 	%r162, [%r161+0];

setp.eq.u32 	%p24, %r162, %r154;

@%p24 bra 	$Lt_0_514;

// Part of loop body line 280, head labeled $Lt_0_57858

.loc	15	282	0

add.u32 	%r157, %r157, 1;

mov.u32 	%r163, 6;

setp.ne.u32 	%p25, %r157, %r163;

@%p25 bra 	$Lt_0_57858;

// Part of loop body line 280, head labeled $Lt_0_57090

.loc	15	280	0

add.u32 	%r149, %r149, 1;

setp.gt.u32 	%p26, %r122, %r149;

@%p26 bra 	$Lt_0_57090;

$Lt_0_56578:

// Part of loop body line 435, head labeled $L_0_41474

mov.s32 	%r164, 1;

bra.uni 	$Lt_0_258;

$Lt_0_514:

// Part of loop body line 269, head labeled $Lt_0_56322

.loc	15	278	0

add.u32 	%r148, %r148, 1;

mov.u32 	%r165, 6;

setp.ne.u32 	%p27, %r148, %r165;

@%p27 bra 	$Lt_0_56322;

// Part of loop body line 435, head labeled $L_0_41474

mov.s32 	%r164, 0;

$Lt_0_258:

// Part of loop body line 435, head labeled $L_0_41474

.loc	15	443	0

mov.u32 	%r166, 1;

setp.ne.s32 	%p28, %r164, %r166;

@%p28 bra 	$Lt_0_59138;

bra.uni 	$LBB55__Z17CheckCombinationsPiS_S_S_S_S_j;

$Lt_0_59138:

$Lt_0_55298:

// Part of loop body line 435, head labeled $L_0_41474

.loc	15	447	0

add.u32 	%r108, %r108, 8;

.loc	15	448	0

add.u32 	%r109, %r109, 49;

.loc	15	435	0

mov.u32 	%r167, 72;

setp.gt.u32 	%p29, %r108, %r167;

@%p29 bra 	$L_0_42242;

// Part of loop body line 435, head labeled $L_0_41474

mov.u32 	%r168, 440;

setp.le.u32 	%p30, %r109, %r168;

@%p30 bra 	$L_0_41474;

$L_0_42242:

.loc	15	456	0

mul.lo.u32 	%r169, %r5, 4;

ld.param.u32 	%r170, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A1];

add.u32 	%r171, %r170, %r169;

st.global.s32 	[%r171+0], %r97;

.loc	15	457	0

ld.param.u32 	%r172, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A2];

add.u32 	%r173, %r172, %r169;

st.global.s32 	[%r173+0], %r99;

.loc	15	458	0

ld.param.u32 	%r174, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A3];

add.u32 	%r175, %r174, %r169;

st.global.s32 	[%r175+0], %r101;

.loc	15	459	0

ld.param.u32 	%r176, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A4];

add.u32 	%r177, %r176, %r169;

st.global.s32 	[%r177+0], %r103;

.loc	15	460	0

ld.param.u32 	%r178, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A5];

add.u32 	%r179, %r178, %r169;

st.global.s32 	[%r179+0], %r105;

.loc	15	461	0

ld.param.u32 	%r180, [__cudaparm__Z17CheckCombinationsPiS_S_S_S_S_j_A6];

add.u32 	%r181, %r180, %r169;

st.global.s32 	[%r181+0], %r107;

$LBB55__Z17CheckCombinationsPiS_S_S_S_S_j:

.loc	15	468	0

exit;

$LDWend__Z17CheckCombinationsPiS_S_S_S_S_j:

} // _Z17CheckCombinationsPiS_S_S_S_S_j